Coverage Report

Created: 2026-04-14 11:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_model.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_cnnp_model.h"
6
#include "_ccv_nnc_graph.h"
7
#ifdef HAVE_CUDA
8
#include "gpu/ccv_nnc_compat.h"
9
#endif
10
11
// MARK - Level-5 API
12
13
ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size)
14
557
{
15
557
  if (!model->io)
16
548
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
17
557
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size);
18
557
  model_io->param_ref = 0;
19
557
  model_io->param_sel = 0;
20
557
  model_io->visit = 0;
21
557
  model_io->model = model;
22
557
  model_io->dependencies = 0;
23
557
  model_io->dependents = 0;
24
557
  model_io->outgoings = 0;
25
557
  model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1);
26
557
  ccv_array_push(model->io, &model_io);
27
557
  if (input_size > 0)
28
554
  {
29
554
    model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), input_size, 0);
30
554
    ccv_array_resize(model_io->incomings, input_size);
31
554
    int i;
32
554
    memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size);
33
1.25k
    for (i = 0; i < input_size; 
i++700
)
34
700
    {
35
700
      if (!inputs[i]->outgoings)
36
608
        inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
37
700
      ccv_array_push(inputs[i]->outgoings, &model_io);
38
700
    }
39
554
  } else {
40
3
    model_io->incomings = 0;
41
3
  }
42
557
  return model_io;
43
557
}
44
45
void ccv_cnnp_model_add_dependencies(ccv_cnnp_model_io_t model_io, const ccv_cnnp_model_io_t* const dependencies, const int dependency_size)
46
2
{
47
2
  assert(dependency_size > 0);
48
2
  if (!model_io->dependencies)
49
2
    model_io->dependencies = ccv_array_new(sizeof(ccv_cnnp_model_io_t), dependency_size, 0);
50
2
  int i, j;
51
5
  for (i = 0; i < dependency_size; 
i++3
)
52
3
  {
53
3
    int flag = 0;
54
    // Check if it is already exist or not.
55
4
    for (j = 0; !flag && j < model_io->dependencies->rnum; 
j++1
)
56
1
      if (*(ccv_cnnp_model_io_t*)ccv_array_get(model_io->dependencies, j) == dependencies[i])
57
0
        flag = 1;
58
3
    if (flag)
59
0
      continue;
60
3
    ccv_array_push(model_io->dependencies, dependencies + i);
61
3
    ++dependencies[i]->dependents;
62
3
  }
63
2
}
64
65
int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model)
66
0
{
67
0
  return model->output_size;
68
0
}
69
70
int ccv_cnnp_model_is_trainable(const ccv_cnnp_model_t* const model)
71
16
{
72
  // If the model is compiled, it is default to 1 unless it is not.
73
16
  if (model->compiled_data)
74
4
    return model->is_trainable >= 0 ? model->is_trainable : 
10
;
75
12
  return model->is_trainable;
76
16
}
77
78
ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index)
79
393
{
80
393
  if (!model->io)
81
38
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
82
393
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s));
83
393
  model_io->param_ref = index >= 0 ? 
index + 140
:
ALL_PARAMETERS353
;
84
393
  model_io->param_sel = selector >= 0 ? 
selector + 1308
:
ALL_PARAMETERS85
;
85
393
  model_io->visit = 0;
86
393
  model_io->model = model;
87
393
  model_io->outputs = 0;
88
393
  model_io->dependencies = 0;
89
393
  model_io->dependents = 0;
90
393
  model_io->incomings = 0;
91
393
  model_io->outgoings = 0;
92
393
  ccv_array_push(model->io, &model_io);
93
393
  return model_io;
94
393
}
95
96
void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context)
97
3
{
98
3
  model->notify_hook.func = func;
99
3
  model->notify_hook.context = context;
100
3
}
101
102
void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload)
103
14
{
104
14
  if (model->notify_hook.func)
105
3
    model->notify_hook.func(model, tag, payload, model->notify_hook.context);
106
14
  if (model->isa->notify)
107
1
    model->isa->notify(model, tag, payload);
108
14
}
109
110
static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size)
111
2.24k
{
112
2.24k
  int i, j;
113
4.86k
  for (i = 0; i < graph_exec_symbol_size; 
i++2.61k
)
114
2.61k
  {
115
2.61k
    ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i;
116
    // Check whether this tensor symbol has any duplicate.
117
23.2k
    for (j = i + 1; j < graph_exec_symbol_size;)
118
20.6k
    {
119
20.6k
      ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j;
120
      // If there is a same tensor symbol, remove it.
121
20.6k
      if (other_symbol->d == graph_exec_symbol->d && 
other_symbol->graph == graph_exec_symbol->graph2.71k
)
122
2.71k
      {
123
2.71k
        if (j + 1 < graph_exec_symbol_size)
124
439
          *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1];
125
2.71k
        --graph_exec_symbol_size;
126
2.71k
        continue;
127
2.71k
      }
128
17.9k
      ++j;
129
17.9k
    }
130
2.61k
  }
131
2.24k
  return graph_exec_symbol_size;
132
2.24k
}
133
134
void ccv_cnnp_model_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol, const int is_trainable)
135
3.16k
{
136
3.16k
  ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context;
137
3.16k
  ccv_cnnp_model_t* const model = add_to_array_context->sequence->model;
138
3.16k
  int i;
139
3.16k
  if (add_to_array_context->add_parameter_indices && 
!model->parameter_indices2.97k
)
140
2.52k
    model->parameter_indices = ccv_array_new(sizeof(int), 0, 0);
141
37.1k
  for (i = 0; i < add_to_array_context->symbols->rnum; 
i++33.9k
)
142
33.9k
  {
143
33.9k
    const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i);
144
33.9k
    if (other_symbol.d == symbol.d && 
other_symbol.graph == symbol.graph28
)
145
28
    {
146
      // Only add to parameter_indices if it is trainable.
147
28
      if (add_to_array_context->add_parameter_indices)
148
15
        ccv_array_add_unique_int(model->parameter_indices, i);
149
      // Found it, return, don't add it.
150
28
      return;
151
28
    }
152
33.9k
  }
153
  // Only add to parameter_indices if it is trainable.
154
3.13k
  if (add_to_array_context->add_parameter_indices)
155
2.95k
    ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum);
156
  // This is a new one, no need to add_unique_int, it is unique.
157
3.13k
  ccv_array_push(add_to_array_context->symbols, &symbol);
158
3.13k
  if (add_to_array_context->trainables)
159
2.96k
    ccv_array_push(add_to_array_context->trainables, &is_trainable);
160
3.13k
  char id[2048];
161
3.13k
  id[0] = add_to_array_context->prefix;
162
3.13k
  id[1] = '-';
163
3.13k
  int total_len = 2;
164
6.50k
  for (i = 0; i < add_to_array_context->sequence->sequences->rnum; 
i++3.36k
)
165
3.36k
  {
166
3.36k
    const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i);
167
3.36k
    int len;
168
3.36k
    if (name->name && 
name->name[0] != '\0'364
)
169
364
      len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence);
170
3.00k
    else
171
3.00k
      len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence);
172
3.36k
    total_len += len;
173
3.36k
    if (total_len >= 2047)
174
0
      break;
175
3.36k
  }
176
3.13k
  if (total_len < 2047)
177
3.13k
    total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it);
178
3.13k
  assert(total_len < 2048);
179
3.13k
  char *heap_id = (char*)ccmalloc(total_len + 1);
180
3.13k
  memcpy(heap_id, id, total_len + 1);
181
3.13k
  ccv_array_push(add_to_array_context->ids, &heap_id);
182
3.13k
  ++add_to_array_context->sequence->it;
183
3.13k
}
184
185
static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size, ccv_array_t* const gradient_checkpoints)
186
2.30k
{
187
2.30k
  compiled_data->f = compiled_data->fits + output_size;
188
2.30k
  compiled_data->xpu_alloc.mp_hdr = -1;
189
2.30k
  compiled_data->xpu_alloc.freed = kh_init(dy_str);
190
2.30k
  compiled_data->xpu_alloc.allocd = kh_init(dy_alloc);
191
2.30k
  compiled_data->gradient_checkpoints = gradient_checkpoints;
192
2.30k
}
193
194
typedef struct {
195
  void* old_graph_exec_symbol_new_hook_context;
196
  ccv_nnc_graph_exec_symbol_new_hook_f old_graph_exec_symbol_new_hook;
197
  ccv_nnc_symbolic_graph_t* graph;
198
  ccv_cnnp_model_build_data_t* build_data;
199
} ccv_cnnp_model_set_exec_flags_context_t;
200
201
static void _ccv_cnnp_model_set_exec_flags(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
202
2.93k
{
203
2.93k
  ccv_cnnp_model_set_exec_flags_context_t* flags_context = (ccv_cnnp_model_set_exec_flags_context_t*)context;
204
2.93k
  if (flags_context->build_data->exec_flags)
205
0
    ccv_nnc_graph_exec_symbol_set_flags(flags_context->graph, symbol, flags_context->build_data->exec_flags);
206
2.93k
  if (flags_context->old_graph_exec_symbol_new_hook)
207
2.20k
    flags_context->old_graph_exec_symbol_new_hook(flags_context->old_graph_exec_symbol_new_hook_context, symbol, cmd, inputs, input_size, outputs, output_size, name);
208
2.93k
}
209
210
static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss)
211
2.30k
{
212
2.30k
  assert(model->graph);
213
2.30k
  model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size);
214
2.30k
  int i;
215
4.68k
  for (i = 0; i < input_size; 
i++2.38k
)
216
2.38k
    model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0);
217
2.30k
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
218
2.30k
  ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0);
219
2.30k
  ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0);
220
2.30k
  ccv_cnnp_model_sequence_t model_sequence = {
221
2.30k
    .bank = kh_init(ccv_cnnp_model_name_bank)
222
2.30k
  };
223
2.30k
  ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = {
224
2.30k
    .add_parameter_indices = 1,
225
2.30k
    .prefix = 't',
226
2.30k
    .sequence = &model_sequence,
227
2.30k
    .symbols = parameters,
228
2.30k
    .ids = parameter_ids,
229
2.30k
    .trainables = parameter_trainables,
230
2.30k
  };
231
2.30k
  ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
232
2.30k
  ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0);
233
2.30k
  ccv_cnnp_model_add_to_array_context_t add_to_output_context = {
234
2.30k
    .add_parameter_indices = 0,
235
2.30k
    .prefix = 'r',
236
2.30k
    .sequence = &model_sequence,
237
2.30k
    .symbols = internals,
238
2.30k
    .ids = internal_ids,
239
2.30k
    .trainables = 0,
240
2.30k
  };
241
2.30k
  ccv_cnnp_model_build_data_t build_data = {
242
2.30k
    .exec_flags = 0,
243
2.30k
    .is_trainable = model->is_trainable >= 0 ? 
model->is_trainable2.30k
:
14
,
244
2.30k
    .model_sequence = &model_sequence,
245
2.30k
    .add_to_array = ccv_cnnp_model_add_to_array,
246
2.30k
    .parameters = parameters,
247
2.30k
    .context = {
248
2.30k
      .add_to_parameter = &add_to_parameter_context,
249
2.30k
      .add_to_output = &add_to_output_context,
250
2.30k
    },
251
2.30k
    .gradient_checkpoints = 0,
252
2.30k
  };
253
2.30k
  model->data = &build_data;
254
2.30k
  ccv_cnnp_model_set_exec_flags_context_t flags_context = {
255
2.30k
    .graph = model->graph,
256
2.30k
    .build_data = &build_data,
257
2.30k
    .old_graph_exec_symbol_new_hook = 0,
258
2.30k
    .old_graph_exec_symbol_new_hook_context = 0
259
2.30k
  };
260
2.30k
  flags_context.old_graph_exec_symbol_new_hook_context = ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_set_exec_flags, &flags_context, &flags_context.old_graph_exec_symbol_new_hook);
261
2.30k
  ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0);
262
  // Reset back to previous hook.
263
2.30k
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, flags_context.old_graph_exec_symbol_new_hook, flags_context.old_graph_exec_symbol_new_hook_context, 0);
264
4.63k
  for (i = 0; i < model->output_size; 
i++2.32k
)
265
2.32k
  {
266
2.32k
    const ccv_nnc_tensor_symbol_t output = model->outputs[i];
267
2.32k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, output);
268
2.32k
    if (alias_to.d == CCV_NNC_NO_TENSOR_SYMBOL)
269
1.31k
      continue;
270
    // If output is an alias, insert data transform regardless for result correctness (we cannot bind an alias). You can check ccv_nnc_tensor_bind_symbol method
271
    // to see that we can correctly bind a tensor which from it, has aliases, but we cannot bind an alias tensor correctly (this is expected, sort of, to be
272
    // honest, because we cannot handle cases of alias is part of the original tensor but bind differently).
273
1.00k
    const ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(model->graph, output);
274
1.00k
    model->outputs[i] = ccv_nnc_tensor_symbol_new(model->graph, output_params, 0);
275
1.00k
    ccv_nnc_graph_exec_symbol_t make_contiguous = ccv_nnc_graph_exec_symbol_new(model->graph, CMD_FORMAT_TRANSFORM_FORWARD(), &output, 1, model->outputs + i, 1, "contiguous");
276
1.00k
    ccv_nnc_graph_exec_symbol_set_flags(model->graph, make_contiguous, CCV_NNC_GRAPH_EXEC_DISABLE_OPT);
277
1.00k
  }
278
2.30k
  model->data = 0;
279
2.30k
  kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank);
280
2.30k
  if (model_sequence.sequences)
281
2.29k
    ccv_array_free(model_sequence.sequences);
282
  // Check if there are parameters that are not trainables. If there are, we will allocate uint64 bitmap to record that.
283
2.30k
  int not_trainables = 0;
284
  // Assert no parameter is alias.
285
5.26k
  for (i = 0; i < parameters->rnum; 
i++2.95k
)
286
2.95k
  {
287
2.95k
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i);
288
2.95k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter);
289
2.95k
    assert(alias_to.graph == 0); // Cannot find the one alias to.
290
2.95k
    if (*(int*)ccv_array_get(parameter_trainables, i) == 0)
291
14
      not_trainables = 1;
292
2.95k
  }
293
2.30k
  assert(parameters->rnum == parameter_trainables->rnum);
294
2.30k
  uint64_t* parameter_flags = 0;
295
2.30k
  if (not_trainables)
296
10
  {
297
10
    parameter_flags = (uint64_t*)cccalloc(((parameters->rnum + 63) >> 6), sizeof(uint64_t));
298
44
    for (i = 0; i < parameter_trainables->rnum; 
i++34
)
299
34
      if (*(int*)ccv_array_get(parameter_trainables, i))
300
20
        parameter_flags[i >> 6] |= ((uint64_t)1 << (i & 63));
301
10
  }
302
2.30k
  ccv_array_free(parameter_trainables);
303
  // Assert no internal is alias.
304
2.47k
  for (i = 0; i < internals->rnum; 
i++165
)
305
165
  {
306
165
    const ccv_nnc_tensor_symbol_t internal = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i);
307
165
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(internal.graph, internal);
308
165
    assert(alias_to.graph == 0); // Cannot find the one alias to.
309
165
  }
310
2.30k
  const int output_size = model->output_size;
311
2.30k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
312
2.30k
  const int parameters_rnum = parameters->rnum;
313
2.30k
  if (input_size > 0)
314
2.30k
  {
315
2.30k
    ccv_array_resize(parameters, parameters_rnum + input_size);
316
2.30k
    memcpy(ccv_array_get(parameters, parameters_rnum), model->inputs, input_size * sizeof(ccv_nnc_tensor_symbol_t));
317
2.30k
  }
318
2.30k
  ccv_nnc_symbolic_graph_simplify(model->graph,
319
2.30k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,
320
2.30k
      CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,
321
2.30k
      CCV_NNC_SIMPLIFY_OPS_FUSION,
322
2.30k
      CCV_NNC_SIMPLIFY_GRAPH_PRUNING),
323
2.30k
    ccv_array_get(parameters, 0), parameters_rnum + input_size,
324
2.30k
    model->outputs, output_size,
325
2.30k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
326
2.30k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
327
  // Size it down.
328
2.30k
  parameters->rnum = parameters_rnum;
329
2.30k
  ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1));
330
2.30k
  _ccv_cnnp_compiled_data_init(compiled_data, output_size, build_data.gradient_checkpoints);
331
2.30k
  const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph);
332
2.30k
  assert(evaluate_to_size > 0);
333
2.30k
  compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
334
2.30k
  memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
335
2.30k
  compiled_data->loss = loss;
336
2.30k
  if (loss.cmd == CCV_NNC_NOOP)
337
2.29k
  {
338
    // If no loss function provided, there is no fits.
339
4.61k
    for (i = 0; i < output_size; 
i++2.31k
)
340
2.31k
    {
341
2.31k
      compiled_data->fits[i] = NO_TENSOR_SYMBOL;
342
2.31k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]);
343
2.31k
      if (alias_to.d < 0)
344
2.31k
        compiled_data->f[i] = model->outputs[i];
345
0
      else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original.
346
0
        int ofs[CCV_NNC_MAX_DIM_ALLOC];
347
0
        int inc[CCV_NNC_MAX_DIM_ALLOC];
348
0
        ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc);
349
0
        int j;
350
0
        for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
351
0
          { assert(ofs[j] == 0); } // There is no ofs.
352
0
        compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet.
353
0
      }
354
2.31k
    }
355
2.29k
  } else {
356
20
    for (i = 0; i < output_size; 
i++10
)
357
10
    {
358
10
      const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]);
359
10
      const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0);
360
10
      compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0);
361
10
      ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0);
362
10
    }
363
10
  }
364
2.30k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
365
2.30k
  ccv_nnc_symbolic_graph_simplify(model->graph,
366
2.30k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function.
367
2.30k
    0, 0, // No need to provide binds at this point.
368
2.30k
    compiled_data->f, model->output_size,
369
2.30k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
370
2.30k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
371
  // If inputs are from GPU, stream type is GPU.
372
2.30k
  compiled_data->parameters = parameters;
373
2.30k
  compiled_data->parameter_flags = parameter_flags;
374
2.30k
  compiled_data->internals = internals;
375
2.30k
  compiled_data->ids.parameters = parameter_ids;
376
2.30k
  compiled_data->ids.internals = internal_ids;
377
2.30k
  ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(compiled_data, model->graph);
378
2.30k
}
379
380
static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
381
8.82k
{
382
8.82k
  ccv_array_t* const stack = (ccv_array_t*)context;
383
8.82k
  ccv_array_push(stack, &symbol.d);
384
8.82k
}
385
386
static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
387
38.5k
{
388
38.5k
  const ccv_nnc_tensor_symbol_t src_symbol = {
389
38.5k
    .d = src_index,
390
38.5k
    .graph = src_graph
391
38.5k
  };
392
38.5k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
393
38.5k
    .d = dest_index,
394
38.5k
    .graph = dest_graph
395
38.5k
  };
396
38.5k
  const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
397
38.5k
  ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params);
398
38.5k
  int ofs[CCV_NNC_MAX_DIM_ALLOC];
399
38.5k
  int inc[CCV_NNC_MAX_DIM_ALLOC];
400
38.5k
  if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc))
401
2.00k
    ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc);
402
38.5k
}
403
404
static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
405
2.41k
{
406
2.41k
  const ccv_nnc_tensor_symbol_t src_symbol = {
407
2.41k
    .d = src_index,
408
2.41k
    .graph = src_graph
409
2.41k
  };
410
2.41k
  const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
411
2.41k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
412
2.41k
    .d = dest_index,
413
2.41k
    .graph = dest_graph
414
2.41k
  };
415
2.41k
  const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol);
416
2.41k
  return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0;
417
2.41k
}
418
419
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size);
420
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data);
421
422
typedef struct {
423
  int parallel_count;
424
  ccv_nnc_symbolic_graph_t* graph;
425
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
426
} ccv_nnc_graph_exec_update_t;
427
428
static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint)
429
58
{
430
58
  ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context;
431
58
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena;
432
58
  ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
433
58
  ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd);
434
58
  ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint);
435
58
  const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph;
436
58
  const int parallel_count = graph_exec_update->parallel_count;
437
58
  int i;
438
178
  for (i = 1; i < parallel_count; 
i++120
)
439
120
  {
440
120
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i));
441
120
    if (!CCV_NO_GRAPH_EXEC(copy))
442
120
    {
443
120
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
444
120
      ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint);
445
120
    }
446
120
  }
447
58
}
448
449
void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size)
450
2.20k
{
451
2.20k
  assert(model->graph);
452
2.20k
  assert(model->compiled_data);
453
2.20k
  assert(!init->graph);
454
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
455
2.20k
  init->graph = ccv_nnc_symbolic_graph_new();
456
2.20k
  ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0);
457
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack, 0);
458
2.20k
  _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss);
459
2.20k
  init->parallel_count = model->parallel_count;
460
2.20k
  init->memory_compression = model->memory_compression;
461
2.20k
  init->memory_reduction = model->memory_reduction;
462
2.20k
  init->gradient_checkpointing = model->gradient_checkpointing;
463
2.20k
  init->compiled_data->stream_type = model->compiled_data->stream_type;
464
2.20k
  init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer;
465
2.20k
  init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size;
466
2.20k
  if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
467
2.20k
    _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0);
468
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0, 0);
469
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL);
470
2.20k
  int i, j;
471
  // Verify parameters, internals and saved_aux in both graph has the same dimensionality.
472
4.61k
  for (i = 0; i < compiled_data->parameters->rnum; 
i++2.41k
)
473
2.41k
  {
474
2.41k
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
475
2.41k
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
476
2.41k
  }
477
2.20k
  for (i = 0; i < compiled_data->internals->rnum; 
i++0
)
478
0
  {
479
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
480
0
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
481
0
  }
482
  // Update inputs.
483
2.20k
  assert(model->input_size == init->input_size);
484
4.40k
  
for (i = 0; 2.20k
i < model->input_size;
i++2.20k
)
485
2.20k
    if (model->inputs[i].d >= 0)
486
2.20k
    {
487
2.20k
      assert(init->inputs[i].d >= 0);
488
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d);
489
2.20k
    }
490
  // Update outputs.
491
2.20k
  assert(model->output_size == init->output_size);
492
4.40k
  
for (i = 0; 2.20k
i < model->output_size;
i++2.20k
)
493
2.20k
  {
494
2.20k
    if (model->outputs[i].d >= 0)
495
2.20k
    {
496
2.20k
      assert(init->outputs[i].d >= 0);
497
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d);
498
2.20k
    }
499
2.20k
    if (model->outputs[i].d != model->compiled_data->f[i].d)
500
0
    {
501
0
      assert(init->outputs[i].d != init->compiled_data->f[i].d);
502
0
      if (model->compiled_data->f[i].d >= 0)
503
0
      {
504
0
        assert(init->compiled_data->f[i].d >= 0);
505
0
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d);
506
0
      }
507
0
    }
508
2.20k
  }
509
  // Go through the graph to set tensor on matching symbols
510
11.0k
  
for (i = 0; 2.20k
i < stack->rnum;
i++8.82k
)
511
8.82k
  {
512
8.82k
    const int d = *(int*)ccv_array_get(stack, i);
513
    // If exceed range, skip.
514
8.82k
    if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) ||
515
8.82k
      d >= ccv_nnc_graph_exec_symbol_count(model->graph))
516
0
      continue;
517
8.82k
    const ccv_nnc_graph_exec_symbol_t src_symbol = {
518
8.82k
      .d = d,
519
8.82k
      .graph = init->graph
520
8.82k
    };
521
8.82k
    const ccv_nnc_graph_exec_symbol_t dest_symbol = {
522
8.82k
      .d = d,
523
8.82k
      .graph = model->graph
524
8.82k
    };
525
8.82k
    const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol);
526
8.82k
    const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol);
527
    // If the name doesn't match, skip.
528
8.82k
    if (dest_cmd.cmd != src_cmd.cmd && 
src_cmd.cmd != CCV_NNC_NOOP0
)
529
0
      continue;
530
    // Now get all the inputs and outputs, if matches, set them.
531
8.82k
    const int* src_inputs;
532
8.82k
    int src_input_size;
533
8.82k
    const int* src_outputs;
534
8.82k
    int src_output_size;
535
8.82k
    ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size);
536
8.82k
    const int* dest_inputs;
537
8.82k
    int dest_input_size;
538
8.82k
    const int* dest_outputs;
539
8.82k
    int dest_output_size;
540
8.82k
    ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size);
541
    // We may have unmatched input / output size because this is the minimizer and it has
542
    // different saved_aux (for example, when we shrunk with CMD_NOOP).
543
8.82k
    if (src_input_size != dest_input_size)
544
0
      continue;
545
8.82k
    if (src_output_size != dest_output_size)
546
0
      continue;
547
8.82k
    ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd);
548
    // There may be mismatches of the source tensor symbols and destination tensor symbols. The reason is because
549
    // we may later passed-in the minimizer, therefore, we may allocate tensors for minimizer later in the original
550
    // graph whereas in the newly created graph, it is streamlined (the minimizer exists from the beginning). That
551
    // will make the order of tensor symbols creation different, therefore, exact which tensor is which wrong as
552
    // well. However, set a new minimizer won't change the exec symbol ordering, because we never create new exec
553
    // symbols after gradient init step. Changing a new minimizer just updated that exec symbols setting, it is not
554
    // a new exec symbol.
555
33.7k
    for (j = 0; j < src_input_size; 
j++24.8k
)
556
24.8k
      if (src_inputs[j] >= 0)
557
20.4k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]);
558
22.4k
    for (j = 0; j < src_output_size; 
j++13.6k
)
559
13.6k
      if (src_outputs[j] >= 0)
560
13.6k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]);
561
8.82k
  }
562
2.20k
  ccv_array_free(stack);
563
  // After this, we get all tensors in the model graph resolved through tensor_auto.
564
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL);
565
  // Verify symbols we get matches.
566
2.20k
  const int parameter_size = compiled_data->parameters->rnum;
567
4.61k
  for (i = 0; i < parameter_size; 
i++2.41k
)
568
2.41k
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); }
569
2.20k
  const int internal_size = compiled_data->internals->rnum;
570
2.20k
  for (i = 0; i < internal_size; 
i++0
)
571
0
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); }
572
  // Go through compiled data.
573
2.20k
  if (compiled_data->tensor_arena)
574
2.20k
  {
575
2.20k
    const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph);
576
2.20k
    if (flag == 0 && compiled_data->graph_exec_arena)
577
2.20k
    {
578
2.20k
      ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph);
579
      // Since we will reinit, if we previously set is_test, we need to set it again.
580
2.20k
      if (compiled_data->is_test)
581
1
      {
582
1
        const int parallel_count = ccv_max(model->parallel_count, 1);
583
1
        ccv_nnc_graph_exec_update_t update = {
584
1
          .parallel_count = parallel_count,
585
1
          .graph = model->graph,
586
1
          .graph_exec_arena = compiled_data->graph_exec_arena,
587
1
        };
588
1
        ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
589
1
      }
590
2.20k
    } else
591
      // Free-up tensor arena & graph exec arena.
592
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
593
2.20k
  }
594
  // There are other compiled graphs, for accum and apply gradients.
595
  // However, the main conclusion is, these absorb operations shouldn't impact parameters.
596
  // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we
597
  // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot
598
  // be changed otherwise parameters' shape will be meaningless. The same goes to internals.
599
  // That is why we don't update these compiled graphs at all this point.
600
  // Free the model, we've already "absorbed" it.
601
2.20k
  ccv_cnnp_model_free(init);
602
2.20k
}
603
604
void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss)
605
2.30k
{
606
2.30k
  assert(input_size == model->input_size || model->input_size == 0);
607
2.30k
  if (model->input_size == 0)
608
10
    model->input_size = input_size;
609
2.30k
  if (!model->graph) // The graph is not compiled yet.
610
102
  {
611
102
    model->graph = ccv_nnc_symbolic_graph_new();
612
102
    _ccv_cnnp_model_compile(model, inputs, input_size, loss);
613
102
    assert(model->compiled_data);
614
102
    int i, flag = 0;
615
256
    for (i = 0; !flag && 
i < input_size236
;
i++154
)
616
154
      flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY);
617
    // If inputs are from GPU, stream type is GPU.
618
102
    model->compiled_data->stream_type = flag ? 
CCV_STREAM_CONTEXT_GPU20
:
CCV_STREAM_CONTEXT_CPU82
;
619
102
    model->compiled_data->minimize.minimizer = minimizer;
620
102
    model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
621
2.20k
  } else {
622
    // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model.
623
    // And then absorb the "new model" to the old one.
624
2.20k
    ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model, model->is_trainable);
625
2.20k
    ccv_cnnp_model_absorb(model, init, inputs, input_size);
626
    // Reset minimizer.
627
2.20k
    ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0);
628
2.20k
  }
629
2.30k
}
630
631
ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model, const int is_trainable)
632
2.20k
{
633
2.20k
  ccv_cnnp_model_t* const new_model = _ccv_cnnp_model_copy(model, 0);
634
2.20k
  new_model->is_trainable = is_trainable;
635
2.20k
  return new_model;
636
2.20k
}
637
638
void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size)
639
4.45k
{
640
4.45k
  assert(model->graph);
641
4.45k
  assert(output_size == model->output_size);
642
4.45k
  ccv_nnc_symbolic_graph_t* const graph = model->graph;
643
4.45k
  ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL);
644
4.45k
  int i;
645
8.90k
  for (i = 0; i < output_size; 
i++4.45k
)
646
4.45k
  {
647
4.45k
    assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL);
648
4.45k
    outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]);
649
4.45k
  }
650
4.45k
}
651
652
void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size)
653
3
{
654
3
  if (workspace_size == model->workspace_size)
655
0
    return;
656
3
  model->workspace_size = workspace_size;
657
3
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
658
3
  if (compiled_data && compiled_data->graph)
659
0
    ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL);
660
3
}
661
662
size_t ccv_cnnp_model_workspace_size(ccv_cnnp_model_t* const model)
663
0
{
664
0
  return model->workspace_size;
665
0
}
666
667
void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel)
668
15
{
669
15
  if (parallel == 0)
670
0
    model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
671
15
  else
672
15
    model->parallel_count = parallel;
673
15
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
674
15
  if (compiled_data)
675
11
    { assert(!compiled_data->graph); }
676
15
}
677
678
void ccv_cnnp_model_set_max_concurrency(ccv_cnnp_model_t* const model, const int max_stream_count)
679
0
{
680
0
  model->max_stream_count = max_stream_count;
681
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
682
0
  if (compiled_data)
683
0
    { assert(!compiled_data->graph); }
684
0
}
685
686
void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression)
687
0
{
688
0
  model->memory_compression = memory_compression;
689
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
690
0
  if (compiled_data)
691
0
    { assert(!compiled_data->graph); }
692
0
}
693
694
void ccv_cnnp_model_set_memory_reduction(ccv_cnnp_model_t* const model, const int memory_reduction)
695
0
{
696
0
  model->memory_reduction = memory_reduction;
697
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
698
0
  if (compiled_data)
699
0
    { assert(!compiled_data->graph); }
700
0
}
701
702
void ccv_cnnp_model_set_gradient_checkpointing(ccv_cnnp_model_t* const model, const int gradient_checkpointing)
703
2
{
704
2
  model->gradient_checkpointing = gradient_checkpointing;
705
2
}
706
707
int ccv_cnnp_model_gradient_checkpointing(ccv_cnnp_model_t* const model)
708
0
{
709
0
  return model->gradient_checkpointing;
710
0
}
711
712
typedef struct {
713
  int parallel_count;
714
  ccv_nnc_symbolic_graph_t* graph;
715
  ccv_cnnp_compiled_data_t* compiled_data;
716
  ccv_nnc_tensor_arena_t* tensor_arena;
717
} ccv_nnc_tensor_init_states_t;
718
719
static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data)
720
104
{
721
104
  int i;
722
104
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
723
184
  for (i = 0; i < compiled_data->parameters->rnum; 
i++80
)
724
119
  {
725
119
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
726
119
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
727
39
      return 1;
728
119
  }
729
65
  for (i = 0; i < compiled_data->internals->rnum; 
i++0
)
730
6
  {
731
6
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
732
6
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
733
6
      return 1;
734
6
  }
735
59
  return 0;
736
65
}
737
738
static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol)
739
341
{
740
341
  ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context;
741
341
  ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena;
742
341
  ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
743
341
  if (!output_tensor)
744
0
    return;
745
341
  const int d = output_symbol.d;
746
341
  assert(d < tensor_init_states->compiled_data->tensors_init.size);
747
341
  uint32_t* const init_v = CCV_NNC_INIT_V(tensor_init_states->compiled_data->tensors_init.v);
748
341
  if (init_v[d >> 5] & (1u << (d & 0x1f)))
749
34
    return;
750
307
  init_v[d >> 5] |= (1u << (d & 0x1f));
751
307
  ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 
116
:
0291
, &output_tensor, 1, 0);
752
307
  const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph;
753
307
  const int parallel_count = tensor_init_states->parallel_count;
754
307
  int i;
755
787
  for (i = 1; i < parallel_count; 
i++480
)
756
480
  {
757
480
    ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i));
758
480
    if (copy)
759
480
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, &copy, 1, 0);
760
480
  }
761
307
}
762
763
// This method can only handle cases we added new tensors and exec, never delete. This invariant is true because
764
// we setup everything (including calling simplify method) in ccv_cnnp_model_compile method, before this rewind setup.
765
static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model)
766
2
{
767
2
  assert(model->graph);
768
2
  assert(model->compiled_data);
769
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
770
2
  assert(compiled_data->rewindables);
771
2
  int i;
772
51
  for (i = 0; i < compiled_data->rewindables->rnum; 
i++49
)
773
49
  {
774
49
    const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i);
775
49
    if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC)
776
16
      ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec);
777
33
    else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR)
778
33
      ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor);
779
49
  }
780
2
  ccv_array_clear(compiled_data->rewindables);
781
2
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
782
2
}
783
784
static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name)
785
6.14k
{
786
6.14k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
787
6.14k
    .type = CCV_CNNP_REWIND_TENSOR,
788
6.14k
    .tensor = symbol
789
6.14k
  };
790
6.14k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
791
6.14k
  ccv_array_push(rewind_symbols, &rewind_symbol);
792
6.14k
}
793
794
static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name)
795
476
{
796
476
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
797
476
    .type = CCV_CNNP_REWIND_TENSOR,
798
476
    .tensor = symbol
799
476
  };
800
476
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
801
476
  ccv_array_push(rewind_symbols, &rewind_symbol);
802
476
}
803
804
static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
805
2.34k
{
806
2.34k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
807
2.34k
    .type = CCV_CNNP_REWIND_GRAPH_EXEC,
808
2.34k
    .graph_exec = symbol
809
2.34k
  };
810
2.34k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
811
2.34k
  ccv_array_push(rewind_symbols, &rewind_symbol);
812
2.34k
}
813
814
static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph)
815
35.0k
{
816
35.0k
  ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
817
35.0k
  if (!CCV_NO_GRAPH_EXEC(update_exec))
818
19.9k
    ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd);
819
35.0k
  int i;
820
49.9k
  for (i = 1; i < parallel_count; 
i++14.8k
)
821
14.8k
  {
822
14.8k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
823
14.8k
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol);
824
14.8k
    if (!CCV_NO_GRAPH_EXEC(copy))
825
14.6k
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
826
14.8k
  }
827
35.0k
}
828
829
static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd)
830
20.0k
{
831
20.0k
  assert(compiled_data);
832
20.0k
  assert(symbolic_graph);
833
20.0k
  ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd);
834
20.0k
  int i;
835
35.0k
  for (i = 1; i < parallel_count; 
i++14.9k
)
836
14.9k
  {
837
14.9k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
838
14.9k
    if (copy_symbol.graph)
839
14.8k
      ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd);
840
14.9k
  }
841
20.0k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena;
842
20.0k
  if (graph_exec_arena)
843
20.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
844
  // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph)
845
20.0k
  ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena;
846
20.0k
  if (gradient_graph_exec_arena)
847
15.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
848
20.0k
}
849
850
static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice)
851
20.0k
{
852
20.0k
  int this_parameter_flag = 0;
853
20.0k
  if (update_nodes[parameter_indice].d == CCV_NNC_NO_TENSOR_SYMBOL)
854
0
    return this_parameter_flag;
855
20.0k
  const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]);
856
20.0k
  int j, k;
857
  // For no-op, we can preserve previous saved_aux_size.
858
20.0k
  if (old_minimizer.cmd != minimizer.cmd && 
minimizer.cmd != CCV_NNC_NOOP71
)
859
67
  {
860
    // If the old minimizer is a noop, then the old_saved_aux_size should be whatever its previous
861
    // saved_aux_size is, otherwise we will reinit the saved_aux repeatedly if you switch between
862
    // noop and a minimizer. We don't want that because we do that in high-level frameworks to
863
    // make sure some model parameters don't update if we don't want them to.
864
67
    int old_saved_aux_size;
865
67
    if (old_minimizer.cmd == CCV_NNC_NOOP)
866
67
    {
867
67
      int input_size;
868
67
      ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0);
869
67
      if (input_size < 2) // This is not legit.
870
0
        old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
871
67
      else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters.
872
67
        old_saved_aux_size = input_size - 2;
873
67
    } else
874
0
      old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
875
67
    if (old_saved_aux_size != saved_aux_size)
876
65
    {
877
65
      this_parameter_flag = 1;
878
65
      if (saved_aux_size > old_saved_aux_size)
879
65
      {
880
        // Allocate new tensor symbols.
881
65
        const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]);
882
189
        for (j = old_saved_aux_size; j < saved_aux_size; 
j++124
)
883
124
        {
884
124
          saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0);
885
124
          saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0);
886
124
          const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
887
460
          for (k = 1; k < parallel_count; 
k++336
)
888
336
          {
889
336
            ccv_nnc_tensor_param_t dev_info = info;
890
336
            if (k != device_id)
891
336
              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k);
892
0
            else
893
0
              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, 0);
894
336
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
895
336
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
896
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy);
897
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy);
898
336
          }
899
124
        }
900
65
      } else {
901
0
        for (j = saved_aux_size; j < old_saved_aux_size; j++)
902
0
        {
903
0
          for (k = 1; k < parallel_count; k++)
904
0
          {
905
0
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
906
0
            if (src_copy.d >= 0)
907
0
            {
908
0
              ccv_nnc_tensor_symbol_free(graph, src_copy);
909
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL);
910
0
            }
911
0
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
912
0
            if (dest_copy.d >= 0)
913
0
            {
914
0
              ccv_nnc_tensor_symbol_free(graph, dest_copy);
915
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL);
916
0
            }
917
0
          }
918
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source);
919
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination);
920
0
          saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
921
0
        }
922
0
      }
923
65
    }
924
67
  }
925
20.0k
  _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer);
926
20.0k
  if (this_parameter_flag)
927
65
  {
928
65
    ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2];
929
65
    ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1];
930
65
    const int* inputs = 0;
931
65
    int input_size = 0;
932
65
    ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0);
933
65
    assert(input_size >= 1);
934
65
    update_inputs[0].d = inputs[0];
935
65
    update_inputs[0].graph = graph;
936
65
    update_inputs[1].d = inputs[1];
937
65
    update_inputs[1].graph = graph;
938
65
    update_outputs[0] = updated_parameters[parameter_indice];
939
189
    for (j = 0; j < saved_aux_size; 
j++124
)
940
124
    {
941
124
      update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source;
942
124
      update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination;
943
124
    }
944
65
    ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
945
233
    for (k = 1; k < parallel_count; 
k++168
)
946
168
    {
947
168
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k);
948
168
      assert(copy.d >= 0);
949
168
      ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0);
950
168
      assert(input_size >= 1);
951
168
      update_inputs[0].d = inputs[0];
952
168
      update_inputs[0].graph = graph;
953
168
      update_inputs[1].d = inputs[1];
954
168
      update_inputs[1].graph = graph;
955
168
      update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k);
956
504
      for (j = 0; j < saved_aux_size; 
j++336
)
957
336
      {
958
336
        update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
959
336
        update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
960
336
      }
961
168
      ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
962
168
    }
963
65
  }
964
20.0k
  return this_parameter_flag;
965
20.0k
}
966
967
typedef struct {
968
  int parameter_size;
969
  ccv_nnc_cmd_t minimizer;
970
  ccv_cnnp_model_io_t parameters[1];
971
} ccv_cnnp_set_minimizer_for_parameter_t;
972
973
static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model)
974
296
{
975
296
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
976
296
  assert(compiled_data);
977
296
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
978
  // We update all parameters, at this point, we have one minimizer.
979
296
  const int parameter_size = compiled_data->parameters->rnum;
980
296
  ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes;
981
296
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
982
296
  assert(symbolic_graph);
983
296
  const int parallel_count = ccv_max(model->parallel_count, 1);
984
296
  ccv_array_t* const parameters = compiled_data->minimize.parameters;
985
296
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
986
296
  int i, j, flag = 0;
987
301
  for (i = 0; i < parameters->rnum; 
i++5
)
988
5
  {
989
5
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i);
990
10
    for (j = 0; j < set_minimizer_for_parameter->parameter_size; 
j++5
)
991
5
    {
992
5
      const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? 
set_minimizer_for_parameter->parameters[j]->param_sel - 13
:
set_minimizer_for_parameter->parameters[j]->param_sel2
;
993
5
      assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0);
994
5
      const int old_rnum = parameter_indices->rnum;
995
5
      ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices);
996
5
      const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? 
set_minimizer_for_parameter->parameters[j]->param_ref - 10
: set_minimizer_for_parameter->parameters[j]->param_ref;
997
5
      assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0);
998
5
      if (param_ref >= 0)
999
0
      {
1000
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
1001
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
1002
0
        parameter_indices->rnum = old_rnum + 1;
1003
0
      }
1004
5
    }
1005
5
    const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer);
1006
    // We may have duplicated indices, but that is OK, we will set it twice.
1007
58
    for (j = 0; j < parameter_indices->rnum; 
j++53
)
1008
53
    {
1009
53
      const int d = *(int*)ccv_array_get(parameter_indices, j);
1010
53
      assert(d <= parameter_size);
1011
53
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d))
1012
0
        flag = 1;
1013
53
    }
1014
5
    ccv_array_clear(parameter_indices);
1015
5
  }
1016
296
  ccv_array_free(parameter_indices);
1017
296
  return flag;
1018
296
}
1019
1020
static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size)
1021
2.25k
{
1022
2.25k
  if (new_saved_aux_size == old_saved_aux_size)
1023
2.24k
    return;
1024
2.25k
  assert
(new_saved_aux_size > old_saved_aux_size)7
;
1025
7
  int i, j;
1026
72
  for (i = parameter_size - 1; i >= 0; 
i--65
)
1027
65
  {
1028
189
    for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; 
j--124
)
1029
124
      saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
1030
65
    for (j = old_saved_aux_size - 1; j >= 0; 
j--0
)
1031
0
      saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j];
1032
65
  }
1033
7
}
1034
1035
static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model)
1036
49
{
1037
49
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1038
49
  assert(compiled_data);
1039
49
  if (!compiled_data->rewindables)
1040
49
    compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0);
1041
49
  ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables, 0);
1042
49
  ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables, 0);
1043
49
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables, 0);
1044
49
}
1045
1046
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size)
1047
2.24k
{
1048
2.24k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1049
2.24k
  assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1050
2.24k
  assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1051
2.24k
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1052
2.24k
  assert(evaluate_to_size > 0);
1053
2.24k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1054
2.24k
  compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1055
2.24k
  compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count);
1056
2.24k
  int i, j;
1057
2.24k
  const int output_size = model->output_size;
1058
2.24k
  assert(!fits || fit_size == output_size * parallel_count);
1059
2.24k
  if (fits)
1060
12
    
for (i = 0; 6
i < output_size;
i++6
)
1061
6
      ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info);
1062
2.24k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
1063
2.24k
  const int parameter_size = compiled_data->parameters->rnum;
1064
2.24k
  compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size);
1065
2.24k
  compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
1066
2.24k
  compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
1067
2.24k
  int parameter_size_maybe_more = parameter_size;
1068
2.24k
  compiled_data->disable_outgrad = disable_outgrad;
1069
2.24k
  int outgrad_size;
1070
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1071
9
    outgrad_size = 0;
1072
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1073
2.23k
    outgrad_size = model->input_size;
1074
3
  else {
1075
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
1076
3
    outgrad_size = 0;
1077
10
    for (i = 0; i < model->input_size; 
i++7
)
1078
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1079
3
        ++outgrad_size;
1080
3
  }
1081
2.24k
  compiled_data->outgrad_size = outgrad_size;
1082
2.24k
  parameter_size_maybe_more += outgrad_size;
1083
2.24k
  compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count);
1084
2.24k
  compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? 
compiled_data->gradients + parameter_size2.23k
:
09
;
1085
2.24k
  compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more);
1086
2.24k
  compiled_data->backward.to_size = parameter_size_maybe_more;
1087
2.24k
  ccv_nnc_tensor_symbol_t* parameters = (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0);
1088
2.24k
  if (compiled_data->parameter_flags)
1089
4
  {
1090
4
    parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size);
1091
25
    for (i = 0; i < parameter_size; 
i++21
)
1092
21
      if (compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))
1093
14
        parameters[i] = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1094
7
      else
1095
7
        parameters[i] = NO_TENSOR_SYMBOL;
1096
4
  }
1097
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1098
9
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1099
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1100
2.23k
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1101
3
  else { // Compute minimize with gradients including selected inputs.
1102
3
    assert(model->input_size > 0);
1103
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
1104
3
    assert(outgrad_size > 0);
1105
3
    ccv_nnc_tensor_symbol_t outgrads[outgrad_size];
1106
3
    j = 0;
1107
10
    for (i = 0; i < model->input_size; 
i++7
)
1108
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1109
3
        outgrads[j++] = model->inputs[i];
1110
3
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1111
3
  }
1112
2.24k
  if (compiled_data->parameter_flags)
1113
4
    ccfree(parameters);
1114
2.24k
  _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size);
1115
2.24k
  if (compiled_data->minimize.parameters)
1116
5
    _ccv_cnnp_apply_parameters_with_minimizer(model);
1117
  // Go through gradient checkpoints to generate tensor inputs for backward pass just before executing the backward pass.
1118
2.24k
  ccv_cnnp_model_apply_gradient_checkpoints(compiled_data, model->graph);
1119
4.49k
  for (i = 0; i < output_size; 
i++2.24k
)
1120
2.24k
  {
1121
2.24k
    const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1122
    // Init this to 1 so we can backprop.
1123
2.24k
    ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES);
1124
2.24k
  }
1125
2.24k
  compiled_data->backward.to_size = 0;
1126
7.16k
  for (i = 0; i < parameter_size_maybe_more; 
i++4.92k
)
1127
4.92k
    if (compiled_data->gradients[i].d != CCV_NNC_NO_TENSOR_SYMBOL)
1128
4.91k
      compiled_data->backward.tos[compiled_data->backward.to_size++] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]);
1129
2.24k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS);
1130
2.24k
  ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size);
1131
4.50k
  for (i = 0; i < parameter_size_maybe_more - parameter_size; 
i++2.25k
)
1132
2.25k
  {
1133
2.25k
    if (compiled_data->outgrads[i].d < 0) // When we go through input, we might find zero-length inputs, and for these, we cannot have any outgrads.
1134
0
      continue;
1135
2.25k
    const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]);
1136
2.25k
    const int* tos;
1137
2.25k
    int to_size;
1138
2.25k
    ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size);
1139
2.25k
    if (to_size == 0) // If this is the end (no minimizers afterwards). We need to attach this as a destination. Otherwise this is covered in update_nodes.
1140
14
    {
1141
14
      const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph);
1142
14
      const int destination_count = ccv_nnc_symbolic_graph_destination_size(model->graph);
1143
14
      int flag = 0;
1144
14
      const int outgrad_destination_start = ccv_max(0, destination_count - i);
1145
16
      for (j = i - 1; !flag && 
j >= 014
;
j--2
)
1146
2
        if (j + outgrad_destination_start < destination_count)
1147
2
          flag = (destinations[j + outgrad_destination_start].d == outgrad.d);
1148
14
      if (!flag) // Only if we cannot find it, we add it.
1149
12
        ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad);
1150
14
    }
1151
2.25k
  }
1152
2.24k
  if (parallel_count > 1)
1153
8
  {
1154
8
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1155
8
      0, 0,
1156
8
      compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */,
1157
8
      compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */,
1158
8
      0, 0, 0,
1159
8
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1160
8
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1161
8
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1162
16
    for (i = 0; i < evaluate_to_size; 
i++8
)
1163
32
      
for (j = 1; 8
j < parallel_count;
j++24
)
1164
24
      {
1165
24
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1166
24
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1167
24
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1168
24
      }
1169
8
    const int backward_to_size = compiled_data->backward.to_size;
1170
146
    for (i = 0; i < backward_to_size; 
i++138
)
1171
552
      
for (j = 1; 138
j < parallel_count;
j++414
)
1172
414
      {
1173
414
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j);
1174
414
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1175
414
          compiled_data->backward.tos[compiled_data->backward.to_size++] = copy;
1176
414
      }
1177
8
  }
1178
  // Only use memory compression if we are in gradient parameter mode.
1179
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS2.23k
)
1180
2.24k
  {
1181
2.24k
    if (model->memory_compression)
1182
0
      ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1183
2.24k
    if (model->memory_reduction)
1184
0
      ccv_nnc_symbolic_graph_memory_reduction(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1185
2.24k
  }
1186
2.24k
  compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size);
1187
2.24k
  compiled_data->gradient_mode = gradient_mode;
1188
2.24k
}
1189
1190
void ccv_cnnp_model_tensors_init_0(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1191
99
{
1192
99
  assert(!compiled_data->tensors.parameters);
1193
99
  const int parameter_size = compiled_data->parameters->rnum;
1194
99
  const int parallel_count = ccv_max(model->parallel_count, 1);
1195
99
  const int internal_size = compiled_data->internals->rnum;
1196
99
  compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph);
1197
99
  compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t));
1198
99
  compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)cccalloc((parameter_size + internal_size) * parallel_count, sizeof(ccv_nnc_tensor_t*));
1199
99
  compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count;
1200
99
}
1201
1202
int ccv_cnnp_model_tensors_any_to_alloc(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1203
37
{
1204
37
  int i, j;
1205
37
  const int parameter_size = compiled_data->parameters->rnum;
1206
37
  const int parallel_count = ccv_max(model->parallel_count, 1);
1207
37
  const int internal_size = compiled_data->internals->rnum;
1208
82
  for (i = 0; i < parameter_size; 
i++45
)
1209
66
  {
1210
    // parameters has to be allocated all together.
1211
66
    if (compiled_data->tensors.parameters[i])
1212
45
    {
1213
45
      for (j = 1; j < parallel_count; 
j++0
)
1214
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1215
45
      continue;
1216
45
    }
1217
21
    return 1;
1218
66
  }
1219
16
  for (i = 0; i < internal_size; 
i++0
)
1220
0
  {
1221
0
    if (!compiled_data->tensors.internals[i])
1222
0
      return 1;
1223
0
    for (j = 1; j < parallel_count; j++)
1224
0
      if (!compiled_data->tensors.internals[i + j * internal_size])
1225
0
        return 1;
1226
0
  }
1227
16
  return 0;
1228
16
}
1229
1230
void ccv_cnnp_model_tensors_init_1(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1231
82
{
1232
82
  int i, j;
1233
82
  const int parameter_size = compiled_data->parameters->rnum;
1234
82
  const int parallel_count = ccv_max(model->parallel_count, 1);
1235
82
  const int internal_size = compiled_data->internals->rnum;
1236
348
  for (i = 0; i < parameter_size; 
i++266
)
1237
266
  {
1238
    // parameters has to be allocated all together.
1239
266
    if (compiled_data->tensors.parameters[i])
1240
13
    {
1241
13
      for (j = 1; j < parallel_count; 
j++0
)
1242
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1243
13
      continue;
1244
13
    }
1245
253
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1246
253
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1247
253
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1248
68
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1249
253
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1250
253
    compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
1251
655
    for (j = 1; j < parallel_count; 
j++402
)
1252
402
    {
1253
402
      if (j != device_id)
1254
402
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1255
0
      else
1256
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1257
402
      compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1258
402
    }
1259
253
  }
1260
82
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1261
144
  for (i = 0; i < internal_size; 
i++62
)
1262
62
  {
1263
62
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i);
1264
62
    const int d = retained.d;
1265
62
    if (init_v[d >> 5] & (1u << (d & 0x1f)))
1266
0
      continue;
1267
62
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained);
1268
62
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1269
7
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1270
62
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1271
62
    if (!compiled_data->tensors.internals[i])
1272
62
      compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0);
1273
158
    for (j = 1; j < parallel_count; 
j++96
)
1274
96
    {
1275
96
      if (j != device_id)
1276
96
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1277
0
      else
1278
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1279
96
      if (!compiled_data->tensors.internals[i + j * internal_size])
1280
96
        compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0);
1281
96
    }
1282
62
  }
1283
82
  compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); // Remove 1 if any.
1284
82
}
1285
1286
static void _ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1287
77
{
1288
77
  ccv_cnnp_model_tensors_init_0(model, compiled_data);
1289
77
  ccv_cnnp_model_tensors_init_1(model, compiled_data);
1290
77
}
1291
1292
static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1293
6
{
1294
6
  assert(parallel_count > 0);
1295
6
  int i, j;
1296
12
  for (i = 0; i < tensor_size; 
i++6
)
1297
6
  {
1298
6
    if (!tensors[i])
1299
0
      continue;
1300
6
    const int d = tensor_symbols[i].d;
1301
6
    if (!(tensors_init[d >> 5] & (1u << (d & 0x1f))))
1302
0
      continue;
1303
24
    
for (j = 1; 6
j < parallel_count;
j++18
)
1304
18
      if (tensors[i + j * tensor_size])
1305
18
      {
1306
18
        ccv_nnc_tensor_t* const input = CCV_NNC_TENSOR(tensors[i]);
1307
18
        ccv_nnc_tensor_t* const output = CCV_NNC_TENSOR(tensors[i + j * tensor_size]);
1308
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &input, 1, &output, 1, 0);
1309
18
      }
1310
6
  }
1311
6
}
1312
1313
static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count)
1314
104
{
1315
104
  assert(parallel_count > 0);
1316
104
  int i, j;
1317
167
  for (i = 0; i < tensor_size; 
i++63
)
1318
63
  {
1319
63
    const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1320
159
    for (j = 1; j < parallel_count; 
j++96
)
1321
96
    {
1322
96
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1323
96
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1324
96
      if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
1325
0
      { // We shouldn't allocate this, free it up.
1326
0
        ccv_nnc_tensor_free(tensors[i + j * tensor_size]);
1327
0
        tensors[i + j * tensor_size] = 0;
1328
0
      }
1329
96
    }
1330
63
  }
1331
104
}
1332
1333
static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds)
1334
556
{
1335
556
  assert(parallel_count > 0);
1336
556
  int i, j;
1337
1.95k
  for (i = 0; i < tensor_size; 
i++1.39k
)
1338
1.39k
  {
1339
1.39k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1340
1.39k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1341
7
      continue;
1342
1.38k
    if (graph)
1343
1.38k
    {
1344
1.38k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1345
1.38k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1346
0
        tensor_symbol = alias_to;
1347
1.38k
    }
1348
1.38k
    ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(tensors[i]);
1349
1.38k
    if (tensor && 
tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL1.38k
)
1350
1.38k
    {
1351
1.38k
      const ccv_nnc_tensor_bind_t retained_bind = {
1352
1.38k
        .symbol = tensor_symbol,
1353
1.38k
        .tensor = tensor
1354
1.38k
      };
1355
1.38k
      ccv_array_push(tensor_binds, &retained_bind);
1356
1.38k
    }
1357
2.93k
    for (j = 1; j < parallel_count; 
j++1.54k
)
1358
1.54k
    {
1359
1.54k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1360
1.54k
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1361
1.54k
      if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1362
1.54k
      {
1363
1.54k
        const ccv_nnc_tensor_bind_t bind = {
1364
1.54k
          .symbol = copy,
1365
1.54k
          .tensor = tensors[i + j * tensor_size]
1366
1.54k
        };
1367
1.54k
        ccv_array_push(tensor_binds, &bind);
1368
1.54k
      }
1369
1.54k
    }
1370
1.38k
  }
1371
556
}
1372
1373
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data)
1374
2.41k
{
1375
2.41k
  if (compiled_data->graph)
1376
104
    ccv_nnc_graph_free(compiled_data->graph);
1377
2.41k
  compiled_data->graph = 0;
1378
2.41k
  compiled_data->is_test = 0;
1379
2.41k
  if (compiled_data->tensor_arena)
1380
104
    ccv_nnc_tensor_arena_free(compiled_data->tensor_arena);
1381
2.41k
  compiled_data->tensor_arena = 0;
1382
2.41k
  if (compiled_data->graph_exec_arena)
1383
104
    ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena);
1384
2.41k
  compiled_data->graph_exec_arena = 0;
1385
2.41k
  if (compiled_data->backward.from_ops)
1386
37
    ccfree(compiled_data->backward.from_ops);
1387
2.41k
  compiled_data->backward.from_ops = 0;
1388
2.41k
  if (compiled_data->evaluate.schedule)
1389
42
    ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule);
1390
2.41k
  compiled_data->evaluate.schedule = 0;
1391
2.41k
  if (compiled_data->backward.schedule)
1392
31
    ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule);
1393
2.41k
  compiled_data->backward.schedule = 0;
1394
2.41k
}
1395
1396
static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data)
1397
2.31k
{
1398
2.31k
  if (compiled_data->gradients)
1399
2.24k
    ccfree(compiled_data->gradients);
1400
2.31k
  compiled_data->gradients = 0;
1401
2.31k
  if (compiled_data->updated_parameters)
1402
2.24k
    ccfree(compiled_data->updated_parameters);
1403
2.31k
  compiled_data->updated_parameters = 0;
1404
2.31k
  compiled_data->update_nodes = 0;
1405
2.31k
  compiled_data->saved_aux = 0;
1406
2.31k
}
1407
1408
static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data)
1409
2.35k
{
1410
2.35k
  if (compiled_data->backward.gradients)
1411
5
    ccfree(compiled_data->backward.gradients);
1412
2.35k
  compiled_data->backward.gradients = 0;
1413
2.35k
  if (compiled_data->backward.accum)
1414
5
    ccv_nnc_graph_free(compiled_data->backward.accum);
1415
2.35k
  compiled_data->backward.accum = 0;
1416
2.35k
  if (compiled_data->backward.tensor_arena)
1417
5
    ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena);
1418
2.35k
  compiled_data->backward.tensor_arena = 0;
1419
2.35k
  if (compiled_data->backward.graph_exec_arena)
1420
5
    ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena);
1421
2.35k
  compiled_data->backward.graph_exec_arena = 0;
1422
2.35k
}
1423
1424
static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data)
1425
2.32k
{
1426
2.32k
  if (compiled_data->apply_gradients.graph)
1427
24
    ccv_nnc_graph_free(compiled_data->apply_gradients.graph);
1428
2.32k
  compiled_data->apply_gradients.graph = 0;
1429
2.32k
  if (compiled_data->apply_gradients.tensor_arena)
1430
24
    ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena);
1431
2.32k
  compiled_data->apply_gradients.tensor_arena = 0;
1432
2.32k
  if (compiled_data->apply_gradients.graph_exec_arena)
1433
24
    ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena);
1434
2.32k
  compiled_data->apply_gradients.graph_exec_arena = 0;
1435
2.32k
}
1436
1437
// Compile the graph to run ccv_cnnp_model_fit
1438
static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1439
8
{
1440
8
  int i, j;
1441
8
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1442
8
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE);
1443
8
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE;
1444
8
  const int parallel_count = ccv_max(model->parallel_count, 1);
1445
8
  assert(output_size == model->output_size * parallel_count);
1446
8
  assert(!fits || output_size == fit_size);
1447
8
  assert(output_size > 0);
1448
8
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1449
8
  {
1450
8
    _ccv_cnnp_model_set_rewindables(model);
1451
8
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1452
8
  } else 
if (0
compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES0
) {
1453
0
    _ccv_cnnp_model_rewind_graph(model);
1454
0
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1455
0
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1456
0
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1457
0
  }
1458
8
  const int tensors_init = !!compiled_data->tensors_init.v;
1459
8
  if (!tensors_init)
1460
4
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1461
4
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1462
  // Check if it is not fully allocated, if it is not, init_1.
1463
3
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1464
8
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1465
8
  assert((input_size % parallel_count) == 0);
1466
8
  assert((output_size % parallel_count) == 0);
1467
8
  assert((fit_size % parallel_count) == 0);
1468
8
  const int input_size_per_p = input_size / parallel_count;
1469
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1470
8
  const int output_size_per_p = output_size / parallel_count;
1471
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1472
8
  const int fit_size_per_p = fit_size / parallel_count;
1473
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds);
1474
8
  const int parameter_size = compiled_data->parameters->rnum;
1475
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1476
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1477
8
  const int internal_size = compiled_data->internals->rnum;
1478
8
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1479
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1480
8
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1481
8
  ccv_array_free(tensor_binds);
1482
8
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1483
8
  if (tensors_init && 
parallel_count > 14
)
1484
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1485
  // If tensor is not init'ed, we need to init states first.
1486
8
  if (_ccv_cnnp_any_to_init(compiled_data))
1487
7
  {
1488
7
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1489
7
      .parallel_count = parallel_count,
1490
7
      .graph = model->graph,
1491
7
      .compiled_data = compiled_data,
1492
7
      .tensor_arena = compiled_data->tensor_arena
1493
7
    };
1494
7
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1495
7
  }
1496
8
  compiled_data->is_test = 0;
1497
8
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer);
1498
  // No need to set because it is default to training mode.
1499
  // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1500
105
  for (i = 0; i < saved_aux_size * parameter_size; 
i++97
)
1501
97
  {
1502
97
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
1503
5
      continue;
1504
92
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source);
1505
92
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1506
296
    for (j = 1; j < parallel_count; 
j++204
)
1507
204
    {
1508
204
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1509
204
      if (copy)
1510
204
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1511
204
    }
1512
92
  }
1513
8
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1514
8
  compiled_data->evaluate.to_op_size = 0;
1515
22
  for (i = 0; i < evaluate_to_size; 
i++14
)
1516
14
  {
1517
14
    ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1518
14
    if (to.graph)
1519
14
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to;
1520
14
  }
1521
8
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1522
8
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1523
8
}
1524
1525
ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model)
1526
0
{
1527
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1528
0
  if (!compiled_data || !compiled_data->graph)
1529
0
    return 0;
1530
0
  return ccv_nnc_graph_default_stream(compiled_data->graph);
1531
0
}
1532
1533
uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model)
1534
0
{
1535
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1536
0
  if (!compiled_data || !compiled_data->tensor_arena)
1537
0
    return 0;
1538
0
  return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena);
1539
0
}
1540
1541
static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1542
38.9k
{
1543
38.9k
  int i, j;
1544
114k
  for (i = 0; i < tensor_size; 
i++75.6k
)
1545
75.6k
  {
1546
75.6k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1547
75.6k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1548
0
      continue;
1549
75.6k
    if (graph)
1550
72.7k
    {
1551
72.7k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1552
72.7k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1553
0
        tensor_symbol = alias_to;
1554
72.7k
    }
1555
75.6k
    ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]);
1556
77.4k
    for (j = 1; j < parallel_count; 
j++1.77k
)
1557
1.77k
    {
1558
1.77k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1559
1.77k
      if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1560
1.77k
        ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]);
1561
1.77k
    }
1562
75.6k
  }
1563
38.9k
}
1564
1565
void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1566
2.54k
{
1567
2.54k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1568
2.54k
  assert(compiled_data);
1569
2.54k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1570
2.54k
  assert(output_size == model->output_size * parallel_count);
1571
2.54k
  assert(input_size == model->input_size * parallel_count);
1572
2.54k
  assert(!fits || fit_size == output_size);
1573
2.54k
  assert(model->graph);
1574
2.54k
  if (!compiled_data->graph || 
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE2.53k
)
1575
8
  {
1576
8
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1577
8
    _ccv_cnnp_compiled_data_backward_free(compiled_data);
1578
8
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
1579
    // Compile the symbolic graph down only when needed.
1580
8
    _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size);
1581
2.53k
  } else {
1582
2.53k
    assert((input_size % parallel_count) == 0);
1583
2.53k
    assert((output_size % parallel_count) == 0);
1584
2.53k
    assert((fit_size % parallel_count) == 0);
1585
2.53k
    const int input_size_per_p = input_size / parallel_count;
1586
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1587
2.53k
    const int output_size_per_p = output_size / parallel_count;
1588
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1589
2.53k
    const int fit_size_per_p = fit_size / parallel_count;
1590
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count);
1591
2.53k
  }
1592
2.54k
  if (compiled_data->is_test)
1593
0
  {
1594
0
    compiled_data->is_test = 0;
1595
0
    ccv_nnc_graph_exec_update_t update = {
1596
0
      .parallel_count = parallel_count,
1597
0
      .graph = model->graph,
1598
0
      .graph_exec_arena = compiled_data->graph_exec_arena,
1599
0
    };
1600
0
    ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1601
0
  }
1602
2.54k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1603
2.54k
}
1604
1605
// Compile the graph to run ccv_cnnp_model_evaluate with require_grad = false (MULTISTAGE_MODE_NO_GRAD).
1606
static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1607
59
{
1608
59
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1609
59
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD;
1610
59
  const int parallel_count = ccv_max(model->parallel_count, 1);
1611
59
  assert(output_size == model->output_size * parallel_count);
1612
59
  assert(output_size > 0);
1613
  // If the gradient is not initialized, continue to setup parallel process. We don't init gradient here, but rather,
1614
  // we setup proper rewindables so the graph can be rewinded to previous state before we run data parallel.
1615
59
  if (parallel_count > 1 && 
compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE6
)
1616
6
  {
1617
6
    const int evaluate_to_size = compiled_data->evaluate.to_size;
1618
6
    compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1619
6
    _ccv_cnnp_model_set_rewindables(model);
1620
6
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1621
6
      0, 0,
1622
6
      0, 0, 0,
1623
6
      0, 0, 0,
1624
6
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1625
6
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1626
6
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1627
6
    int i, j;
1628
12
    for (i = 0; i < evaluate_to_size; 
i++6
)
1629
24
      
for (j = 1; 6
j < parallel_count;
j++18
)
1630
18
      {
1631
18
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1632
18
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1633
18
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1634
18
      }
1635
6
  }
1636
59
  const int tensors_init = !!compiled_data->tensors_init.v;
1637
59
  if (!tensors_init)
1638
35
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1639
24
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1640
  // Check if it is not fully allocated, if it is not, init_1.
1641
1
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1642
59
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1643
59
  assert((input_size % parallel_count) == 0);
1644
59
  assert((output_size % parallel_count) == 0);
1645
59
  const int input_size_per_p = input_size / parallel_count;
1646
59
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1647
59
  const int output_size_per_p = output_size / parallel_count;
1648
59
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1649
59
  const int parameter_size = compiled_data->parameters->rnum;
1650
59
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1651
59
  const int internal_size = compiled_data->internals->rnum;
1652
59
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1653
59
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1654
  // If we generated gradient for the graph, only compile part of the graph because the rest is irrelevant for evaluation.
1655
59
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1656
59
  ccv_array_free(tensor_binds);
1657
59
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1658
  // If tensor is not init'ed, we need to init states first.
1659
59
  if (tensors_init && 
parallel_count > 124
)
1660
6
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1661
59
  if (_ccv_cnnp_any_to_init(compiled_data))
1662
17
  {
1663
17
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1664
17
      .parallel_count = parallel_count,
1665
17
      .graph = model->graph,
1666
17
      .compiled_data = compiled_data,
1667
17
      .tensor_arena = compiled_data->tensor_arena
1668
17
    };
1669
17
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1670
17
  }
1671
59
  compiled_data->is_test = 1;
1672
59
  ccv_nnc_graph_exec_update_t update = {
1673
59
    .parallel_count = parallel_count,
1674
59
    .graph = model->graph,
1675
59
    .graph_exec_arena = compiled_data->graph_exec_arena,
1676
59
  };
1677
59
  ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
1678
59
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1679
59
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1680
59
}
1681
1682
static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1683
36
{
1684
36
  assert(!compiled_data->tensors.gradients);
1685
36
  const int parameter_size = compiled_data->parameters->rnum;
1686
36
  const int parallel_count = ccv_max(model->parallel_count, 1);
1687
36
  compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count);
1688
36
  compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count;
1689
36
  int i, j;
1690
186
  for (i = 0; i < parameter_size; 
i++150
)
1691
150
  {
1692
150
    if (compiled_data->parameter_flags && 
!(compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))6
)
1693
2
    {
1694
2
      compiled_data->tensors.gradients[i] = 0;
1695
2
      compiled_data->tensors.accum_gradients[i] = 0;
1696
2
      for (j = 1; j < parallel_count; 
j++0
)
1697
0
      {
1698
0
        compiled_data->tensors.gradients[i + j * parameter_size] = 0;
1699
0
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1700
0
      }
1701
2
      continue;
1702
2
    }
1703
148
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1704
148
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1705
148
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1706
38
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1707
148
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1708
148
    compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0);
1709
148
    compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until when we need it.
1710
328
    for (j = 1; j < parallel_count; 
j++180
)
1711
180
    {
1712
180
      if (j != device_id)
1713
180
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1714
0
      else
1715
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1716
180
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1717
180
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1718
180
    }
1719
148
  }
1720
36
}
1721
1722
static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
1723
8.04k
{
1724
8.04k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
1725
15
    return 1;
1726
8.03k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
1727
8.02k
    return 0;
1728
7
  int i;
1729
7
  for (i = 0; i < input_size; 
i++0
)
1730
7
    if (!(disable_outgrad & ((uint64_t)1 << i)))
1731
7
      return 0;
1732
0
  return 1;
1733
7
}
1734
1735
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1736
// Particularly, this method compiles the evaluation and backprop graph (the main graph).
1737
static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1738
37
{
1739
37
  int i, j;
1740
37
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1741
37
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES1
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS36
;
1742
37
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode);
1743
37
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE;
1744
37
  const int parallel_count = ccv_max(model->parallel_count, 1);
1745
37
  assert(output_size == model->output_size * parallel_count);
1746
37
  assert(output_size > 0);
1747
  // There shouldn't be a loss function if we evaluate with multistage jit.
1748
37
  assert(compiled_data->loss.cmd == CCV_NNC_NOOP);
1749
37
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1750
35
  {
1751
35
    _ccv_cnnp_model_set_rewindables(model);
1752
35
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1753
35
  } else 
if (2
compiled_data->gradient_mode != target_gradient_mode2
) {
1754
2
    _ccv_cnnp_model_rewind_graph(model);
1755
2
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1756
2
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1757
2
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1758
2
  }
1759
37
  const int tensors_init = !!compiled_data->tensors_init.v;
1760
37
  if (!tensors_init)
1761
29
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1762
8
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1763
  // Check if it is not fully allocated, if it is not, init_1.
1764
1
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1765
37
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1766
37
  assert((input_size % parallel_count) == 0);
1767
37
  assert((output_size % parallel_count) == 0);
1768
37
  const int input_size_per_p = input_size / parallel_count;
1769
37
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1770
37
  const int output_size_per_p = output_size / parallel_count;
1771
37
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1772
37
  const int parameter_size = compiled_data->parameters->rnum;
1773
37
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1774
37
  const int internal_size = compiled_data->internals->rnum;
1775
37
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1776
37
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1777
37
  if (!compiled_data->tensors.gradients)
1778
36
    _ccv_cnnp_model_gradient_tensors_init(model, compiled_data);
1779
37
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1780
37
  if (compiled_data->backward.to_size > 0)
1781
37
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1782
0
  else
1783
0
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1784
37
  ccv_array_free(tensor_binds);
1785
37
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1786
37
  if (tensors_init && 
parallel_count > 18
)
1787
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1788
  // If tensor is not init'ed, we need to init states first.
1789
37
  if (_ccv_cnnp_any_to_init(compiled_data))
1790
21
  {
1791
21
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1792
21
      .parallel_count = parallel_count,
1793
21
      .graph = model->graph,
1794
21
      .compiled_data = compiled_data,
1795
21
      .tensor_arena = compiled_data->tensor_arena
1796
21
    };
1797
21
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1798
21
  }
1799
37
  compiled_data->is_test = is_test;
1800
37
  ccv_nnc_graph_exec_update_t update = {
1801
37
    .parallel_count = parallel_count,
1802
37
    .graph = model->graph,
1803
37
    .graph_exec_arena = compiled_data->graph_exec_arena,
1804
37
  };
1805
37
  ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1806
37
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1807
37
  compiled_data->evaluate.to_op_size = 0;
1808
37
  ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0);
1809
92
  for (i = 0; i < evaluate_to_size; 
i++55
)
1810
55
  {
1811
55
    ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1812
55
    if (to_op.graph)
1813
55
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op;
1814
55
    const int* tos;
1815
55
    int to_size;
1816
55
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size);
1817
110
    for (j = 0; j < to_size; 
j++55
)
1818
55
    {
1819
55
      ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1820
55
        .d = tos[j],
1821
55
        .graph = model->graph
1822
55
      });
1823
55
      if (to_op.graph)
1824
55
        ccv_array_add_unique_int(backward_from, to_op.d);
1825
55
    }
1826
55
  }
1827
37
  assert(backward_from->rnum > 0);
1828
37
  compiled_data->backward.from_op_size = backward_from->rnum;
1829
37
  compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1830
92
  for (i = 0; i < backward_from->rnum; 
i++55
)
1831
55
    compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1832
55
      .d = *(int*)ccv_array_get(backward_from, i),
1833
55
      .graph = compiled_data->graph,
1834
55
    };
1835
  // If there are any set node (to set some tensors to 0) inserted through backward pass, these won't be executed if we just do sources -> evaluate.to_ops, backward.from_ops -> destinations. We need this logic to find out these nodes and explicitly adding them to backward.from_ops.
1836
37
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(compiled_data->graph->exec_info, 0);
1837
37
  const int exec_info_size = compiled_data->graph->exec_info->rnum;
1838
37
  uint32_t* const visited = cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t));
1839
37
  const ccv_nnc_graph_exec_t* const sources = (ccv_nnc_graph_exec_t*)ccv_array_get(compiled_data->graph->sources, 0);
1840
37
  const int source_size = compiled_data->graph->sources->rnum;
1841
74
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new37
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size, 0);
1842
628
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1843
628
    visited[(idx >> 5)] |= (1u << (idx & 31));
1844
628
  } ccv_nnc_graph_visit_endfor
1845
74
  ccv_nnc_graph_visit_free(visit);
1846
74
  const ccv_nnc_graph_exec_t* const destinations = (ccv_nnc_graph_exec_t*)
ccv_array_get37
(compiled_data->graph->destinations, 0);
1847
74
  const int destination_size = compiled_data->graph->destinations->rnum;
1848
74
  visit = 
ccv_nnc_graph_visit_new37
(compiled_data->graph, exec_info, exec_info_size, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, destinations, destination_size, 0);
1849
680
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1850
680
    visited[(idx >> 5)] |= (1u << (idx & 31));
1851
680
  } ccv_nnc_graph_visit_endfor
1852
74
  ccv_nnc_graph_visit_free(visit);
1853
74
  visit = 
ccv_nnc_graph_visit_new37
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
1854
  // Find any missing nodes to be added as source. Right now, these are only set nodes.
1855
1.36k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1856
1.36k
    if (!(visited[(idx >> 5)] & (1u << (idx & 31))))
1857
55
    {
1858
55
      assert(exec_info[idx].cmd.cmd == CCV_NNC_SET_FORWARD);
1859
55
      if (exec_info[idx].cmd.info.blas.a[0] == 0) // Special-casing for empty out the tensor set function, not for the set grad to 1 one.
1860
0
        ccv_array_add_unique_int(backward_from, idx);
1861
55
    }
1862
1.36k
  } ccv_nnc_graph_visit_endfor
1863
37
  ccv_nnc_graph_visit_free(visit);
1864
37
  ccfree(visited);
1865
37
  if (backward_from->rnum != compiled_data->backward.from_op_size) // If it doesn't match, need to redo this.
1866
0
  {
1867
0
    compiled_data->backward.from_op_size = backward_from->rnum;
1868
0
    compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccrealloc(compiled_data->backward.from_ops, sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1869
0
    for (i = 0; i < backward_from->rnum; i++)
1870
0
      compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1871
0
        .d = *(int*)ccv_array_get(backward_from, i),
1872
0
        .graph = compiled_data->graph,
1873
0
      };
1874
0
  }
1875
37
  ccv_array_free(backward_from);
1876
37
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1877
37
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1878
37
}
1879
1880
void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1881
8.00k
{
1882
8.00k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1883
8.00k
  assert(compiled_data);
1884
8.00k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1885
8.00k
  assert(output_size == model->output_size * parallel_count);
1886
8.00k
  assert(input_size == model->input_size * parallel_count);
1887
8.00k
  assert(model->graph);
1888
8.00k
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES14
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS7.99k
;
1889
8.00k
  const int mode_mismatch = (params.requires_grad && 
(7.86k
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE7.86k
||
compiled_data->gradient_mode != target_gradient_mode7.82k
||
compiled_data->disable_outgrad != params.disable_outgrad7.82k
));
1890
8.00k
  if (!compiled_data->graph || 
mode_mismatch7.91k
)
1891
96
  {
1892
96
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1893
96
    if (mode_mismatch) // If mode mismatch, we need to redo the backward as well (no need to redo apply_gradients, it doesn't require target_gradient_mode or disable_outgrad.
1894
37
      _ccv_cnnp_compiled_data_backward_free(compiled_data);
1895
96
    if (params.requires_grad)
1896
37
      _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size);
1897
59
    else
1898
59
      _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size);
1899
7.91k
  } else {
1900
7.91k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena);
1901
7.91k
    assert((input_size % parallel_count) == 0);
1902
7.91k
    const int input_size_per_p = input_size / parallel_count;
1903
7.91k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1904
7.91k
    assert((output_size % parallel_count) == 0);
1905
7.91k
    const int output_size_per_p = output_size / parallel_count;
1906
7.91k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1907
7.91k
  }
1908
8.00k
  if (compiled_data->is_test != params.is_test)
1909
64
  {
1910
64
    compiled_data->is_test = params.is_test;
1911
64
    ccv_nnc_graph_exec_update_t update = {
1912
64
      .parallel_count = parallel_count,
1913
64
      .graph = model->graph,
1914
64
      .graph_exec_arena = compiled_data->graph_exec_arena,
1915
64
    };
1916
64
    ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1917
64
  }
1918
8.00k
}
1919
1920
void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1921
8.00k
{
1922
8.00k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1923
8.00k
  assert(compiled_data);
1924
8.00k
  ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size);
1925
8.00k
  if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
1926
73
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1927
7.93k
  else {
1928
7.93k
    if (!compiled_data->evaluate.schedule)
1929
42
      compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size);
1930
7.93k
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context);
1931
7.93k
  }
1932
8.00k
}
1933
1934
// Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1935
// Particularly, this method compiles the accumulator graph.
1936
static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model)
1937
5
{
1938
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1939
5
  assert(compiled_data);
1940
5
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1941
5
  ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new();
1942
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1943
5
  const int parameter_size = compiled_data->parameters->rnum;
1944
5
  int i, j;
1945
5
  compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3);
1946
5
  compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count;
1947
5
  compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count;
1948
20
  for (i = 0; i < parameter_size; 
i++15
)
1949
30
    
for (j = 0; 15
j < parallel_count;
j++15
)
1950
15
      if (compiled_data->tensors.gradients[i + j * parameter_size])
1951
15
      {
1952
15
        const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info;
1953
        // Now, the old gradient is the accumulated gradient, getting new gradient tensor setup so we can collect them.
1954
15
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size];
1955
15
        compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1956
15
        ccv_nnc_tensor_symbol_t inputs[2];
1957
15
        inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1958
15
        inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1959
15
        ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1960
15
        ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
1961
15
      } else {
1962
0
        compiled_data->backward.accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1963
0
        compiled_data->backward.gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1964
0
        compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1965
0
      }
1966
5
  ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1967
5
  if (ccv_nnc_symbolic_graph_source_size(accum) == 0)
1968
0
  {
1969
0
    ccv_nnc_symbolic_graph_free(accum);
1970
    // Create empty graph.
1971
0
    compiled_data->backward.accum = ccv_nnc_graph_new();
1972
0
    ccv_nnc_graph_topsort(compiled_data->backward.accum, 0, 0);
1973
0
    return;
1974
0
  }
1975
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1976
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1977
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds);
1978
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1979
5
  ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena);
1980
5
  ccv_nnc_symbolic_graph_free(accum);
1981
5
  ccv_array_free(tensor_binds);
1982
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type, model->max_stream_count);
1983
5
}
1984
1985
void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1986
7.91k
{
1987
7.91k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1988
7.91k
  assert(compiled_data);
1989
7.91k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1990
7.91k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1991
7.91k
  assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count);
1992
7.91k
  if (outgrad_size > 0)
1993
2.51k
    { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); }
1994
7.91k
  assert(model->graph);
1995
7.91k
  assert(compiled_data->graph);
1996
7.91k
  const int parameter_size = compiled_data->parameters->rnum;
1997
  // If we need to accumulate the gradients now, do jit on accumulator.
1998
7.91k
  if (compiled_data->backward.count > 0)
1999
1.71k
  {
2000
1.71k
    if (!compiled_data->backward.accum)
2001
5
      _ccv_cnnp_model_multistage_jit_1(model);
2002
1.71k
    else if (compiled_data->backward.count == 1) {
2003
      //  On this round, we need to switch accumulated gradients with gradients (so we can do accumulation properly).
2004
496
      int i;
2005
1.48k
      for (i = 0; i < parameter_size * parallel_count; 
i++986
)
2006
986
      {
2007
986
        ccv_nnc_tensor_t* tensor;
2008
986
        CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor);
2009
986
      }
2010
496
      if (compiled_data->backward.tensor_arena)
2011
496
      {
2012
496
        ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena);
2013
        // Do rebind in case we messed up the binding (we switch accum_gradients and gradients).
2014
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1);
2015
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
2016
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
2017
496
      }
2018
496
    }
2019
1.71k
  }
2020
7.91k
  const int ingrad_size_per_p = model->output_size;
2021
7.91k
  const int outgrad_size_per_p = compiled_data->outgrad_size;
2022
7.91k
  int i, j;
2023
15.8k
  for (i = 0; i < ingrad_size_per_p; 
i++7.91k
)
2024
7.91k
  {
2025
7.91k
    const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
2026
7.91k
    if (!ingrad_size || 
!ingrads3.79k
||
ingrads[i] == 03.79k
)
2027
4.22k
    {
2028
      // Set it to 1 if it is not specified.
2029
4.22k
      ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad);
2030
4.22k
      if (ingrad_tensor)
2031
4.22k
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2032
4.34k
      for (j = 1; j < parallel_count; 
j++120
)
2033
120
      {
2034
120
        ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j));
2035
120
        if (ingrad_tensor)
2036
120
          ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2037
120
      }
2038
4.22k
    } else {
2039
      // Make sure the length matches, in case it is an alias.
2040
3.69k
      assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad)));
2041
3.69k
      ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]);
2042
3.70k
      for (j = 1; j < parallel_count; 
j++6
)
2043
6
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]);
2044
3.69k
    }
2045
7.91k
  }
2046
7.91k
  if (outgrad_size > 0)
2047
2.51k
  {
2048
2.51k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad");
2049
5.14k
    
for (i = 0; 2.51k
i < outgrad_size_per_p;
i++2.63k
)
2050
2.63k
      if (outgrads[i])
2051
2.43k
      {
2052
2.43k
        const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i];
2053
2.43k
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]);
2054
2.44k
        for (j = 1; j < parallel_count; 
j++6
)
2055
6
          ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]);
2056
2.43k
      }
2057
5.40k
  } else {
2058
5.40k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES ||
2059
5.40k
      compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS);
2060
5.40k
  }
2061
  // We need to rebind here because in ccv_cnnp_evaluate, we clear bindings, that will reset all bindings for the gradients.
2062
  // For parameters and internals these are fine because when we clear bindings, it restores to original bindings, which are these
2063
  // parameters and internals. The same cannot be said for gradients due to the accum_gradients switching.
2064
7.91k
  _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2065
7.91k
  if (!compiled_data->backward.schedule)
2066
31
    compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0);
2067
  // Run the backward pass.
2068
7.91k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context);
2069
  // If we need to run accumulation round, do that now.
2070
7.91k
  if (compiled_data->backward.count > 0)
2071
1.71k
    ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context);
2072
  // Update the count, this determines whether we need to accumulate or not.
2073
7.91k
  ++compiled_data->backward.count;
2074
7.91k
}
2075
2076
// Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE).
2077
// Particularly, this method compiles the parameter update graph.
2078
static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model)
2079
24
{
2080
24
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2081
24
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2082
24
  const int parallel_count = ccv_max(model->parallel_count, 1);
2083
24
  const int parameter_size = compiled_data->parameters->rnum;
2084
24
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
2085
24
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2086
24
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2087
  // Bind accumulated gradients.
2088
24
  if (compiled_data->backward.count > 1)
2089
4
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds);
2090
20
  else
2091
20
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
2092
24
  ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0);
2093
24
  int i, j;
2094
256
  for (i = 0; i < compiled_data->backward.to_size; 
i++232
)
2095
232
  {
2096
232
    const int* tos;
2097
232
    int to_size;
2098
232
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size);
2099
738
    for (j = 0; j < to_size; 
j++506
)
2100
506
    {
2101
      // Check if this is already show up in the backward graph, if that is the case, it won't be in the apply
2102
      // gradients graph.
2103
506
      const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
2104
506
        .d = tos[j],
2105
506
        .graph = model->graph,
2106
506
      });
2107
506
      if (!exec.graph)
2108
316
        ccv_array_add_unique_int(apply_gradients_from, tos[j]);
2109
506
    }
2110
232
  }
2111
24
  const int from_size = apply_gradients_from->rnum;
2112
24
  if (from_size == 0)
2113
0
  {
2114
0
    ccv_array_free(apply_gradients_from);
2115
0
    ccv_array_free(tensor_binds);
2116
0
    return;
2117
0
  }
2118
24
  ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size);
2119
160
  for (i = 0; i < from_size; 
i++136
)
2120
136
    froms[i] = (ccv_nnc_graph_exec_symbol_t){
2121
136
      .d = *(int*)ccv_array_get(apply_gradients_from, i),
2122
136
      .graph = model->graph
2123
136
    };
2124
24
  ccv_array_free(apply_gradients_from);
2125
  // It can only ends with updates on the parameters.
2126
24
  ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0);
2127
160
  for (i = 0;  i < parameter_size; 
i++136
)
2128
136
  {
2129
136
    if (compiled_data->update_nodes[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
2130
0
      continue;
2131
136
    ccv_array_push(tos, &compiled_data->update_nodes[i]);
2132
316
    for (j = 1; j < parallel_count; 
j++180
)
2133
180
    {
2134
180
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j);
2135
180
      ccv_array_push(tos, &copy);
2136
180
    }
2137
136
  }
2138
24
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena);
2139
24
  ccv_array_free(tos);
2140
24
  ccv_array_free(tensor_binds);
2141
24
  ccfree(froms);
2142
24
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2143
219
  for (i = 0; i < max_saved_aux_size * parameter_size; 
i++195
)
2144
195
  {
2145
    // Skip on no tensor.
2146
195
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
2147
0
      continue;
2148
195
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source);
2149
195
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
2150
543
    for (j = 1; j < parallel_count; 
j++348
)
2151
348
    {
2152
348
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
2153
348
      if (copy)
2154
348
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
2155
348
    }
2156
195
  }
2157
24
  ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type, model->max_stream_count);
2158
24
}
2159
2160
void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context)
2161
7.84k
{
2162
7.84k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2163
7.84k
  assert(compiled_data);
2164
7.84k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2165
7.84k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2166
7.84k
  assert(model->graph);
2167
7.84k
  assert(compiled_data->graph);
2168
  // Skip if there is no backward pass.
2169
7.84k
  if (compiled_data->backward.count <= 0)
2170
1.65k
    return;
2171
  // Skip if there is no parameters.
2172
6.19k
  if (compiled_data->parameters->rnum == 0)
2173
6
  {
2174
6
    compiled_data->backward.count = 0;
2175
6
    return;
2176
6
  }
2177
6.19k
  if (!compiled_data->apply_gradients.graph)
2178
24
    _ccv_cnnp_model_multistage_jit_2(model);
2179
6.16k
  else {
2180
6.16k
    const int parameter_size = compiled_data->parameters->rnum;
2181
6.16k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena);
2182
    // Change to bind accum_gradients if we do gradient accumulation (run backward more than once).
2183
6.16k
    if (compiled_data->backward.count > 1)
2184
497
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count);
2185
5.67k
    else
2186
5.67k
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2187
6.16k
  }
2188
6.19k
  if (compiled_data->apply_gradients.graph)
2189
6.19k
    ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context);
2190
  // Reset backward count to 0.
2191
6.19k
  compiled_data->backward.count = 0;
2192
6.19k
}
2193
2194
void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor)
2195
35
{
2196
35
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2197
35
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 18
:
parameter->param_sel27
;
2198
35
  assert(parameter->param_sel != 0);
2199
35
  const int tensors_init = !!compiled_data->tensors_init.v;
2200
35
  int this_tensor_init = tensors_init;
2201
35
  if (!tensors_init)
2202
19
    ccv_cnnp_model_tensors_init_0(model, compiled_data);
2203
16
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
2204
  // Check if it is not fully allocated, if it is not, init_1.
2205
15
    this_tensor_init = 0;
2206
35
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2207
35
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2208
35
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 134
:
parameter->param_ref1
;
2209
35
  if (param_ref < 0)
2210
1
    { assert(parameter_indices->rnum == 1); }
2211
34
  else
2212
34
    { assert(param_ref < parameter_indices->rnum); }
2213
35
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2214
35
  ccv_array_free(parameter_indices);
2215
35
  const int parameter_size = compiled_data->parameters->rnum;
2216
35
  assert(d >= 0);
2217
35
  assert(d < parameter_size);
2218
35
  const int parallel_count = ccv_max(model->parallel_count, 1);
2219
35
  int i;
2220
35
  if (!this_tensor_init)
2221
34
  {
2222
34
    if (compiled_data->tensors.parameters[d])
2223
0
    {
2224
0
      for (i = 1; i < parallel_count; i++)
2225
0
        { assert(compiled_data->tensors.parameters[d + i * parameter_size]); }
2226
0
      this_tensor_init = 1;
2227
34
    } else {
2228
34
      const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d);
2229
34
      ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
2230
34
      if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
2231
34
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2232
34
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
2233
34
      compiled_data->tensors.parameters[d] = ccv_nnc_tensor_new(0, info, 0);
2234
34
      for (i = 1; i < parallel_count; 
i++0
)
2235
0
      {
2236
0
        if (i != device_id)
2237
0
          CCV_TENSOR_SET_DEVICE_ID(info.type, i);
2238
0
        else
2239
0
          CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2240
0
        compiled_data->tensors.parameters[d + i * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
2241
0
      }
2242
34
    }
2243
34
  }
2244
35
  ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2245
35
  assert(dest);
2246
35
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0);
2247
35
  for (i = 1; i < parallel_count; 
i++0
)
2248
0
  {
2249
0
    ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d + i * parameter_size]);
2250
0
    if (copy_tensor)
2251
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2252
0
  }
2253
  // Mark this symbol as init'ed.
2254
35
  const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d;
2255
35
  uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2256
35
  init_v[s >> 5] |= (1u << (s & 0x1f));
2257
  // If we just allocated this tensor, now it is time to check if we need to mark it as fully allocated.
2258
35
  if (!this_tensor_init)
2259
34
  {
2260
34
    if (ccv_cnnp_model_tensors_any_to_alloc(model, compiled_data))
2261
20
      compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)compiled_data->tensors_init.v | (uintptr_t)1);
2262
14
    else // Remove the flag.
2263
14
      compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2264
34
  }
2265
35
}
2266
2267
void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor)
2268
6
{
2269
6
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2270
6
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 13
:
parameter->param_sel3
;
2271
6
  assert(parameter->param_sel != 0);
2272
6
  assert(compiled_data->tensors.parameters);
2273
6
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2274
6
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2275
6
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 13
:
parameter->param_ref3
;
2276
6
  if (param_ref < 0)
2277
3
    { assert(parameter_indices->rnum == 1); }
2278
3
  else
2279
3
    { assert(param_ref < parameter_indices->rnum); }
2280
6
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2281
6
  ccv_array_free(parameter_indices);
2282
6
  const int parameter_size = compiled_data->parameters->rnum;
2283
6
  assert(d >= 0);
2284
6
  assert(d < parameter_size);
2285
  // We don't need to consider parallel_count, every parameter on each device is identical.
2286
6
  ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2287
6
  assert(src);
2288
6
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0);
2289
6
}
2290
2291
ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2292
1
{
2293
1
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2294
1
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 10
: parameter->param_sel;
2295
1
  assert(parameter->param_sel != 0);
2296
1
  assert(compiled_data->tensors.parameters);
2297
1
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2298
1
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2299
1
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 10
: parameter->param_ref;
2300
1
  if (param_ref < 0)
2301
1
    { assert(parameter_indices->rnum == 1); }
2302
0
  else
2303
0
    { assert(param_ref < parameter_indices->rnum); }
2304
1
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2305
1
  ccv_array_free(parameter_indices);
2306
1
  const int parameter_size = compiled_data->parameters->rnum;
2307
1
  assert(d >= 0);
2308
1
  assert(d < parameter_size);
2309
  // We don't need to consider parallel_count, every parameter on each device is identical.
2310
1
  ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2311
1
  assert(tensor);
2312
1
  return tensor->info;
2313
1
}
2314
2315
const char* ccv_cnnp_model_parameter_name(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2316
2
{
2317
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2318
2
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : 
parameter->param_sel0
;
2319
2
  assert(parameter->param_sel != 0);
2320
2
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2321
2
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2322
2
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : 
parameter->param_ref0
;
2323
2
  if (param_ref < 0)
2324
0
    { assert(parameter_indices->rnum == 1); }
2325
2
  else
2326
2
    { assert(param_ref < parameter_indices->rnum); }
2327
2
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2328
2
  ccv_array_free(parameter_indices);
2329
2
  const int parameter_size = compiled_data->parameters->rnum;
2330
2
  assert(d >= 0);
2331
2
  assert(d < parameter_size);
2332
2
  return *(char**)ccv_array_get(compiled_data->ids.parameters, d);
2333
2
}
2334
2335
int ccv_cnnp_model_parameter_count(ccv_cnnp_model_t* const model)
2336
0
{
2337
0
  assert(model->compiled_data);
2338
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2339
0
  return compiled_data->parameters->rnum;
2340
0
}
2341
2342
uint64_t ccv_cnnp_model_parameters_size(ccv_cnnp_model_t* const model)
2343
0
{
2344
0
  assert(model->compiled_data);
2345
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2346
0
  const int parameter_size = compiled_data->parameters->rnum;
2347
0
  int i;
2348
0
  const ccv_nnc_symbolic_graph_t* const graph = model->graph;
2349
0
  uint64_t size = 0;
2350
0
  const int tensors_init = !!compiled_data->tensors_init.v;
2351
0
  uint32_t* const init_v = tensors_init ? CCV_NNC_INIT_V(compiled_data->tensors_init.v) : 0;
2352
0
  for (i = 0; i < parameter_size; i++)
2353
0
  {
2354
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2355
0
    if (tensors_init && compiled_data->tensors.parameters && (init_v[d >> 5] | (1u << (d & 0x1f))) && compiled_data->tensors.parameters[i])
2356
0
    {
2357
0
      ccv_nnc_tensor_param_t params = compiled_data->tensors.parameters[i]->info;
2358
0
      size += ccv_nnc_tensor_data_size(params);
2359
0
      continue;
2360
0
    }
2361
0
    ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(graph, (ccv_nnc_tensor_symbol_t){
2362
0
      .graph = graph,
2363
0
      .d = d
2364
0
    });
2365
0
    size += ccv_nnc_tensor_data_size(params);
2366
0
  }
2367
0
  return size;
2368
0
}
2369
2370
int ccv_cnnp_model_parameters_move(ccv_cnnp_model_t* const model, char** const names, ccv_nnc_tensor_t** const tensors, const int count, int type)
2371
3
{
2372
3
  assert(model->compiled_data);
2373
3
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2374
3
  if (count != compiled_data->parameters->rnum)
2375
0
    return 0;
2376
3
  if (CCV_TENSOR_GET_DEVICE(type) == CCV_COMPUTE_DEVICE_ANY)
2377
0
    CCV_TENSOR_SET_DEVICE_ID(type, 0);
2378
3
  int i;
2379
  // We don't need to consider parallel_count, every parameter on each device is identical.
2380
6
  for (i = 0; i < count; 
i++3
)
2381
3
  {
2382
3
    ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i];
2383
3
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned. We don't do anything.
2384
0
    {
2385
0
      tensors[i] = 0;
2386
0
      continue;
2387
0
    }
2388
3
    tensor = CCV_NNC_TENSOR(tensor);
2389
3
    if (tensor->info.type == type)
2390
3
      tensors[i] = tensor;
2391
0
    else {
2392
0
      ccv_nnc_tensor_param_t info = tensor->info;
2393
0
      info.type = type;
2394
0
      tensors[i] = ccv_nnc_tensor_new(0, info, 0); // Create this tensor, don't initiate copy yet.
2395
0
    }
2396
3
  }
2397
6
  for (i = 0; i < count; 
i++3
)
2398
3
  {
2399
3
    ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i];
2400
3
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned. We don't do anything.
2401
0
      continue;
2402
3
    tensor = CCV_NNC_TENSOR(tensor);
2403
    // Now initiate transfer. We should do this one on a stream.
2404
3
    if (tensor->info.type != type)
2405
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(tensors[i]), 0);
2406
3
  }
2407
  // Copy names and remove parameters.
2408
6
  for (i = 0; i < count; 
i++3
)
2409
3
  {
2410
3
    ccv_nnc_tensor_t* const tensor = compiled_data->tensors.parameters[i];
2411
3
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned. We don't do anything.
2412
0
    {
2413
0
      names[i] = 0;
2414
0
      continue;
2415
0
    }
2416
3
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2417
3
    const size_t name_len = ccv_min(strnlen(name, 1023), 1023);
2418
3
    names[i] = ccmalloc(name_len + 1);
2419
3
    names[i][name_len] = 0;
2420
3
    memcpy(names[i], name, name_len);
2421
3
    if (tensor->info.type == type)
2422
3
      compiled_data->tensors.parameters[i] = 0; // Only move when it is moved.
2423
3
  }
2424
3
  return 1;
2425
3
}
2426
2427
KHASH_MAP_INIT_STR(ccv_cnnp_parameter_id, int)
2428
2429
void ccv_cnnp_model_set_parameters_from_key_values(ccv_cnnp_model_t* const model, char* const* const names, ccv_nnc_tensor_t** const tensors, const int count, const int invalidates)
2430
2
{
2431
2
  assert(model->compiled_data);
2432
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2433
2
  int i;
2434
2
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2435
2
  if (count != compiled_data->parameters->rnum)
2436
0
  {
2437
0
    id_map = kh_init(ccv_cnnp_parameter_id);
2438
    // Build the map between name and the index.
2439
0
    for (i = 0; i < count; i++)
2440
0
    {
2441
0
      int ret;
2442
0
      const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[i], &ret);
2443
0
      assert(ret != 0);
2444
0
      kh_val(id_map, k) = i;
2445
0
    }
2446
0
  }
2447
2
  const int parameter_size = compiled_data->parameters->rnum;
2448
2
  int* copy_back = 0;
2449
2
  const int tensors_init = !!compiled_data->tensors_init.v;
2450
2
  if (!tensors_init)
2451
1
    ccv_cnnp_model_tensors_init_0(model, compiled_data);
2452
2
  const int parallel_count = ccv_max(model->parallel_count, 1);
2453
2
  uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2454
4
  for (i = 0; i < parameter_size; 
i++2
)
2455
2
  {
2456
2
    int j = i;
2457
2
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2458
2
    if (i >= 0 || 
strncmp(name, names[i], 1023) != 00
)
2459
2
    {
2460
      // Build the map.
2461
2
      if (id_map == 0)
2462
2
      {
2463
2
        id_map = kh_init(ccv_cnnp_parameter_id);
2464
4
        for (j = 0; j < count; 
j++2
)
2465
2
        {
2466
2
          int ret;
2467
2
          const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[j], &ret);
2468
2
          assert(ret != 0);
2469
2
          kh_val(id_map, k) = j;
2470
2
        }
2471
2
      }
2472
2
      const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, name);
2473
2
      if (k == kh_end(id_map)) // Cannot find the name, skip.
2474
0
        continue;
2475
2
      j = kh_val(id_map, k);
2476
2
    }
2477
2
    if (compiled_data->tensors.parameters[i]) // Cannot be a shared parameter to read.
2478
0
      { assert(!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)); }
2479
2
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
2480
2
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
2481
2
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
2482
1
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2483
2
    const int d = parameter.d;
2484
2
    if (info.type == tensors[j]->info.type && invalidates) // Can move.
2485
1
    {
2486
      // Deallocate it if needed.
2487
1
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
2488
1
        if (compiled_data->tensors.parameters[i])
2489
0
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
2490
1
      compiled_data->tensors.parameters[i] = tensors[j];
2491
1
      tensors[j] = 0;
2492
1
    } else {
2493
1
      if (!compiled_data->tensors.parameters[i])
2494
1
      { // Not allocated, to allocate first.
2495
        // Create new one, make sure we create this by having the right parameters.
2496
1
        const int type = info.type;
2497
1
        info = tensors[j]->info;
2498
1
        info.type = type; // Revert back the type.
2499
1
        compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
2500
1
      }
2501
1
      if (!copy_back)
2502
1
        copy_back = (int*)cccalloc(parameter_size, sizeof(int));
2503
1
      copy_back[i] = j + 1;
2504
1
    }
2505
2
    init_v[d >> 5] |= (1u << (d & 0x1f));
2506
    // Create this tensor for other data parallel allocations.
2507
2
    info = compiled_data->tensors.parameters[i]->info; // In case we loaded a different info.
2508
2
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
2509
2
    for (j = 1; j < parallel_count; 
j++0
)
2510
0
      if (!compiled_data->tensors.parameters[i + j * parameter_size])
2511
0
      {
2512
0
        if (j != device_id)
2513
0
          CCV_TENSOR_SET_DEVICE_ID(info.type, j);
2514
0
        else
2515
0
          CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2516
0
        compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
2517
0
      }
2518
      // No need to copy over, this is done in ccv_cnnp_model.c's copy_tensors method.
2519
2
  }
2520
2
  if (id_map)
2521
2
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2522
  // Now do the transfer.
2523
2
  if (copy_back)
2524
1
  {
2525
2
    for (i = 0; i < parameter_size; 
i++1
)
2526
1
    {
2527
1
      ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[i]);
2528
1
      if (copy_back[i] == 0)
2529
0
        continue;
2530
1
      const int j = copy_back[i] - 1;
2531
1
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensors[j]), TENSOR_LIST(tensor), 0);
2532
1
    }
2533
1
    ccfree(copy_back);
2534
1
  }
2535
2
}
2536
2537
ccv_cnnp_model_io_t ccv_cnnp_model_parameter_first(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f first, void* const context)
2538
0
{
2539
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2540
0
  assert(compiled_data);
2541
0
  const int parameter_size = compiled_data->parameters->rnum;
2542
0
  int i;
2543
0
  for (i = 0; i < parameter_size; i++)
2544
0
  {
2545
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2546
0
    if (first(model, name, context))
2547
0
      return ccv_cnnp_model_parameters(model, -1, i);
2548
0
  }
2549
0
  return 0;
2550
0
}
2551
2552
ccv_array_t* ccv_cnnp_model_parameters_filter(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f filter, void* const context)
2553
0
{
2554
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2555
0
  assert(compiled_data);
2556
0
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 0, 0);
2557
0
  const int parameter_size = compiled_data->parameters->rnum;
2558
0
  int i;
2559
0
  for (i = 0; i < parameter_size; i++)
2560
0
  {
2561
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2562
0
    if (filter(model, name, context))
2563
0
    {
2564
0
      ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i);
2565
0
      ccv_array_push(parameters, &parameter);
2566
0
    }
2567
0
  }
2568
0
  return parameters;
2569
2570
0
}
2571
2572
CCV_WARN_UNUSED(ccv_cnnp_model_io_t) ccv_cnnp_model_parameter_first_uninit(ccv_cnnp_model_t* const model)
2573
0
{
2574
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2575
0
  assert(compiled_data);
2576
0
  const int tensors_init = !!compiled_data->tensors_init.v;
2577
0
  if (!tensors_init) // If nothing initialized, we return parameter 0.
2578
0
    return ccv_cnnp_model_parameters(model, -1, 0);
2579
0
  const int parameter_size = compiled_data->parameters->rnum;
2580
0
  int i;
2581
0
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2582
0
  for (i = 0; i < parameter_size; i++)
2583
0
  {
2584
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2585
0
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
2586
0
      return ccv_cnnp_model_parameters(model, -1, i);
2587
0
  }
2588
0
  return 0;
2589
0
}
2590
2591
static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref)
2592
49
{
2593
49
  const int to_param_sel = parameters->param_sel > 0 ? 
parameters->param_sel - 10
: parameters->param_sel;
2594
49
  assert(parameters->param_sel != 0);
2595
49
  ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2596
49
  ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices);
2597
49
  *param_ref = parameters->param_ref > 0 ? 
parameters->param_ref - 10
: parameters->param_ref;
2598
49
  return to_parameter_indices;
2599
49
}
2600
2601
static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref, const int only_init_0)
2602
14
{
2603
  // If the model is not compiled yet. Compile them now.
2604
14
  if (!model->graph)
2605
3
  {
2606
3
    model->graph = ccv_nnc_symbolic_graph_new();
2607
3
    assert(from_model->compiled_data);
2608
3
    const int input_size = from_model->input_size;
2609
3
    ccv_nnc_tensor_param_t input_params[input_size];
2610
3
    int i;
2611
9
    for (i = 0; i < input_size; 
i++6
)
2612
6
      input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]);
2613
3
    _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss);
2614
3
    model->parallel_count = from_model->parallel_count;
2615
3
    model->memory_compression = from_model->memory_compression;
2616
3
    model->memory_reduction = from_model->memory_reduction;
2617
3
    model->gradient_checkpointing = from_model->gradient_checkpointing;
2618
3
    model->compiled_data->stream_type = from_model->compiled_data->stream_type;
2619
3
    model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer;
2620
3
    model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size;
2621
3
  }
2622
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2623
14
  assert(to_compiled_data);
2624
14
  const int to_tensors_init = !!to_compiled_data->tensors_init.v;
2625
14
  if (!to_tensors_init)
2626
10
  {
2627
10
    if (only_init_0)
2628
1
      ccv_cnnp_model_tensors_init_0(model, to_compiled_data);
2629
9
    else
2630
9
      _ccv_cnnp_model_tensors_init(model, to_compiled_data);
2631
10
  } else 
if (4
!only_init_04
&&
(uintptr_t)to_compiled_data->tensors_init.v & (uintptr_t)13
)
2632
    // Check if it is not fully allocated, if it is not, init_1.
2633
0
      ccv_cnnp_model_tensors_init_1(model, to_compiled_data);
2634
14
  assert(to_compiled_data->tensors.parameters);
2635
14
  *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref);
2636
14
  *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref);
2637
14
  if (*from_param_ref < 0 && *param_ref >= 0)
2638
0
    { assert((*from_parameter_indices)->rnum == 1); }
2639
14
  else if (*from_param_ref >= 0)
2640
0
    { assert(*from_param_ref < (*from_parameter_indices)->rnum); }
2641
14
  if (*param_ref < 0 && *from_param_ref >= 0)
2642
0
    { assert((*parameter_indices)->rnum == 1); }
2643
14
  else if (*param_ref >= 0)
2644
0
    { assert(*param_ref < (*parameter_indices)->rnum); }
2645
14
}
2646
2647
void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2648
9
{
2649
9
  ccv_array_t* to_parameter_indices;
2650
9
  int to_param_ref;
2651
9
  ccv_array_t* from_parameter_indices;
2652
9
  int from_param_ref;
2653
9
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2654
  // Should be exactly the same tensor.
2655
9
  if (to_param_ref < 0 && from_param_ref < 0)
2656
9
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2657
  // To models.
2658
9
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2659
9
  assert(to_compiled_data);
2660
  // From models.
2661
9
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2662
9
  const int parallel_count = ccv_max(model->parallel_count, 1);
2663
9
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2664
9
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2665
9
  int i, j;
2666
9
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2667
9
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2668
18
  for (i = 0; i < rnum; 
i++9
)
2669
9
  {
2670
9
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2671
9
    assert(src_d >= 0);
2672
9
    assert(src_d < from_compiled_data->parameters->rnum);
2673
9
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2674
    // If the original is not init'ed. We cannot copy from.
2675
9
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2676
0
      continue;
2677
9
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2678
9
    assert(dest_d >= 0);
2679
9
    assert(dest_d < to_compiled_data->parameters->rnum);
2680
9
    ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2681
9
    assert(src);
2682
9
    ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2683
9
    assert(dest);
2684
9
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0);
2685
27
    for (j = 1; j < parallel_count; 
j++18
)
2686
18
    {
2687
18
      ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2688
18
      if (copy_tensor)
2689
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2690
18
    }
2691
    // Mark this symbol as init'ed.
2692
9
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2693
9
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2694
9
  }
2695
9
  ccv_array_free(to_parameter_indices);
2696
9
  ccv_array_free(from_parameter_indices);
2697
9
}
2698
2699
void ccv_cnnp_model_share_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_cnnp_model_parameters_renamer_f renamer, void* const context)
2700
2
{
2701
2
  ccv_array_t* to_parameter_indices;
2702
2
  int to_param_ref;
2703
2
  ccv_array_t* from_parameter_indices;
2704
2
  int from_param_ref;
2705
2
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 1);
2706
  // Should be exactly the same tensor.
2707
2
  if (renamer == 0 && 
to_param_ref < 01
&&
from_param_ref < 01
)
2708
1
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2709
  // To models.
2710
2
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2711
2
  assert(to_compiled_data);
2712
  // From models.
2713
2
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2714
2
  const int parallel_count = ccv_max(model->parallel_count, 1);
2715
2
  assert(parallel_count == ccv_max(from_model->parallel_count, 1)); // Should have the same parallel count can share parameters.
2716
2
  const int from_parameter_size = from_compiled_data->parameters->rnum;
2717
2
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2718
2
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2719
2
  int i, j;
2720
2
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2721
2
  char* updated_name = 0;
2722
2
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2723
2
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2724
8
  for (i = 0; i < rnum; 
i++6
)
2725
6
  {
2726
6
    int src_d = (from_param_ref >= 0 ? 
from_param_ref0
: i) < from_parameter_indices->rnum ?
*(int*)4
ccv_array_get4
(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i) :
from_parameter_size2
;
2727
    // Need to figure out how to use the renamer here.
2728
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2729
6
    assert(dest_d >= 0);
2730
6
    assert(dest_d < to_parameter_size);
2731
6
    if (renamer)
2732
3
    {
2733
3
      const char* const src_name = (src_d < from_parameter_size && 
src_d >= 01
) ?
*(char**)1
ccv_array_get1
(from_compiled_data->ids.parameters, src_d) :
02
;
2734
3
      const char* const dest_name = *(char**)ccv_array_get(to_compiled_data->ids.parameters, dest_d);
2735
3
      if (!updated_name)
2736
1
        updated_name = (char*)ccmalloc(1024);
2737
3
      const size_t src_name_len = src_name == 0 ? 
02
:
ccv_min1
(strnlen(src_name, 1023), 1023);
2738
3
      if (src_name_len > 0)
2739
1
        memcpy(updated_name, src_name, src_name_len);
2740
3
      updated_name[src_name_len] = 0;
2741
3
      if (renamer(context, dest_name, updated_name, 1024) != 0)
2742
0
        continue; // Skip this.
2743
3
      if (src_name != 0 && 
memcmp(updated_name, src_name, src_name_len) == 01
&&
strnlen(updated_name, 1023) == src_name_len0
)
2744
0
      {
2745
        // Nothing changed.
2746
3
      } else {
2747
3
        if (!id_map)
2748
1
        {
2749
1
          id_map = kh_init(ccv_cnnp_parameter_id);
2750
2
          for (j = 0; j < from_parameter_size; 
j++1
)
2751
1
          {
2752
1
            int ret;
2753
1
            const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, *(char**)ccv_array_get(from_compiled_data->ids.parameters, j), &ret);
2754
1
            assert(ret != 0);
2755
1
            kh_val(id_map, k) = j;
2756
1
          }
2757
1
        }
2758
3
        const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, updated_name);
2759
3
        if (k == kh_end(id_map)) // Cannot find the name, skip.
2760
2
          continue;
2761
1
        src_d = kh_val(id_map, k);
2762
1
        assert(src_d >= 0);
2763
1
        assert(src_d < from_parameter_size);
2764
1
      }
2765
3
    }
2766
6
    assert
(src_d >= 0)4
;
2767
4
    assert(src_d < from_parameter_size);
2768
4
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2769
    // If the original is not init'ed. We cannot share from.
2770
4
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2771
0
      continue;
2772
8
    
for (j = 0; 4
j < parallel_count;
j++4
)
2773
4
    {
2774
4
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * from_parameter_size]);
2775
4
      assert(src);
2776
4
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2777
4
      if (dest && 
!((uintptr_t)dest & (uintptr_t)1)0
)
2778
0
        ccv_nnc_tensor_free(dest);
2779
4
      to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size] = (ccv_nnc_tensor_t*)((uintptr_t)src | (uintptr_t)1);
2780
4
    }
2781
    // Mark this symbol as init'ed.
2782
4
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2783
4
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2784
4
  }
2785
2
  ccv_array_free(to_parameter_indices);
2786
2
  ccv_array_free(from_parameter_indices);
2787
2
  if (id_map)
2788
1
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2789
2
  if (updated_name)
2790
1
    ccfree(updated_name);
2791
  // Mark it as incomplete so we will call init_1.
2792
2
  if (ccv_cnnp_model_tensors_any_to_alloc(model, to_compiled_data))
2793
1
    to_compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)to_compiled_data->tensors_init.v | (uintptr_t)1);
2794
1
  else // Remove the flag.
2795
1
    to_compiled_data->tensors_init.v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2796
2
}
2797
2798
ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type)
2799
24
{
2800
24
  if (!compiled_data->stream_map)
2801
4
    compiled_data->stream_map = kh_init(stream_map);
2802
24
  int ret = 0;
2803
24
  khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret);
2804
24
  assert(ret >= 0);
2805
24
  ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k);
2806
  // If ret == 0, the key already exist, we can return directly, otherwise, create and return.
2807
24
  if (ret != 0)
2808
16
  {
2809
16
    stream = ccv_nnc_stream_context_new(type);
2810
16
    kh_val(compiled_data->stream_map, k) = stream;
2811
16
  }
2812
24
  return stream;
2813
24
}
2814
2815
void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2816
3
{
2817
3
  ccv_array_t* to_parameter_indices;
2818
3
  int to_param_ref;
2819
3
  ccv_array_t* from_parameter_indices;
2820
3
  int from_param_ref;
2821
3
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2822
  // Should be exactly the same tensor.
2823
3
  if (to_param_ref < 0 && from_param_ref < 0)
2824
3
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2825
  // To models.
2826
3
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2827
3
  assert(to_compiled_data);
2828
  // From models.
2829
3
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2830
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
2831
3
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2832
3
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2833
3
  assert(aux_in_size >= 0);
2834
3
  assert(aux_out_size >= 0);
2835
3
  int i, j;
2836
3
  ccv_nnc_tensor_t* inputs[aux_in_size + 2];
2837
3
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2838
3
  for (i = 0; i < aux_in_size; 
i++0
)
2839
0
    inputs[i + 2] = aux_ins[i];
2840
3
  for (i = 0; i < aux_out_size; 
i++0
)
2841
0
    outputs[i + 1] = aux_outs[i];
2842
3
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2843
3
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2844
6
  for (i = 0; i < rnum; 
i++3
)
2845
3
  {
2846
3
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2847
3
    assert(src_d >= 0);
2848
3
    assert(src_d < from_compiled_data->parameters->rnum);
2849
3
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2850
    // If the original is not init'ed. We cannot copy from.
2851
3
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2852
0
      continue;
2853
3
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2854
3
    assert(dest_d >= 0);
2855
3
    assert(dest_d < to_compiled_data->parameters->rnum);
2856
3
    if (parallel_count > 1)
2857
2
    {
2858
2
      ccv_nnc_stream_context_t* streams[parallel_count];
2859
2
      ccv_nnc_stream_signal_t* signal;
2860
2
      if (stream_context)
2861
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2862
10
      for (j = 0; j < parallel_count; 
j++8
)
2863
8
      {
2864
8
        ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * to_parameter_size]);
2865
8
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2866
8
        if (!dest || !src)
2867
0
        {
2868
0
          streams[j] = 0;
2869
0
          continue;
2870
0
        }
2871
        // At the moment, can only handle them on the same device.
2872
8
        assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type));
2873
8
        assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type));
2874
8
        const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2875
8
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type);
2876
8
        int type = stream_type;
2877
8
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2878
8
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2879
        // Wait signal to finish.
2880
8
        if (stream_context)
2881
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2882
8
        inputs[0] = outputs[0] = dest;
2883
8
        inputs[1] = src;
2884
8
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0);
2885
8
        if (stream_context)
2886
4
        {
2887
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2888
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2889
4
        }
2890
8
        streams[j] = stream_0;
2891
8
      }
2892
      // If this should be blocking, blocking it.
2893
2
      if (!stream_context)
2894
5
        
for (j = 0; 1
j < parallel_count;
j++4
)
2895
4
          if (streams[j])
2896
4
            ccv_nnc_stream_context_wait(streams[j]);
2897
2
    } else {
2898
1
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2899
1
      assert(src);
2900
1
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2901
1
      assert(dest);
2902
1
      inputs[0] = outputs[0] = dest;
2903
1
      inputs[1] = src;
2904
1
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context);
2905
1
    }
2906
    // Mark this symbol as init'ed.
2907
3
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2908
3
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2909
3
  }
2910
3
  ccv_array_free(to_parameter_indices);
2911
3
  ccv_array_free(from_parameter_indices);
2912
3
}
2913
2914
void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2915
15
{
2916
15
  int to_param_ref;
2917
15
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2918
  // To models.
2919
15
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2920
15
  assert(to_compiled_data);
2921
  // Tensor has to be inited already.
2922
15
  assert(!!to_compiled_data->tensors_init.v);
2923
15
  assert(to_compiled_data->tensors.parameters);
2924
  // From models.
2925
15
  const int parallel_count = ccv_max(model->parallel_count, 1);
2926
15
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2927
15
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2928
15
  assert(aux_in_size >= 0);
2929
15
  assert(aux_out_size >= 0);
2930
15
  int i, j;
2931
15
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2932
15
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2933
15
  for (i = 0; i < aux_in_size; 
i++0
)
2934
0
    inputs[i + 1] = aux_ins[i];
2935
15
  for (i = 0; i < aux_out_size; 
i++0
)
2936
0
    outputs[i + 1] = aux_outs[i];
2937
30
  for (i = 0; i < rnum; 
i++15
)
2938
15
  {
2939
15
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2940
15
    assert(dest_d >= 0);
2941
15
    assert(dest_d < to_compiled_data->parameters->rnum);
2942
15
    if (parallel_count > 1)
2943
4
    {
2944
4
      ccv_nnc_stream_context_t* streams[parallel_count];
2945
4
      ccv_nnc_stream_signal_t* signal;
2946
4
      if (stream_context)
2947
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2948
20
      for (j = 0; j < parallel_count; 
j++16
)
2949
16
      {
2950
16
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2951
16
        if (!dest)
2952
0
        {
2953
0
          streams[j] = 0;
2954
0
          continue;
2955
0
        }
2956
16
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2957
16
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2958
16
        int type = stream_type;
2959
16
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2960
16
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2961
        // Wait signal to finish.
2962
16
        if (stream_context)
2963
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2964
16
        inputs[0] = outputs[0] = dest;
2965
16
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2966
16
        if (stream_context)
2967
4
        {
2968
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2969
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2970
4
        }
2971
16
        streams[j] = stream_0;
2972
16
      }
2973
      // If this should be blocking, blocking it.
2974
4
      if (!stream_context)
2975
15
        
for (j = 0; 3
j < parallel_count;
j++12
)
2976
12
          if (streams[j])
2977
12
            ccv_nnc_stream_context_wait(streams[j]);
2978
11
    } else {
2979
11
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2980
11
      assert(dest);
2981
11
      inputs[0] = outputs[0] = dest;
2982
11
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2983
11
    }
2984
    // No need to mark this symbol as init'ed, it is already.
2985
15
  }
2986
15
  ccv_array_free(to_parameter_indices);
2987
15
}
2988
2989
void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2990
6
{
2991
6
  int to_param_ref;
2992
6
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2993
  // To models.
2994
6
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2995
6
  assert(to_compiled_data);
2996
  // Tensor has to be inited already.
2997
6
  assert(!!to_compiled_data->tensors_init.v);
2998
6
  ccv_nnc_tensor_t** tensor_gradients;
2999
6
  if (to_compiled_data->backward.count > 1)
3000
3
    tensor_gradients = to_compiled_data->tensors.accum_gradients;
3001
3
  else
3002
3
    tensor_gradients = to_compiled_data->tensors.gradients;
3003
6
  assert(tensor_gradients);
3004
  // From models.
3005
6
  const int parallel_count = ccv_max(model->parallel_count, 1);
3006
6
  const int to_parameter_size = to_compiled_data->parameters->rnum;
3007
6
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
3008
6
  assert(aux_in_size >= 0);
3009
6
  assert(aux_out_size >= 0);
3010
6
  int i, j;
3011
6
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
3012
6
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
3013
10
  for (i = 0; i < aux_in_size; 
i++4
)
3014
4
    inputs[i + 1] = aux_ins[i];
3015
14
  for (i = 0; i < aux_out_size; 
i++8
)
3016
8
    outputs[i + 1] = aux_outs[i];
3017
12
  for (i = 0; i < rnum; 
i++6
)
3018
6
  {
3019
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
3020
6
    assert(dest_d >= 0);
3021
6
    assert(dest_d < to_compiled_data->parameters->rnum);
3022
6
    if (parallel_count > 1)
3023
0
    {
3024
0
      ccv_nnc_stream_context_t* streams[parallel_count];
3025
0
      ccv_nnc_stream_signal_t* signal;
3026
0
      if (stream_context)
3027
0
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
3028
0
      for (j = 0; j < parallel_count; j++)
3029
0
      {
3030
0
        ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size];
3031
0
        if (!dest)
3032
0
        {
3033
0
          streams[j] = 0;
3034
0
          continue;
3035
0
        }
3036
0
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
3037
0
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
3038
0
        int type = stream_type;
3039
0
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
3040
0
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
3041
        // Wait signal to finish.
3042
0
        if (stream_context)
3043
0
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
3044
0
        inputs[0] = outputs[0] = dest;
3045
0
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
3046
0
        if (stream_context)
3047
0
        {
3048
0
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
3049
0
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
3050
0
        }
3051
0
        streams[j] = stream_0;
3052
0
      }
3053
      // If this should be blocking, blocking it.
3054
0
      if (!stream_context)
3055
0
        for (j = 0; j < parallel_count; j++)
3056
0
          if (streams[j])
3057
0
            ccv_nnc_stream_context_wait(streams[j]);
3058
6
    } else {
3059
6
      ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d];
3060
6
      if (!dest)
3061
0
        continue;
3062
6
      assert(dest);
3063
6
      inputs[0] = outputs[0] = dest;
3064
6
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
3065
6
    }
3066
    // No need to mark this symbol as init'ed, it is already.
3067
6
  }
3068
6
  ccv_array_free(to_parameter_indices);
3069
6
}
3070
3071
void ccv_cnnp_model_parameters_to_unified_memory(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, ccv_nnc_stream_context_t* const stream_context)
3072
0
{
3073
  // Only CUDA backend has this feature.
3074
0
#ifdef HAVE_CUDA
3075
0
  int to_param_ref;
3076
0
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
3077
  // To models.
3078
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3079
0
  assert(compiled_data);
3080
  // Tensor has to be inited already.
3081
0
  assert(!!compiled_data->tensors_init.v);
3082
0
  assert(compiled_data->tensors.parameters);
3083
  // From models.
3084
0
  const int parallel_count = ccv_max(model->parallel_count, 1);
3085
0
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 1;
3086
0
  int i;
3087
0
  for (i = 0; i < rnum; i++)
3088
0
  {
3089
0
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
3090
0
    assert(dest_d >= 0);
3091
0
    assert(dest_d < compiled_data->parameters->rnum);
3092
0
    if (parallel_count > 1)
3093
0
    {
3094
0
      assert(0 && "Cannot support this when data parallel is in effect.");
3095
0
    } else {
3096
0
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[dest_d]);
3097
0
      assert(src);
3098
0
      ccv_nnc_tensor_param_t params = src->info;
3099
0
      if (CCV_TENSOR_GET_MEMORY(params.type) != CCV_TENSOR_GPU_MEMORY)
3100
0
        continue;
3101
0
      const size_t size = ccv_nnc_tensor_data_size(params);
3102
0
      if (size <= 0)
3103
0
        continue;
3104
0
      const int should_free = !((uintptr_t)compiled_data->tensors.parameters[dest_d] & (uintptr_t)1);
3105
0
      const int tfb = (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_CPU_MEMORY && params.format == CCV_TENSOR_FORMAT_NHWC && params.dim[2] > 0 && params.dim[2] <= CCV_MAX_CHANNEL && params.dim[0] > 0 && params.dim[1] > 0 && params.dim[3] == 0);
3106
0
      ccv_nnc_tensor_t* const tensor = (ccv_nnc_tensor_t*)ccmalloc(sizeof(ccv_nnc_tensor_t));
3107
0
      tensor->dataof = 0;
3108
0
      tensor->alias_ref = 0;
3109
0
      tensor->sig = 0;
3110
0
      tensor->refcount = 1;
3111
0
      tensor->info = params;
3112
0
      if (tfb)
3113
0
      {
3114
0
        tensor->type = CCV_NO_DATA_ALLOC | CCV_MATRIX_DENSE | CCV_GET_DATA_TYPE(params.datatype) | params.dim[2];
3115
        // This corresponding to mat->step
3116
0
        tensor->info.dim[4] = CCV_GET_STEP(params.dim[1], (CCV_GET_DATA_TYPE(params.datatype) | params.dim[2]));
3117
0
      } else // This won't be recognized by ccv_dense_matrix_t
3118
0
        tensor->type = CCV_NO_DATA_ALLOC | CCV_MATRIX_DENSE | CCV_GET_DATA_TYPE(params.datatype);
3119
      // Remove this flag so it can be deallocated as usual.
3120
0
      tensor->type &= ~CCV_NO_DATA_ALLOC;
3121
0
      assert(CCV_TENSOR_GET_DEVICE(params.type) != CCV_COMPUTE_DEVICE_ANY);
3122
0
      void* ptr = cumallocmanaged(CCV_TENSOR_GET_DEVICE_ID(params.type), size);
3123
0
      if (ptr) // If allocated successfully. Otherwise we go through the fallback path.
3124
0
      {
3125
0
        tensor->data.u8 = (uint8_t*)ptr;
3126
0
        tensor->type |= CCV_MAPPED_MEM; // This denotes the tensor is mapped to CPU, and would prefer a explicit prefetch call.
3127
0
      } else {
3128
        // Allocation failed.
3129
0
        ccfree(tensor);
3130
0
        continue;
3131
0
      }
3132
      // TODO: Cannot run this on the stream context yet, due to allocation and deallocations.
3133
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &src, 1, &tensor, 1, 0);
3134
0
      cumemadvisereadmostly(CCV_TENSOR_GET_DEVICE_ID(params.type), tensor->data.u8, size);
3135
0
      compiled_data->tensors.parameters[dest_d] = tensor;
3136
      // Can free out the old one.
3137
0
      if (should_free)
3138
0
        ccv_nnc_tensor_free(src);
3139
0
    }
3140
    // No need to mark this symbol as init'ed, it is already.
3141
0
  }
3142
0
  ccv_array_free(to_parameter_indices);
3143
0
#endif
3144
0
}
3145
3146
ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model)
3147
2.20k
{
3148
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3149
2.20k
  assert(compiled_data);
3150
2.20k
  return compiled_data->minimize.minimizer;
3151
2.20k
}
3152
3153
void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size)
3154
4.36k
{
3155
4.36k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3156
4.36k
  assert(compiled_data);
3157
4.36k
  const int parameter_size = compiled_data->parameters->rnum;
3158
4.36k
  if (parameter_size == 0)
3159
6
    return;
3160
4.35k
  if (reset)
3161
2.49k
    { assert(set_parameters == 0 && set_parameter_size == 0); }
3162
4.35k
  const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
3163
4.35k
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
3164
4.35k
  if (saved_aux_size > compiled_data->minimize.max_saved_aux_size)
3165
7
    compiled_data->minimize.max_saved_aux_size = saved_aux_size;
3166
4.35k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
3167
  // We update all parameters, at this point, we have one minimizer.
3168
4.35k
  if (set_parameters == 0 || 
set_parameter_size == 0301
)
3169
4.05k
    compiled_data->minimize.minimizer = minimizer;
3170
4.35k
  int i;
3171
4.35k
  if (set_parameters && 
set_parameter_size301
)
3172
301
  {
3173
    // I need to save what's the minimizer along with this.
3174
301
    if (!compiled_data->minimize.parameters)
3175
5
      compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0);
3176
301
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t));
3177
301
    set_minimizer_for_parameter->minimizer = minimizer;
3178
301
    set_minimizer_for_parameter->parameter_size = set_parameter_size;
3179
301
    memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size);
3180
301
    ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter);
3181
301
  }
3182
  // If reset is true, clear the parameters array.
3183
4.35k
  if (reset && 
compiled_data->minimize.parameters2.49k
)
3184
291
  {
3185
582
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++291
)
3186
291
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
3187
291
    ccv_array_clear(compiled_data->minimize.parameters);
3188
291
  }
3189
4.35k
  if (!compiled_data->update_nodes)
3190
9
    return;
3191
4.34k
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
3192
4.34k
  assert(symbolic_graph);
3193
4.34k
  if (saved_aux_size > old_max_saved_aux_size)
3194
7
  {
3195
7
    assert(compiled_data->updated_parameters);
3196
    // Reallocate first, move them around later.
3197
7
    compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size);
3198
7
    compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
3199
7
    compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
3200
    // We need to do this from back to front because saved_aux_size > old_saved_aux_size, it could overlap.
3201
7
    _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size);
3202
7
  }
3203
4.34k
  int flag = 0;
3204
4.34k
  const int parallel_count = ccv_max(model->parallel_count, 1);
3205
4.34k
  if (set_parameters && 
set_parameter_size296
)
3206
296
  {
3207
296
    ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
3208
592
    for (i = 0; i < set_parameter_size; 
i++296
)
3209
296
    {
3210
296
      const int param_sel = set_parameters[i]->param_sel > 0 ? 
set_parameters[i]->param_sel - 1291
:
set_parameters[i]->param_sel5
;
3211
296
      assert(set_parameters[i]->param_sel != 0);
3212
296
      const int old_rnum = parameter_indices->rnum;
3213
296
      ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices);
3214
296
      const int param_ref = set_parameters[i]->param_ref > 0 ? 
set_parameters[i]->param_ref - 10
: set_parameters[i]->param_ref;
3215
296
      assert(set_parameters[i]->param_ref != 0);
3216
296
      if (param_ref >= 0)
3217
0
      {
3218
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
3219
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
3220
0
        parameter_indices->rnum = old_rnum + 1;
3221
0
      }
3222
296
    }
3223
    // We may have duplicated indices, but that is OK, we will set it twice.
3224
5.24k
    
for (i = 0; 296
i < parameter_indices->rnum;
i++4.95k
)
3225
4.95k
    {
3226
4.95k
      const int d = *(int*)ccv_array_get(parameter_indices, i);
3227
4.95k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d))
3228
0
        flag = 1;
3229
4.95k
    }
3230
296
    ccv_array_free(parameter_indices);
3231
4.05k
  } else {
3232
19.1k
    for (i = 0; i < parameter_size; 
i++15.0k
)
3233
15.0k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i))
3234
65
        flag = 1;
3235
4.05k
    if (compiled_data->minimize.parameters)
3236
291
      if (_ccv_cnnp_apply_parameters_with_minimizer(model))
3237
0
        flag = 1;
3238
4.05k
  }
3239
4.34k
  if (flag)
3240
7
  {
3241
    // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up apply gradients graph.
3242
7
    if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE)
3243
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
3244
7
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3245
7
  }
3246
4.34k
}
3247
3248
void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params)
3249
0
{
3250
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3251
0
  assert(compiled_data);
3252
0
  compiled_data->compile_params = compile_params;
3253
0
}
3254
3255
void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size)
3256
48
{
3257
48
  if (model->graph && 
out_size > 047
)
3258
47
    ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]);
3259
48
  if (model->compiled_data && 
model->compiled_data->graph47
&&
out_size > 116
)
3260
0
    ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]);
3261
48
  if (model->compiled_data && 
model->compiled_data->backward.accum47
&&
out_size > 20
)
3262
0
    ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]);
3263
48
  if (model->compiled_data && 
model->compiled_data->apply_gradients.graph47
&&
out_size > 33
)
3264
0
    ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]);
3265
48
}
3266
3267
void ccv_cnnp_model_format(const ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_format_f format_fn, void* const context)
3268
0
{
3269
0
  if (model->graph)
3270
0
    ccv_nnc_symbolic_graph_format(model->graph, 0, 0, 0, 0, format_fn, context);
3271
0
}
3272
3273
static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
3274
2.30k
{
3275
2.30k
  int i;
3276
2.30k
  const int parameter_size = compiled_data->parameters->rnum;
3277
2.30k
  ccv_array_free(compiled_data->parameters);
3278
2.30k
  if (compiled_data->parameter_flags)
3279
10
    ccfree(compiled_data->parameter_flags);
3280
2.30k
  const int internal_size = compiled_data->internals->rnum;
3281
2.30k
  ccv_array_free(compiled_data->internals);
3282
2.30k
  assert(compiled_data->ids.parameters->rnum == parameter_size);
3283
2.30k
  assert(compiled_data->ids.internals->rnum == internal_size);
3284
5.26k
  
for (i = 0; 2.30k
i < parameter_size;
i++2.95k
)
3285
2.95k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i));
3286
2.30k
  ccv_array_free(compiled_data->ids.parameters);
3287
2.47k
  for (i = 0; i < internal_size; 
i++165
)
3288
165
    ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i));
3289
2.30k
  ccv_array_free(compiled_data->ids.internals);
3290
2.30k
  const int parallel_count = ccv_max(model->parallel_count, 1);
3291
2.30k
  if (compiled_data->tensors.parameters)
3292
99
  {
3293
803
    for (i = 0; i < parameter_size * parallel_count; 
i++704
)
3294
      // If it is not marked as not belonging, we can free it.
3295
704
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
3296
700
        if (compiled_data->tensors.parameters[i])
3297
698
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
3298
258
    for (i = 0; i < internal_size * parallel_count; 
i++159
)
3299
159
      if (compiled_data->tensors.internals[i])
3300
158
        ccv_nnc_tensor_free(compiled_data->tensors.internals[i]);
3301
99
    ccfree(compiled_data->tensors.parameters);
3302
99
  }
3303
2.30k
  if (compiled_data->tensors.gradients)
3304
36
  {
3305
366
    for (i = 0; i < parameter_size * parallel_count; 
i++330
)
3306
330
    {
3307
330
      if (compiled_data->tensors.gradients[i])
3308
328
        ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]);
3309
330
      if (compiled_data->tensors.accum_gradients[i])
3310
15
        ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]);
3311
330
    }
3312
36
    ccfree(compiled_data->tensors.gradients);
3313
36
  }
3314
2.30k
  if (compiled_data->minimize.parameters)
3315
5
  {
3316
15
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++10
)
3317
10
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
3318
5
    ccv_array_free(compiled_data->minimize.parameters);
3319
5
  }
3320
2.30k
  if (compiled_data->rewindables)
3321
49
    ccv_array_free(compiled_data->rewindables);
3322
2.30k
  if (compiled_data->tensors_init.v)
3323
99
    ccfree(CCV_NNC_INIT_V(compiled_data->tensors_init.v));
3324
2.30k
  if (compiled_data->evaluate.tos)
3325
2.30k
    ccfree(compiled_data->evaluate.tos);
3326
2.30k
  compiled_data->evaluate.tos = 0;
3327
2.30k
  if (compiled_data->stream_map)
3328
4
  {
3329
4
    khiter_t k;
3330
36
    for (k = 
kh_begin4
(compiled_data->stream_map); k != kh_end(compiled_data->stream_map);
++k32
)
3331
32
    {
3332
32
      if (!kh_exist(compiled_data->stream_map, k))
3333
16
        continue;
3334
16
      ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k);
3335
16
      ccv_nnc_stream_context_free(stream);
3336
16
    }
3337
4
    kh_destroy(stream_map, compiled_data->stream_map);
3338
4
  }
3339
2.30k
  _ccv_cnnp_compiled_data_graph_free(compiled_data);
3340
2.30k
  _ccv_cnnp_compiled_data_gradient_free(compiled_data);
3341
2.30k
  _ccv_cnnp_compiled_data_backward_free(compiled_data);
3342
2.30k
  _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3343
2.30k
  if (compiled_data->gradient_checkpoints)
3344
2
  {
3345
4
    for (i = 0; i < compiled_data->gradient_checkpoints->rnum; 
i++2
)
3346
2
    {
3347
2
      ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(compiled_data->gradient_checkpoints, i);
3348
2
      assert(checkpoint->inputs);
3349
2
      ccfree(checkpoint->inputs);
3350
2
      ccv_array_free(checkpoint->tensor_symbols);
3351
2
    }
3352
2
    ccv_array_free(compiled_data->gradient_checkpoints);
3353
2
  }
3354
2.30k
  ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc);
3355
2.30k
  ccfree(compiled_data);
3356
2.30k
}
3357
3358
void ccv_cnnp_model_free(ccv_cnnp_model_t* const model)
3359
5.45k
{
3360
5.45k
  ccv_cnnp_model_deinit(model);
3361
5.45k
  if (model->isa->dealloc)
3362
1.22k
    model->isa->dealloc(model);
3363
5.45k
  if (model->io)
3364
794
  {
3365
794
    int i;
3366
1.95k
    for (i = 0; i < model->io->rnum; 
i++1.15k
)
3367
1.15k
    {
3368
1.15k
      ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i);
3369
1.15k
      if (model_io->outgoings)
3370
650
        ccv_array_free(model_io->outgoings);
3371
1.15k
      if (model_io->incomings)
3372
591
        ccv_array_free(model_io->incomings);
3373
1.15k
      if (model_io->dependencies)
3374
2
        ccv_array_free(model_io->dependencies);
3375
1.15k
      ccfree(model_io);
3376
1.15k
    }
3377
794
    ccv_array_free(model->io);
3378
794
  }
3379
5.45k
  if (model->parameter_indices)
3380
2.52k
    ccv_array_free(model->parameter_indices);
3381
5.45k
  if (model->inputs)
3382
2.30k
    ccfree(model->inputs);
3383
5.45k
  if (model->graph)
3384
2.30k
    ccv_nnc_symbolic_graph_free(model->graph);
3385
5.45k
  if (model->compiled_data)
3386
2.30k
    _ccv_cnnp_compiled_data_free(model, model->compiled_data);
3387
5.45k
  if (model->name)
3388
220
    ccfree(model->name);
3389
5.45k
  ccfree(model);
3390
5.45k
}
3391
3392
void ccv_cnnp_model_cancel(ccv_cnnp_model_t* const model)
3393
0
{
3394
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3395
0
  if (!compiled_data)
3396
0
    return;
3397
0
  if (compiled_data->graph)
3398
0
    ccv_nnc_graph_cancel(compiled_data->graph);
3399
0
  if (compiled_data->apply_gradients.graph)
3400
0
    ccv_nnc_graph_cancel(compiled_data->apply_gradients.graph);
3401
0
}
3402
3403
void ccv_cnnp_model_set_flags(ccv_cnnp_model_t* const model, const int flags)
3404
0
{
3405
0
  model->exec_flags = flags;
3406
0
}
3407
3408
int ccv_cnnp_model_flags(ccv_cnnp_model_t* const model)
3409
0
{
3410
0
  return model->exec_flags;
3411
0
}