Coverage Report

Created: 2024-12-10 23:11

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_model.c
Line | Count | Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_cnnp_model.h"
6
#include "_ccv_nnc_graph.h"
7
8
// MARK - Level-5 API
9
10
ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size)
11
545
{
12
545
  if (!model->io)
13
536
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
14
545
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size);
15
545
  model_io->param_ref = 0;
16
545
  model_io->param_sel = 0;
17
545
  model_io->visit = 0;
18
545
  model_io->model = model;
19
545
  model_io->dependencies = 0;
20
545
  model_io->dependents = 0;
21
545
  model_io->outgoings = 0;
22
545
  model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1);
23
545
  ccv_array_push(model->io, &model_io);
24
545
  if (input_size > 0)
25
542
  {
26
542
    model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), input_size, 0);
27
542
    ccv_array_resize(model_io->incomings, input_size);
28
542
    int i;
29
542
    memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size);
30
1.22k
    for (i = 0; i < input_size; i++)  [i++: 680]
31
680
    {
32
680
      if (!inputs[i]->outgoings)
33
592
        inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
34
680
      ccv_array_push(inputs[i]->outgoings, &model_io);
35
680
    }
36
542
  } else {
37
3
    model_io->incomings = 0;
38
3
  }
39
545
  return model_io;
40
545
}
41
42
void ccv_cnnp_model_add_dependencies(ccv_cnnp_model_io_t model_io, const ccv_cnnp_model_io_t* const dependencies, const int dependency_size)
43
2
{
44
2
  assert(dependency_size > 0);
45
2
  if (!model_io->dependencies)
46
2
    model_io->dependencies = ccv_array_new(sizeof(ccv_cnnp_model_io_t), dependency_size, 0);
47
2
  int i, j;
48
5
  for (i = 0; i < dependency_size; i++)  [i++: 3]
49
3
  {
50
3
    int flag = 0;
51
    // Check whether it already exists.
52
4
    for (j = 0; !flag && j < model_io->dependencies->rnum; j++)  [j++: 1]
53
1
      if (*(ccv_cnnp_model_io_t*)ccv_array_get(model_io->dependencies, j) == dependencies[i])
54
0
        flag = 1;
55
3
    if (flag)
56
0
      continue;
57
3
    ccv_array_push(model_io->dependencies, dependencies + i);
58
3
    ++dependencies[i]->dependents;
59
3
  }
60
2
}
61
62
int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model)
63
0
{
64
0
  return model->output_size;
65
0
}
66
67
int ccv_cnnp_model_is_trainable(const ccv_cnnp_model_t* const model)
68
16
{
69
  // If the model is compiled, it is default to 1 unless it is not.
70
16
  if (model->compiled_data)
71
4
    return model->is_trainable >= 0 ? model->is_trainable : 1;  [1: 0]
72
12
  return model->is_trainable;
73
16
}
74
75
ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index)
76
389
{
77
389
  if (!model->io)
78
35
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
79
389
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s));
80
389
  model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS;  [index + 1: 37; ALL_PARAMETERS: 352]
81
389
  model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS;  [selector + 1: 308; ALL_PARAMETERS: 81]
82
389
  model_io->visit = 0;
83
389
  model_io->model = model;
84
389
  model_io->outputs = 0;
85
389
  model_io->dependencies = 0;
86
389
  model_io->dependents = 0;
87
389
  model_io->incomings = 0;
88
389
  model_io->outgoings = 0;
89
389
  ccv_array_push(model->io, &model_io);
90
389
  return model_io;
91
389
}
92
93
void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context)
94
3
{
95
3
  model->notify_hook.func = func;
96
3
  model->notify_hook.context = context;
97
3
}
98
99
void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload)
100
14
{
101
14
  if (model->notify_hook.func)
102
3
    model->notify_hook.func(model, tag, payload, model->notify_hook.context);
103
14
  if (model->isa->notify)
104
1
    model->isa->notify(model, tag, payload);
105
14
}
106
107
static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size)
108
2.24k
{
109
2.24k
  int i, j;
110
4.84k
  for (i = 0; i < graph_exec_symbol_size; i++)  [i++: 2.60k]
111
2.60k
  {
112
2.60k
    ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i;
113
    // Check whether this tensor symbol has any duplicate.
114
23.2k
    for (j = i + 1; j < graph_exec_symbol_size;)
115
20.6k
    {
116
20.6k
      ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j;
117
      // If there is a same tensor symbol, remove it.
118
20.6k
      if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph)  [other_symbol->graph == graph_exec_symbol->graph: 2.70k]
119
2.70k
      {
120
2.70k
        if (j + 1 < graph_exec_symbol_size)
121
436
          *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1];
122
2.70k
        --graph_exec_symbol_size;
123
2.70k
        continue;
124
2.70k
      }
125
17.9k
      ++j;
126
17.9k
    }
127
2.60k
  }
128
2.24k
  return graph_exec_symbol_size;
129
2.24k
}
130
131
void ccv_cnnp_model_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol, const int is_trainable)
132
3.14k
{
133
3.14k
  ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context;
134
3.14k
  ccv_cnnp_model_t* const model = add_to_array_context->sequence->model;
135
3.14k
  int i;
136
3.14k
  if (add_to_array_context->add_parameter_indices && !model->parameter_indices)  [!model->parameter_indices: 2.96k]
137
2.52k
    model->parameter_indices = ccv_array_new(sizeof(int), 0, 0);
138
37.1k
  for (i = 0; i < add_to_array_context->symbols->rnum; i++)  [i++: 33.9k]
139
33.9k
  {
140
33.9k
    const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i);
141
33.9k
    if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph)  [other_symbol.graph == symbol.graph: 24]
142
24
    {
143
      // Only add to parameter_indices if it is trainable.
144
24
      if (add_to_array_context->add_parameter_indices)
145
15
        ccv_array_add_unique_int(model->parameter_indices, i);
146
      // Found it, return, don't add it.
147
24
      return;
148
24
    }
149
33.9k
  }
150
  // Only add to parameter_indices if it is trainable.
151
3.12k
  if (add_to_array_context->add_parameter_indices)
152
2.94k
    ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum);
153
  // This is a new one, no need to add_unique_int, it is unique.
154
3.12k
  ccv_array_push(add_to_array_context->symbols, &symbol);
155
3.12k
  if (add_to_array_context->trainables)
156
2.96k
    ccv_array_push(add_to_array_context->trainables, &is_trainable);
157
3.12k
  char id[2048];
158
3.12k
  id[0] = add_to_array_context->prefix;
159
3.12k
  id[1] = '-';
160
3.12k
  int total_len = 2;
161
6.47k
  for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++)  [i++: 3.34k]
162
3.34k
  {
163
3.34k
    const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i);
164
3.34k
    int len;
165
3.34k
    if (name->name && name->name[0] != '\0')  [name->name[0] != '\0': 345]
166
345
      len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence);
167
3.00k
    else
168
3.00k
      len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence);
169
3.34k
    total_len += len;
170
3.34k
    if (total_len >= 2047)
171
0
      break;
172
3.34k
  }
173
3.12k
  if (total_len < 2047)
174
3.12k
    total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it);
175
3.12k
  assert(total_len < 2048);
176
3.12k
  char *heap_id = (char*)ccmalloc(total_len + 1);
177
3.12k
  memcpy(heap_id, id, total_len + 1);
178
3.12k
  ccv_array_push(add_to_array_context->ids, &heap_id);
179
3.12k
  ++add_to_array_context->sequence->it;
180
3.12k
}
181
182
static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size, ccv_array_t* const gradient_checkpoints)
183
2.29k
{
184
2.29k
  compiled_data->f = compiled_data->fits + output_size;
185
2.29k
  compiled_data->xpu_alloc.mp_hdr = -1;
186
2.29k
  compiled_data->xpu_alloc.freed = kh_init(dy_str);
187
2.29k
  compiled_data->xpu_alloc.allocd = kh_init(dy_alloc);
188
2.29k
  compiled_data->gradient_checkpoints = gradient_checkpoints;
189
2.29k
}
190
191
static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss)
192
2.29k
{
193
2.29k
  assert(model->graph);
194
2.29k
  model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size);
195
2.29k
  int i;
196
4.65k
  for (i = 0; i < input_size; i++)  [i++: 2.35k]
197
2.35k
    model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0);
198
2.29k
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
199
2.29k
  ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0);
200
2.29k
  ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0);
201
2.29k
  ccv_cnnp_model_sequence_t model_sequence = {
202
2.29k
    .bank = kh_init(ccv_cnnp_model_name_bank)
203
2.29k
  };
204
2.29k
  ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = {
205
2.29k
    .add_parameter_indices = 1,
206
2.29k
    .prefix = 't',
207
2.29k
    .sequence = &model_sequence,
208
2.29k
    .symbols = parameters,
209
2.29k
    .ids = parameter_ids,
210
2.29k
    .trainables = parameter_trainables,
211
2.29k
  };
212
2.29k
  ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
213
2.29k
  ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0);
214
2.29k
  ccv_cnnp_model_add_to_array_context_t add_to_output_context = {
215
2.29k
    .add_parameter_indices = 0,
216
2.29k
    .prefix = 'r',
217
2.29k
    .sequence = &model_sequence,
218
2.29k
    .symbols = internals,
219
2.29k
    .ids = internal_ids,
220
2.29k
    .trainables = 0,
221
2.29k
  };
222
2.29k
  ccv_cnnp_model_build_data_t build_data = {
223
2.29k
    .is_trainable = model->is_trainable >= 0 ? model->is_trainable : 1,  [model->is_trainable: 2.28k; 1: 4]
224
2.29k
    .model_sequence = &model_sequence,
225
2.29k
    .add_to_array = ccv_cnnp_model_add_to_array,
226
2.29k
    .parameters = parameters,
227
2.29k
    .context = {
228
2.29k
      .add_to_parameter = &add_to_parameter_context,
229
2.29k
      .add_to_output = &add_to_output_context,
230
2.29k
    },
231
2.29k
    .gradient_checkpoints = 0,
232
2.29k
  };
233
2.29k
  model->data = &build_data;
234
2.29k
  ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0);
235
4.59k
  for (i = 0; i < model->output_size; i++)  [i++: 2.30k]
236
2.30k
  {
237
2.30k
    const ccv_nnc_tensor_symbol_t output = model->outputs[i];
238
2.30k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, output);
239
2.30k
    if (alias_to.d == CCV_NNC_NO_TENSOR_SYMBOL)
240
1.30k
      continue;
241
    // If output is an alias, insert data transform regardless for result correctness (we cannot bind an alias). You can check ccv_nnc_tensor_bind_symbol method
242
    // to see that we can correctly bind a tensor which from it, has aliases, but we cannot bind an alias tensor correctly (this is expected, sort of, to be
243
    // honest, because we cannot handle cases of alias is part of the original tensor but bind differently).
244
1.00k
    const ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(model->graph, output);
245
1.00k
    model->outputs[i] = ccv_nnc_tensor_symbol_new(model->graph, output_params, 0);
246
1.00k
    ccv_nnc_graph_exec_symbol_t make_contiguous = ccv_nnc_graph_exec_symbol_new(model->graph, CMD_FORMAT_TRANSFORM_FORWARD(), &output, 1, model->outputs + i, 1, "contiguous");
247
1.00k
    ccv_nnc_graph_exec_symbol_set_flags(model->graph, make_contiguous, CCV_NNC_GRAPH_EXEC_DISABLE_OPT);
248
1.00k
  }
249
2.29k
  model->data = 0;
250
2.29k
  kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank);
251
2.29k
  if (model_sequence.sequences)
252
2.27k
    ccv_array_free(model_sequence.sequences);
253
  // Check if there are parameters that are not trainables. If there are, we will allocate uint64 bitmap to record that.
254
2.29k
  int not_trainables = 0;
255
  // Assert no parameter is alias.
256
5.24k
  for (i = 0; i < parameters->rnum; i++)  [i++: 2.94k]
257
2.94k
  {
258
2.94k
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i);
259
2.94k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter);
260
2.94k
    assert(alias_to.graph == 0); // Cannot find the one alias to.
261
2.94k
    if (*(int*)ccv_array_get(parameter_trainables, i) == 0)
262
11
      not_trainables = 1;
263
2.94k
  }
264
2.29k
  assert(parameters->rnum == parameter_trainables->rnum);
265
2.29k
  uint64_t* parameter_flags = 0;
266
2.29k
  if (not_trainables)
267
8
  {
268
8
    parameter_flags = (uint64_t*)cccalloc(((parameters->rnum + 63) >> 6), sizeof(uint64_t));
269
39
    for (i = 0; i < parameter_trainables->rnum; i++)  [i++: 31]
270
31
      if (*(int*)ccv_array_get(parameter_trainables, i))
271
20
        parameter_flags[i >> 6] |= ((uint64_t)1 << (i & 63));
272
8
  }
273
2.29k
  ccv_array_free(parameter_trainables);
274
  // Assert no internal is alias.
275
2.45k
  for (i = 0; i < internals->rnum; i++)  [i++: 161]
276
161
  {
277
161
    const ccv_nnc_tensor_symbol_t internal = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i);
278
161
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(internal.graph, internal);
279
161
    assert(alias_to.graph == 0); // Cannot find the one alias to.
280
161
  }
281
2.29k
  const int output_size = model->output_size;
282
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
283
2.29k
  const int parameters_rnum = parameters->rnum;
284
2.29k
  if (input_size > 0)
285
2.29k
  {
286
2.29k
    ccv_array_resize(parameters, parameters_rnum + input_size);
287
2.29k
    memcpy(ccv_array_get(parameters, parameters_rnum), model->inputs, input_size * sizeof(ccv_nnc_tensor_symbol_t));
288
2.29k
  }
289
2.29k
  ccv_nnc_symbolic_graph_simplify(model->graph,
290
2.29k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,
291
2.29k
      CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,
292
2.29k
      CCV_NNC_SIMPLIFY_OPS_FUSION,
293
2.29k
      CCV_NNC_SIMPLIFY_GRAPH_PRUNING),
294
2.29k
    ccv_array_get(parameters, 0), parameters_rnum + input_size,
295
2.29k
    model->outputs, output_size,
296
2.29k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
297
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
298
  // Size it down.
299
2.29k
  parameters->rnum = parameters_rnum;
300
2.29k
  ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1));
301
2.29k
  _ccv_cnnp_compiled_data_init(compiled_data, output_size, build_data.gradient_checkpoints);
302
2.29k
  const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph);
303
2.29k
  assert(evaluate_to_size > 0);
304
2.29k
  compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
305
2.29k
  memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
306
2.29k
  compiled_data->loss = loss;
307
2.29k
  if (loss.cmd == CCV_NNC_NOOP)
308
2.28k
  {
309
    // If no loss function provided, there is no fits.
310
4.57k
    for (i = 0; i < output_size; i++)  [i++: 2.29k]
311
2.29k
    {
312
2.29k
      compiled_data->fits[i] = NO_TENSOR_SYMBOL;
313
2.29k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]);
314
2.29k
      if (alias_to.d < 0)
315
2.29k
        compiled_data->f[i] = model->outputs[i];
316
0
      else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original.
317
0
        int ofs[CCV_NNC_MAX_DIM_ALLOC];
318
0
        int inc[CCV_NNC_MAX_DIM_ALLOC];
319
0
        ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc);
320
0
        int j;
321
0
        for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
322
0
          { assert(ofs[j] == 0); } // There is no ofs.
323
0
        compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet.
324
0
      }
325
2.29k
    }
326
2.28k
  } else {
327
20
    for (i = 0; i < output_size; i++)  [i++: 10]
328
10
    {
329
10
      const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]);
330
10
      const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0);
331
10
      compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0);
332
10
      ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0);
333
10
    }
334
10
  }
335
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
336
2.29k
  ccv_nnc_symbolic_graph_simplify(model->graph,
337
2.29k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function.
338
2.29k
    0, 0, // No need to provide binds at this point.
339
2.29k
    compiled_data->f, model->output_size,
340
2.29k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
341
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
342
  // If inputs are from GPU, stream type is GPU.
343
2.29k
  compiled_data->parameters = parameters;
344
2.29k
  compiled_data->parameter_flags = parameter_flags;
345
2.29k
  compiled_data->internals = internals;
346
2.29k
  compiled_data->ids.parameters = parameter_ids;
347
2.29k
  compiled_data->ids.internals = internal_ids;
348
2.29k
  ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(compiled_data, model->graph);
349
2.29k
}
350
351
static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
352
8.82k
{
353
8.82k
  ccv_array_t* const stack = (ccv_array_t*)context;
354
8.82k
  ccv_array_push(stack, &symbol.d);
355
8.82k
}
356
357
static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
358
38.5k
{
359
38.5k
  const ccv_nnc_tensor_symbol_t src_symbol = {
360
38.5k
    .d = src_index,
361
38.5k
    .graph = src_graph
362
38.5k
  };
363
38.5k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
364
38.5k
    .d = dest_index,
365
38.5k
    .graph = dest_graph
366
38.5k
  };
367
38.5k
  const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
368
38.5k
  ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params);
369
38.5k
  int ofs[CCV_NNC_MAX_DIM_ALLOC];
370
38.5k
  int inc[CCV_NNC_MAX_DIM_ALLOC];
371
38.5k
  if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc))
372
2.00k
    ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc);
373
38.5k
}
374
375
static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
376
2.41k
{
377
2.41k
  const ccv_nnc_tensor_symbol_t src_symbol = {
378
2.41k
    .d = src_index,
379
2.41k
    .graph = src_graph
380
2.41k
  };
381
2.41k
  const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
382
2.41k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
383
2.41k
    .d = dest_index,
384
2.41k
    .graph = dest_graph
385
2.41k
  };
386
2.41k
  const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol);
387
2.41k
  return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0;
388
2.41k
}
389
390
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size);
391
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data);
392
393
typedef struct {
394
  int parallel_count;
395
  ccv_nnc_symbolic_graph_t* graph;
396
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
397
} ccv_nnc_graph_exec_update_t;
398
399
static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint)
400
58
{
401
58
  ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context;
402
58
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena;
403
58
  ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
404
58
  ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd);
405
58
  ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint);
406
58
  const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph;
407
58
  const int parallel_count = graph_exec_update->parallel_count;
408
58
  int i;
409
178
  for (i = 1; i < parallel_count; i++)  [i++: 120]
410
120
  {
411
120
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i));
412
120
    if (!CCV_NO_GRAPH_EXEC(copy))
413
120
    {
414
120
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
415
120
      ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint);
416
120
    }
417
120
  }
418
58
}
419
420
void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size)
421
2.20k
{
422
2.20k
  assert(model->graph);
423
2.20k
  assert(model->compiled_data);
424
2.20k
  assert(!init->graph);
425
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
426
2.20k
  init->graph = ccv_nnc_symbolic_graph_new();
427
2.20k
  ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0);
428
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack, 0);
429
2.20k
  _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss);
430
2.20k
  init->parallel_count = model->parallel_count;
431
2.20k
  init->memory_compression = model->memory_compression;
432
2.20k
  init->memory_reduction = model->memory_reduction;
433
2.20k
  init->gradient_checkpointing = model->gradient_checkpointing;
434
2.20k
  init->compiled_data->stream_type = model->compiled_data->stream_type;
435
2.20k
  init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer;
436
2.20k
  init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size;
437
2.20k
  if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
438
2.20k
    _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0);
439
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0, 0);
440
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL);
441
2.20k
  int i, j;
442
  // Verify parameters, internals and saved_aux in both graph has the same dimensionality.
443
4.61k
  for (i = 0; i < compiled_data->parameters->rnum; i++)  [i++: 2.41k]
444
2.41k
  {
445
2.41k
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
446
2.41k
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
447
2.41k
  }
448
2.20k
  for (i = 0; i < compiled_data->internals->rnum; i++)  [i++: 0]
449
0
  {
450
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
451
0
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
452
0
  }
453
  // Update inputs.
454
2.20k
  assert(model->input_size == init->input_size);
455
4.40k
  for (i = 0; i < model->input_size; i++)  [i = 0: 2.20k; i++: 2.20k]
456
2.20k
    if (model->inputs[i].d >= 0)
457
2.20k
    {
458
2.20k
      assert(init->inputs[i].d >= 0);
459
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d);
460
2.20k
    }
461
  // Update outputs.
462
2.20k
  assert(model->output_size == init->output_size);
463
4.40k
  for (i = 0; i < model->output_size; i++)  [i = 0: 2.20k; i++: 2.20k]
464
2.20k
  {
465
2.20k
    if (model->outputs[i].d >= 0)
466
2.20k
    {
467
2.20k
      assert(init->outputs[i].d >= 0);
468
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d);
469
2.20k
    }
470
2.20k
    if (model->outputs[i].d != model->compiled_data->f[i].d)
471
0
    {
472
0
      assert(init->outputs[i].d != init->compiled_data->f[i].d);
473
0
      if (model->compiled_data->f[i].d >= 0)
474
0
      {
475
0
        assert(init->compiled_data->f[i].d >= 0);
476
0
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d);
477
0
      }
478
0
    }
479
2.20k
  }
480
  // Go through the graph to set tensor on matching symbols
481
11.0k
  for (i = 0; i < stack->rnum; i++)  [i = 0: 2.20k; i++: 8.82k]
482
8.82k
  {
483
8.82k
    const int d = *(int*)ccv_array_get(stack, i);
484
    // If exceed range, skip.
485
8.82k
    if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) ||
486
8.82k
      d >= ccv_nnc_graph_exec_symbol_count(model->graph))
487
0
      continue;
488
8.82k
    const ccv_nnc_graph_exec_symbol_t src_symbol = {
489
8.82k
      .d = d,
490
8.82k
      .graph = init->graph
491
8.82k
    };
492
8.82k
    const ccv_nnc_graph_exec_symbol_t dest_symbol = {
493
8.82k
      .d = d,
494
8.82k
      .graph = model->graph
495
8.82k
    };
496
8.82k
    const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol);
497
8.82k
    const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol);
498
    // If the name doesn't match, skip.
499
8.82k
    if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP)  [src_cmd.cmd != CCV_NNC_NOOP: 0]
500
0
      continue;
501
    // Now get all the inputs and outputs, if matches, set them.
502
8.82k
    const int* src_inputs;
503
8.82k
    int src_input_size;
504
8.82k
    const int* src_outputs;
505
8.82k
    int src_output_size;
506
8.82k
    ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size);
507
8.82k
    const int* dest_inputs;
508
8.82k
    int dest_input_size;
509
8.82k
    const int* dest_outputs;
510
8.82k
    int dest_output_size;
511
8.82k
    ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size);
512
    // We may have unmatched input / output size because this is the minimizer and it has
513
    // different saved_aux (for example, when we shrunk with CMD_NOOP).
514
8.82k
    if (src_input_size != dest_input_size)
515
0
      continue;
516
8.82k
    if (src_output_size != dest_output_size)
517
0
      continue;
518
8.82k
    ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd);
519
    // There may be mismatches of the source tensor symbols and destination tensor symbols. The reason is because
520
    // we may later passed-in the minimizer, therefore, we may allocate tensors for minimizer later in the original
521
    // graph whereas in the newly created graph, it is streamlined (the minimizer exists from the beginning). That
522
    // will make the order of tensor symbols creation different, therefore, exact which tensor is which wrong as
523
    // well. However, set a new minimizer won't change the exec symbol ordering, because we never create new exec
524
    // symbols after gradient init step. Changing a new minimizer just updated that exec symbols setting, it is not
525
    // a new exec symbol.
526
33.7k
    for (j = 0; j < src_input_size; j++)  [j++: 24.8k]
527
24.8k
      if (src_inputs[j] >= 0)
528
20.4k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]);
529
22.4k
    for (j = 0; j < src_output_size; j++)  [j++: 13.6k]
530
13.6k
      if (src_outputs[j] >= 0)
531
13.6k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]);
532
8.82k
  }
533
2.20k
  ccv_array_free(stack);
534
  // After this, we get all tensors in the model graph resolved through tensor_auto.
535
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL);
536
  // Verify symbols we get matches.
537
2.20k
  const int parameter_size = compiled_data->parameters->rnum;
538
4.61k
  for (i = 0; i < parameter_size; i++)  [i++: 2.41k]
539
2.41k
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); }
540
2.20k
  const int internal_size = compiled_data->internals->rnum;
541
2.20k
  for (i = 0; i < internal_size; i++)  [i++: 0]
542
0
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); }
543
  // Go through compiled data.
544
2.20k
  if (compiled_data->tensor_arena)
545
2.20k
  {
546
2.20k
    const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph);
547
2.20k
    if (flag == 0 && compiled_data->graph_exec_arena)
548
2.20k
    {
549
2.20k
      ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph);
550
      // Since we will reinit, if we previously set is_test, we need to set it again.
551
2.20k
      if (compiled_data->is_test)
552
1
      {
553
1
        const int parallel_count = ccv_max(model->parallel_count, 1);
554
1
        ccv_nnc_graph_exec_update_t update = {
555
1
          .parallel_count = parallel_count,
556
1
          .graph = model->graph,
557
1
          .graph_exec_arena = compiled_data->graph_exec_arena,
558
1
        };
559
1
        ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
560
1
      }
561
2.20k
    } else
562
      // Free-up tensor arena & graph exec arena.
563
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
564
2.20k
  }
565
  // There are other compiled graphs, for accum and apply gradients.
566
  // However, the main conclusion is, these absorb operations shouldn't impact parameters.
567
  // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we
568
  // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot
569
  // be changed otherwise parameters' shape will be meaningless. The same goes to internals.
570
  // That is why we don't update these compiled graphs at all this point.
571
  // Free the model, we've already "absorbed" it.
572
2.20k
  ccv_cnnp_model_free(init);
573
2.20k
}
574
575
void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss)
576
2.28k
{
577
2.28k
  assert(input_size == model->input_size || model->input_size == 0);
578
2.28k
  if (model->input_size == 0)
579
6
    model->input_size = input_size;
580
2.28k
  if (!model->graph) // The graph is not compiled yet.
581
87
  {
582
87
    model->graph = ccv_nnc_symbolic_graph_new();
583
87
    _ccv_cnnp_model_compile(model, inputs, input_size, loss);
584
87
    assert(model->compiled_data);
585
87
    int i, flag = 0;
586
217
    for (i = 0; !flag && i < input_size; i++)  [i < input_size: 197; i++: 130]
587
130
      flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY);
588
    // If inputs are from GPU, stream type is GPU.
589
87
    model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;  [CCV_STREAM_CONTEXT_GPU: 20; CCV_STREAM_CONTEXT_CPU: 67]
590
87
    model->compiled_data->minimize.minimizer = minimizer;
591
87
    model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
592
2.20k
  } else {
593
    // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model.
594
    // And then absorb the "new model" to the old one.
595
2.20k
    ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model, model->is_trainable);
596
2.20k
    ccv_cnnp_model_absorb(model, init, inputs, input_size);
597
    // Reset minimizer.
598
2.20k
    ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0);
599
2.20k
  }
600
2.28k
}
601
602
ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model, const int is_trainable)
603
2.20k
{
604
2.20k
  ccv_cnnp_model_t* const new_model = _ccv_cnnp_model_copy(model, 0);
605
2.20k
  new_model->is_trainable = is_trainable;
606
2.20k
  return new_model;
607
2.20k
}
608
609
void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size)
610
4.44k
{
611
4.44k
  assert(model->graph);
612
4.44k
  assert(output_size == model->output_size);
613
4.44k
  ccv_nnc_symbolic_graph_t* const graph = model->graph;
614
4.44k
  ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL);
615
4.44k
  int i;
616
8.89k
  for (i = 0; i < output_size; i++)  [i++: 4.44k]
617
4.44k
  {
618
4.44k
    assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL);
619
4.44k
    outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]);
620
4.44k
  }
621
4.44k
}
622
623
void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size)
624
3
{
625
3
  if (workspace_size == model->workspace_size)
626
0
    return;
627
3
  model->workspace_size = workspace_size;
628
3
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
629
3
  if (compiled_data && compiled_data->graph)
630
0
    ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL);
631
3
}
632
633
size_t ccv_cnnp_model_workspace_size(ccv_cnnp_model_t* const model)
634
0
{
635
0
  return model->workspace_size;
636
0
}
637
638
void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel)
639
15
{
640
15
  if (parallel == 0)
641
0
    model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
642
15
  else
643
15
    model->parallel_count = parallel;
644
15
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
645
15
  if (compiled_data)
646
11
    { assert(!compiled_data->graph); }
647
15
}
648
649
void ccv_cnnp_model_set_max_concurrency(ccv_cnnp_model_t* const model, const int max_stream_count)
650
0
{
651
0
  model->max_stream_count = max_stream_count;
652
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
653
0
  if (compiled_data)
654
0
    { assert(!compiled_data->graph); }
655
0
}
656
657
void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression)
658
0
{
659
0
  model->memory_compression = memory_compression;
660
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
661
0
  if (compiled_data)
662
0
    { assert(!compiled_data->graph); }
663
0
}
664
665
void ccv_cnnp_model_set_memory_reduction(ccv_cnnp_model_t* const model, const int memory_reduction)
666
0
{
667
0
  model->memory_reduction = memory_reduction;
668
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
669
0
  if (compiled_data)
670
0
    { assert(!compiled_data->graph); }
671
0
}
672
673
void ccv_cnnp_model_set_gradient_checkpointing(ccv_cnnp_model_t* const model, const int gradient_checkpointing)
674
2
{
675
2
  model->gradient_checkpointing = gradient_checkpointing;
676
2
}
677
678
int ccv_cnnp_model_gradient_checkpointing(ccv_cnnp_model_t* const model)
679
0
{
680
0
  return model->gradient_checkpointing;
681
0
}
682
683
typedef struct {
684
  int parallel_count;
685
  ccv_nnc_symbolic_graph_t* graph;
686
  ccv_cnnp_compiled_data_t* compiled_data;
687
  ccv_nnc_tensor_arena_t* tensor_arena;
688
} ccv_nnc_tensor_init_states_t;
689
690
static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data)
691
89
{
692
89
  int i;
693
89
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
694
165
  for (i = 0; i < compiled_data->parameters->rnum; i++)  [i++: 76]
695
112
  {
696
112
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
697
112
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
698
36
      return 1;
699
112
  }
700
53
  for (i = 0; i < compiled_data->internals->rnum; i++)  [i++: 0]
701
5
  {
702
5
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
703
5
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
704
5
      return 1;
705
5
  }
706
48
  return 0;
707
53
}
708
709
static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol)
710
329
{
711
329
  ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context;
712
329
  ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena;
713
329
  ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
714
329
  if (!output_tensor)
715
0
    return;
716
329
  const int d = output_symbol.d;
717
329
  assert(d < tensor_init_states->compiled_data->tensors_init.size);
718
329
  uint32_t* const init_v = CCV_NNC_INIT_V(tensor_init_states->compiled_data->tensors_init.v);
719
329
  if (init_v[d >> 5] & (1u << (d & 0x1f)))
720
29
    return;
721
300
  init_v[d >> 5] |= (1u << (d & 0x1f));
722
300
  ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0);  [1: 12; 0: 288]
723
300
  const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph;
724
300
  const int parallel_count = tensor_init_states->parallel_count;
725
300
  int i;
726
780
  for (i = 1; i < parallel_count; i++)  [i++: 480]
727
480
  {
728
480
    ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i));
729
480
    if (copy)
730
480
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, &copy, 1, 0);
731
480
  }
732
300
}
733
734
// This method can only handle cases we added new tensors and exec, never delete. This invariant is true because
735
// we setup everything (including calling simplify method) in ccv_cnnp_model_compile method, before this rewind setup.
736
static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model)
737
2
{
738
2
  assert(model->graph);
739
2
  assert(model->compiled_data);
740
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
741
2
  assert(compiled_data->rewindables);
742
2
  int i;
743
51
  for (i = 0; i < compiled_data->rewindables->rnum; i++)  [i++: 49]
744
49
  {
745
49
    const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i);
746
49
    if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC)
747
16
      ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec);
748
33
    else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR)
749
33
      ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor);
750
49
  }
751
2
  ccv_array_clear(compiled_data->rewindables);
752
2
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
753
2
}
754
755
static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name)
756
6.09k
{
757
6.09k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
758
6.09k
    .type = CCV_CNNP_REWIND_TENSOR,
759
6.09k
    .tensor = symbol
760
6.09k
  };
761
6.09k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
762
6.09k
  ccv_array_push(rewind_symbols, &rewind_symbol);
763
6.09k
}
764
765
static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name)
766
475
{
767
475
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
768
475
    .type = CCV_CNNP_REWIND_TENSOR,
769
475
    .tensor = symbol
770
475
  };
771
475
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
772
475
  ccv_array_push(rewind_symbols, &rewind_symbol);
773
475
}
774
775
static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
776
2.32k
{
777
2.32k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
778
2.32k
    .type = CCV_CNNP_REWIND_GRAPH_EXEC,
779
2.32k
    .graph_exec = symbol
780
2.32k
  };
781
2.32k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
782
2.32k
  ccv_array_push(rewind_symbols, &rewind_symbol);
783
2.32k
}
784
785
static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph)
786
35.0k
{
787
35.0k
  ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
788
35.0k
  if (!CCV_NO_GRAPH_EXEC(update_exec))
789
19.9k
    ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd);
790
35.0k
  int i;
791
49.9k
  for (i = 1; i < parallel_count; i++)  [i++: 14.8k]
792
14.8k
  {
793
14.8k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
794
14.8k
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol);
795
14.8k
    if (!CCV_NO_GRAPH_EXEC(copy))
796
14.6k
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
797
14.8k
  }
798
35.0k
}
799
800
static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd)
801
20.0k
{
802
20.0k
  assert(compiled_data);
803
20.0k
  assert(symbolic_graph);
804
20.0k
  ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd);
805
20.0k
  int i;
806
35.0k
  for (i = 1; i < parallel_count; i++)  [i++: 14.9k]
807
14.9k
  {
808
14.9k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
809
14.9k
    if (copy_symbol.graph)
810
14.8k
      ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd);
811
14.9k
  }
812
20.0k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena;
813
20.0k
  if (graph_exec_arena)
814
20.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
815
  // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph)
816
20.0k
  ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena;
817
20.0k
  if (gradient_graph_exec_arena)
818
15.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
819
20.0k
}
820
821
static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice)
822
20.0k
{
823
20.0k
  int this_parameter_flag = 0;
824
20.0k
  if (update_nodes[parameter_indice].d == CCV_NNC_NO_TENSOR_SYMBOL)
825
0
    return this_parameter_flag;
826
20.0k
  const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]);
827
20.0k
  int j, k;
828
  // For no-op, we can preserve previous saved_aux_size.
829
20.0k
  if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP)  [minimizer.cmd != CCV_NNC_NOOP: 71]
830
67
  {
831
    // If the old minimizer is a noop, then the old_saved_aux_size should be whatever its previous
832
    // saved_aux_size is, otherwise we will reinit the saved_aux repeatedly if you switch between
833
    // noop and a minimizer. We don't want that because we do that in high-level frameworks to
834
    // make sure some model parameters don't update if we don't want them to.
835
67
    int old_saved_aux_size;
836
67
    if (old_minimizer.cmd == CCV_NNC_NOOP)
837
67
    {
838
67
      int input_size;
839
67
      ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0);
840
67
      if (input_size < 2) // This is not legit.
841
0
        old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
842
67
      else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters.
843
67
        old_saved_aux_size = input_size - 2;
844
67
    } else
845
0
      old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
846
67
    if (old_saved_aux_size != saved_aux_size)
847
65
    {
848
65
      this_parameter_flag = 1;
849
65
      if (saved_aux_size > old_saved_aux_size)
850
65
      {
851
        // Allocate new tensor symbols.
852
65
        const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]);
853
189
        for (j = old_saved_aux_size; j < saved_aux_size; j++)  [j++: 124]
854
124
        {
855
124
          saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0);
856
124
          saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0);
857
124
          const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
858
460
          for (k = 1; k < parallel_count; k++)  [k++: 336]
859
336
          {
860
336
            ccv_nnc_tensor_param_t dev_info = info;
861
336
            if (k != device_id)
862
336
              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k);
863
0
            else
864
0
              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, 0);
865
336
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
866
336
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
867
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy);
868
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy);
869
336
          }
870
124
        }
871
65
      } else {
872
0
        for (j = saved_aux_size; j < old_saved_aux_size; j++)
873
0
        {
874
0
          for (k = 1; k < parallel_count; k++)
875
0
          {
876
0
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
877
0
            if (src_copy.d >= 0)
878
0
            {
879
0
              ccv_nnc_tensor_symbol_free(graph, src_copy);
880
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL);
881
0
            }
882
0
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
883
0
            if (dest_copy.d >= 0)
884
0
            {
885
0
              ccv_nnc_tensor_symbol_free(graph, dest_copy);
886
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL);
887
0
            }
888
0
          }
889
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source);
890
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination);
891
0
          saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
892
0
        }
893
0
      }
894
65
    }
895
67
  }
896
20.0k
  _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer);
897
20.0k
  if (this_parameter_flag)
898
65
  {
899
65
    ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2];
900
65
    ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1];
901
65
    const int* inputs = 0;
902
65
    int input_size = 0;
903
65
    ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0);
904
65
    assert(input_size >= 1);
905
65
    update_inputs[0].d = inputs[0];
906
65
    update_inputs[0].graph = graph;
907
65
    update_inputs[1].d = inputs[1];
908
65
    update_inputs[1].graph = graph;
909
65
    update_outputs[0] = updated_parameters[parameter_indice];
910
189
    for (j = 0; j < saved_aux_size; j++)  [j++: 124]
911
124
    {
912
124
      update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source;
913
124
      update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination;
914
124
    }
915
65
    ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
916
233
    for (k = 1; k < parallel_count; k++)  [k++: 168]
917
168
    {
918
168
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k);
919
168
      assert(copy.d >= 0);
920
168
      ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0);
921
168
      assert(input_size >= 1);
922
168
      update_inputs[0].d = inputs[0];
923
168
      update_inputs[0].graph = graph;
924
168
      update_inputs[1].d = inputs[1];
925
168
      update_inputs[1].graph = graph;
926
168
      update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k);
927
504
      for (j = 0; j < saved_aux_size; j++)  [j++: 336]
928
336
      {
929
336
        update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
930
336
        update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
931
336
      }
932
168
      ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
933
168
    }
934
65
  }
935
20.0k
  return this_parameter_flag;
936
20.0k
}
937
938
typedef struct {
939
  int parameter_size;
940
  ccv_nnc_cmd_t minimizer;
941
  ccv_cnnp_model_io_t parameters[1];
942
} ccv_cnnp_set_minimizer_for_parameter_t;
943
944
static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model)
945
296
{
946
296
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
947
296
  assert(compiled_data);
948
296
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
949
  // We update all parameters, at this point, we have one minimizer.
950
296
  const int parameter_size = compiled_data->parameters->rnum;
951
296
  ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes;
952
296
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
953
296
  assert(symbolic_graph);
954
296
  const int parallel_count = ccv_max(model->parallel_count, 1);
955
296
  ccv_array_t* const parameters = compiled_data->minimize.parameters;
956
296
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
957
296
  int i, j, flag = 0;
958
301
  for (i = 0; i < parameters->rnum; i++)  [i++: 5]
959
5
  {
960
5
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i);
961
10
    for (j = 0; j < set_minimizer_for_parameter->parameter_size; j++)  [j++: 5]
962
5
    {
963
5
      const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? set_minimizer_for_parameter->parameters[j]->param_sel - 1 : set_minimizer_for_parameter->parameters[j]->param_sel;  [param_sel - 1: 3; param_sel: 2]
964
5
      assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0);
965
5
      const int old_rnum = parameter_indices->rnum;
966
5
      ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices);
967
5
      const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? set_minimizer_for_parameter->parameters[j]->param_ref - 1 : set_minimizer_for_parameter->parameters[j]->param_ref;  [param_ref - 1: 0]
968
5
      assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0);
969
5
      if (param_ref >= 0)
970
0
      {
971
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
972
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
973
0
        parameter_indices->rnum = old_rnum + 1;
974
0
      }
975
5
    }
976
5
    const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer);
977
    // We may have duplicated indices, but that is OK, we will set it twice.
978
58
    for (j = 0; j < parameter_indices->rnum; j++)  [j++: 53]
979
53
    {
980
53
      const int d = *(int*)ccv_array_get(parameter_indices, j);
981
53
      assert(d <= parameter_size);
982
53
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d))
983
0
        flag = 1;
984
53
    }
985
5
    ccv_array_clear(parameter_indices);
986
5
  }
987
296
  ccv_array_free(parameter_indices);
988
296
  return flag;
989
296
}
990
991
static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size)
992
2.24k
{
993
2.24k
  if (new_saved_aux_size == old_saved_aux_size)
994
2.24k
    return;
995
2.24k
  assert(new_saved_aux_size > old_saved_aux_size);  [new_saved_aux_size > old_saved_aux_size: 7]
996
7
  int i, j;
997
72
  for (i = parameter_size - 1; i >= 0; i--)  [i--: 65]
998
65
  {
999
189
    for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; j--)  [j--: 124]
1000
124
      saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
1001
65
    for (j = old_saved_aux_size - 1; j >= 0; j--)  [j--: 0]
1002
0
      saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j];
1003
65
  }
1004
7
}
1005
1006
static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model)
1007
41
{
1008
41
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1009
41
  assert(compiled_data);
1010
41
  if (!compiled_data->rewindables)
1011
41
    compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0);
1012
41
  ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables, 0);
1013
41
  ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables, 0);
1014
41
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables, 0);
1015
41
}
1016
1017
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size)
1018
2.24k
{
1019
2.24k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1020
2.24k
  assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1021
2.24k
  assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1022
2.24k
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1023
2.24k
  assert(evaluate_to_size > 0);
1024
2.24k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1025
2.24k
  compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1026
2.24k
  compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count);
1027
2.24k
  int i, j;
1028
2.24k
  const int output_size = model->output_size;
1029
2.24k
  assert(!fits || fit_size == output_size * parallel_count);
1030
2.24k
  if (fits)
1031
12
    
for (i = 0; 6
i < output_size;
i++6
)
1032
6
      ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info);
1033
2.24k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
1034
2.24k
  const int parameter_size = compiled_data->parameters->rnum;
1035
2.24k
  compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size);
1036
2.24k
  compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
1037
2.24k
  compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
1038
2.24k
  int parameter_size_maybe_more = parameter_size;
1039
2.24k
  compiled_data->disable_outgrad = disable_outgrad;
1040
2.24k
  int outgrad_size;
1041
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1042
9
    outgrad_size = 0;
1043
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1044
2.22k
    outgrad_size = model->input_size;
1045
3
  else {
1046
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it were disable-all, the gradient mode wouldn't be this one.
1047
3
    outgrad_size = 0;
1048
10
    for (i = 0; i < model->input_size; 
i++7
)
1049
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1050
3
        ++outgrad_size;
1051
3
  }
1052
2.24k
  compiled_data->outgrad_size = outgrad_size;
1053
2.24k
  parameter_size_maybe_more += outgrad_size;
1054
2.24k
  compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count);
1055
2.24k
  compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? 
compiled_data->gradients + parameter_size2.23k
:
09
;
1056
2.24k
  compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more);
1057
2.24k
  compiled_data->backward.to_size = parameter_size_maybe_more;
1058
2.24k
  ccv_nnc_tensor_symbol_t* parameters = (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0);
1059
2.24k
  if (compiled_data->parameter_flags)
1060
4
  {
1061
4
    parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size);
1062
25
    for (i = 0; i < parameter_size; 
i++21
)
1063
21
      if (compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))
1064
14
        parameters[i] = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1065
7
      else
1066
7
        parameters[i] = NO_TENSOR_SYMBOL;
1067
4
  }
1068
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1069
9
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1070
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1071
2.22k
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1072
3
  else { // Compute minimize with gradients including selected inputs.
1073
3
    assert(model->input_size > 0);
1074
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it were disable-all, the gradient mode wouldn't be this one.
1075
3
    assert(outgrad_size > 0);
1076
3
    ccv_nnc_tensor_symbol_t outgrads[outgrad_size];
1077
3
    j = 0;
1078
10
    for (i = 0; i < model->input_size; 
i++7
)
1079
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1080
3
        outgrads[j++] = model->inputs[i];
1081
3
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1082
3
  }
1083
2.24k
  if (compiled_data->parameter_flags)
1084
4
    ccfree(parameters);
1085
2.24k
  _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size);
1086
2.24k
  if (compiled_data->minimize.parameters)
1087
5
    _ccv_cnnp_apply_parameters_with_minimizer(model);
1088
  // Go through gradient checkpoints to generate tensor inputs for backward pass just before executing the backward pass.
1089
2.24k
  ccv_cnnp_model_apply_gradient_checkpoints(compiled_data, model->graph);
1090
4.48k
  for (i = 0; i < output_size; 
i++2.24k
)
1091
2.24k
  {
1092
2.24k
    const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1093
    // Init this to 1 so we can backprop.
1094
2.24k
    ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES);
1095
2.24k
  }
1096
2.24k
  compiled_data->backward.to_size = 0;
1097
7.14k
  for (i = 0; i < parameter_size_maybe_more; 
i++4.90k
)
1098
4.90k
    if (compiled_data->gradients[i].d != CCV_NNC_NO_TENSOR_SYMBOL)
1099
4.90k
      compiled_data->backward.tos[compiled_data->backward.to_size++] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]);
1100
2.24k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS);
1101
2.24k
  ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size);
1102
4.48k
  for (i = 0; i < parameter_size_maybe_more - parameter_size; 
i++2.24k
)
1103
2.24k
  {
1104
2.24k
    if (compiled_data->outgrads[i].d < 0) // When we go through the inputs, we might find zero-length ones, and for these we cannot have any outgrads.
1105
0
      continue;
1106
2.24k
    const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]);
1107
2.24k
    const int* tos;
1108
2.24k
    int to_size;
1109
2.24k
    ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size);
1110
2.24k
    if (to_size == 0) // If this is the end (no minimizers afterwards). We need to attach this as a destination. Otherwise this is covered in update_nodes.
1111
9
    {
1112
9
      const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph);
1113
9
      const int destination_count = ccv_nnc_symbolic_graph_destination_size(model->graph);
1114
9
      int flag = 0;
1115
9
      const int outgrad_destination_start = ccv_max(0, destination_count - i);
1116
11
      for (j = i - 1; !flag && 
j >= 09
;
j--2
)
1117
2
        if (j + outgrad_destination_start < destination_count)
1118
2
          flag = (destinations[j + outgrad_destination_start].d == outgrad.d);
1119
9
      if (!flag) // Only if we cannot find it, we add it.
1120
7
        ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad);
1121
9
    }
1122
2.24k
  }
1123
2.24k
  if (parallel_count > 1)
1124
8
  {
1125
8
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1126
8
      0, 0,
1127
8
      compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */,
1128
8
      compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */,
1129
8
      0, 0, 0,
1130
8
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1131
8
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1132
8
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1133
16
    for (i = 0; i < evaluate_to_size; 
i++8
)
1134
32
      
for (j = 1; 8
j < parallel_count;
j++24
)
1135
24
      {
1136
24
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1137
24
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1138
24
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1139
24
      }
1140
8
    const int backward_to_size = compiled_data->backward.to_size;
1141
146
    for (i = 0; i < backward_to_size; 
i++138
)
1142
552
      
for (j = 1; 138
j < parallel_count;
j++414
)
1143
414
      {
1144
414
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j);
1145
414
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1146
414
          compiled_data->backward.tos[compiled_data->backward.to_size++] = copy;
1147
414
      }
1148
8
  }
1149
  // Only use memory compression if we are in gradient parameter mode.
1150
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS2.23k
)
1151
2.24k
  {
1152
2.24k
    if (model->memory_compression)
1153
0
      ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1154
2.24k
    if (model->memory_reduction)
1155
0
      ccv_nnc_symbolic_graph_memory_reduction(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1156
2.24k
  }
1157
2.24k
  compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size);
1158
2.24k
  compiled_data->gradient_mode = gradient_mode;
1159
2.24k
}
1160
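
_ccv_cnnp_model_gradient_init above decides which output gradients (outgrads) to request: none when only trainables are differentiated or the model has no inputs, all inputs when disable_outgrad is CCV_CNNP_DISABLE_OUTGRAD_NONE, and otherwise only the inputs whose bit in disable_outgrad is clear. A minimal standalone sketch of that bit test (illustrative only, not the library code):

#include <stdint.h>
#include <stdio.h>

/* Bit i of disable_outgrad set means input i does NOT get an output gradient. */
static int count_outgrads(uint64_t disable_outgrad, int input_size)
{
    int i, outgrad_size = 0;
    for (i = 0; i < input_size; i++)
        if (!(disable_outgrad & ((uint64_t)1 << i)))
            ++outgrad_size;
    return outgrad_size;
}

int main(void)
{
    /* 3 inputs, outgrad disabled for input 1 only (mask 0b010). */
    printf("%d\n", count_outgrads((uint64_t)1 << 1, 3)); /* prints 2 */
    return 0;
}
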
1161
void ccv_cnnp_model_tensors_init_0(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1162
84
{
1163
84
  assert(!compiled_data->tensors.parameters);
1164
84
  const int parameter_size = compiled_data->parameters->rnum;
1165
84
  const int parallel_count = ccv_max(model->parallel_count, 1);
1166
84
  const int internal_size = compiled_data->internals->rnum;
1167
84
  compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph);
1168
84
  compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t));
1169
84
  compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)cccalloc((parameter_size + internal_size) * parallel_count, sizeof(ccv_nnc_tensor_t*));
1170
84
  compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count;
1171
84
}
1172
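
ccv_cnnp_model_tensors_init_0 above sizes the tensors_init bit vector at one bit per tensor symbol, packed into uint32_t words, which is why it allocates (size + 31) >> 5 words; later code tests symbol d with init_v[d >> 5] & (1u << (d & 0x1f)). A minimal standalone sketch of that word/bit indexing (illustrative only, not the library code):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int size = 100; /* e.g. the graph's tensor symbol count */
    uint32_t* const v = (uint32_t*)calloc((size + 31) >> 5, sizeof(uint32_t));
    const int d = 70; /* some tensor symbol index */
    v[d >> 5] |= (1u << (d & 0x1f)); /* mark symbol 70 as initialized */
    printf("%d\n", !!(v[d >> 5] & (1u << (d & 0x1f)))); /* prints 1 */
    free(v);
    return 0;
}
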
1173
int ccv_cnnp_model_tensors_any_to_alloc(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1174
3
{
1175
3
  int i, j;
1176
3
  const int parameter_size = compiled_data->parameters->rnum;
1177
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
1178
3
  const int internal_size = compiled_data->internals->rnum;
1179
19
  for (i = 0; i < parameter_size; 
i++16
)
1180
16
  {
1181
    // Parameters have to be allocated all together.
1182
16
    if (compiled_data->tensors.parameters[i])
1183
16
    {
1184
16
      for (j = 1; j < parallel_count; 
j++0
)
1185
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1186
16
      continue;
1187
16
    }
1188
0
    return 1;
1189
16
  }
1190
3
  for (i = 0; i < internal_size; 
i++0
)
1191
0
  {
1192
0
    if (!compiled_data->tensors.internals[i])
1193
0
      return 1;
1194
0
    for (j = 1; j < parallel_count; j++)
1195
0
      if (!compiled_data->tensors.internals[i + j * internal_size])
1196
0
        return 1;
1197
0
  }
1198
3
  return 0;
1199
3
}
1200
1201
void ccv_cnnp_model_tensors_init_1(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1202
82
{
1203
82
  int i, j;
1204
82
  const int parameter_size = compiled_data->parameters->rnum;
1205
82
  const int parallel_count = ccv_max(model->parallel_count, 1);
1206
82
  const int internal_size = compiled_data->internals->rnum;
1207
364
  for (i = 0; i < parameter_size; 
i++282
)
1208
282
  {
1209
    // Parameters have to be allocated all together.
1210
282
    if (compiled_data->tensors.parameters[i])
1211
0
    {
1212
0
      for (j = 1; j < parallel_count; j++)
1213
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1214
0
      continue;
1215
0
    }
1216
282
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1217
282
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1218
282
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1219
101
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1220
282
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1221
282
    compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
1222
684
    for (j = 1; j < parallel_count; 
j++402
)
1223
402
    {
1224
402
      if (j != device_id)
1225
402
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1226
0
      else
1227
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1228
402
      compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1229
402
    }
1230
282
  }
1231
82
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1232
141
  for (i = 0; i < internal_size; 
i++59
)
1233
59
  {
1234
59
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i);
1235
59
    const int d = retained.d;
1236
59
    if (init_v[d >> 5] & (1u << (d & 0x1f)))
1237
0
      continue;
1238
59
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained);
1239
59
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1240
7
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1241
59
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1242
59
    if (!compiled_data->tensors.internals[i])
1243
59
      compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0);
1244
155
    for (j = 1; j < parallel_count; 
j++96
)
1245
96
    {
1246
96
      if (j != device_id)
1247
96
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1248
0
      else
1249
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1250
96
      if (!compiled_data->tensors.internals[i + j * internal_size])
1251
96
        compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0);
1252
96
    }
1253
59
  }
1254
82
  compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); // Remove 1 if any.
1255
82
}
1256
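
ccv_cnnp_model_tensors_init_1 above, and the bind/copy helpers that follow, keep per-parallel-unit copies in one flat array: the copy of tensor i for parallel unit j lives at index i + j * tensor_size. A minimal standalone sketch of that layout (illustrative only, not the library code):

#include <stdio.h>

int main(void)
{
    const int tensor_size = 4, parallel_count = 3;
    int unit_of[4 * 3];
    for (int j = 0; j < parallel_count; j++)
        for (int i = 0; i < tensor_size; i++)
            unit_of[i + j * tensor_size] = j; /* slot for tensor i on parallel unit j */
    /* tensor 1 on unit 0 and on unit 2: */
    printf("%d %d\n", unit_of[1 + 0 * tensor_size], unit_of[1 + 2 * tensor_size]); /* prints 0 2 */
    return 0;
}
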
1257
static void _ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1258
82
{
1259
82
  ccv_cnnp_model_tensors_init_0(model, compiled_data);
1260
82
  ccv_cnnp_model_tensors_init_1(model, compiled_data);
1261
82
}
1262
1263
static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1264
6
{
1265
6
  assert(parallel_count > 0);
1266
6
  int i, j;
1267
12
  for (i = 0; i < tensor_size; 
i++6
)
1268
6
  {
1269
6
    if (!tensors[i])
1270
0
      continue;
1271
6
    const int d = tensor_symbols[i].d;
1272
6
    if (!(tensors_init[d >> 5] & (1u << (d & 0x1f))))
1273
0
      continue;
1274
24
    
for (j = 1; 6
j < parallel_count;
j++18
)
1275
18
      if (tensors[i + j * tensor_size])
1276
18
      {
1277
18
        ccv_nnc_tensor_t* const input = CCV_NNC_TENSOR(tensors[i]);
1278
18
        ccv_nnc_tensor_t* const output = CCV_NNC_TENSOR(tensors[i + j * tensor_size]);
1279
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &input, 1, &output, 1, 0);
1280
18
      }
1281
6
  }
1282
6
}
1283
1284
static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count)
1285
89
{
1286
89
  assert(parallel_count > 0);
1287
89
  int i, j;
1288
148
  for (i = 0; i < tensor_size; 
i++59
)
1289
59
  {
1290
59
    const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1291
155
    for (j = 1; j < parallel_count; 
j++96
)
1292
96
    {
1293
96
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1294
96
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1295
96
      if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
1296
0
      { // We shouldn't have allocated this; free it up.
1297
0
        ccv_nnc_tensor_free(tensors[i + j * tensor_size]);
1298
0
        tensors[i + j * tensor_size] = 0;
1299
0
      }
1300
96
    }
1301
59
  }
1302
89
}
1303
1304
static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds)
1305
479
{
1306
479
  assert(parallel_count > 0);
1307
479
  int i, j;
1308
1.81k
  for (i = 0; i < tensor_size; 
i++1.33k
)
1309
1.33k
  {
1310
1.33k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1311
1.33k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1312
7
      continue;
1313
1.32k
    if (graph)
1314
1.32k
    {
1315
1.32k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1316
1.32k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1317
0
        tensor_symbol = alias_to;
1318
1.32k
    }
1319
1.32k
    ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(tensors[i]);
1320
1.32k
    if (tensor && tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL)
1321
1.32k
    {
1322
1.32k
      const ccv_nnc_tensor_bind_t retained_bind = {
1323
1.32k
        .symbol = tensor_symbol,
1324
1.32k
        .tensor = tensor
1325
1.32k
      };
1326
1.32k
      ccv_array_push(tensor_binds, &retained_bind);
1327
1.32k
    }
1328
2.87k
    for (j = 1; j < parallel_count; 
j++1.54k
)
1329
1.54k
    {
1330
1.54k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1331
1.54k
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1332
1.54k
      if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1333
1.54k
      {
1334
1.54k
        const ccv_nnc_tensor_bind_t bind = {
1335
1.54k
          .symbol = copy,
1336
1.54k
          .tensor = tensors[i + j * tensor_size]
1337
1.54k
        };
1338
1.54k
        ccv_array_push(tensor_binds, &bind);
1339
1.54k
      }
1340
1.54k
    }
1341
1.32k
  }
1342
479
}
1343
1344
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data)
1345
2.38k
{
1346
2.38k
  if (compiled_data->graph)
1347
89
    ccv_nnc_graph_free(compiled_data->graph);
1348
2.38k
  compiled_data->graph = 0;
1349
2.38k
  compiled_data->is_test = 0;
1350
2.38k
  if (compiled_data->tensor_arena)
1351
89
    ccv_nnc_tensor_arena_free(compiled_data->tensor_arena);
1352
2.38k
  compiled_data->tensor_arena = 0;
1353
2.38k
  if (compiled_data->graph_exec_arena)
1354
89
    ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena);
1355
2.38k
  compiled_data->graph_exec_arena = 0;
1356
2.38k
  if (compiled_data->backward.from_ops)
1357
29
    ccfree(compiled_data->backward.from_ops);
1358
2.38k
  compiled_data->backward.from_ops = 0;
1359
2.38k
  if (compiled_data->evaluate.schedule)
1360
34
    ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule);
1361
2.38k
  compiled_data->evaluate.schedule = 0;
1362
2.38k
  if (compiled_data->backward.schedule)
1363
25
    ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule);
1364
2.38k
  compiled_data->backward.schedule = 0;
1365
2.38k
}
1366
1367
static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data)
1368
2.29k
{
1369
2.29k
  if (compiled_data->gradients)
1370
2.24k
    ccfree(compiled_data->gradients);
1371
2.29k
  compiled_data->gradients = 0;
1372
2.29k
  if (compiled_data->updated_parameters)
1373
2.24k
    ccfree(compiled_data->updated_parameters);
1374
2.29k
  compiled_data->updated_parameters = 0;
1375
2.29k
  compiled_data->update_nodes = 0;
1376
2.29k
  compiled_data->saved_aux = 0;
1377
2.29k
}
1378
1379
static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data)
1380
2.33k
{
1381
2.33k
  if (compiled_data->backward.gradients)
1382
5
    ccfree(compiled_data->backward.gradients);
1383
2.33k
  compiled_data->backward.gradients = 0;
1384
2.33k
  if (compiled_data->backward.accum)
1385
5
    ccv_nnc_graph_free(compiled_data->backward.accum);
1386
2.33k
  compiled_data->backward.accum = 0;
1387
2.33k
  if (compiled_data->backward.tensor_arena)
1388
5
    ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena);
1389
2.33k
  compiled_data->backward.tensor_arena = 0;
1390
2.33k
  if (compiled_data->backward.graph_exec_arena)
1391
5
    ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena);
1392
2.33k
  compiled_data->backward.graph_exec_arena = 0;
1393
2.33k
}
1394
1395
static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data)
1396
2.30k
{
1397
2.30k
  if (compiled_data->apply_gradients.graph)
1398
21
    ccv_nnc_graph_free(compiled_data->apply_gradients.graph);
1399
2.30k
  compiled_data->apply_gradients.graph = 0;
1400
2.30k
  if (compiled_data->apply_gradients.tensor_arena)
1401
21
    ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena);
1402
2.30k
  compiled_data->apply_gradients.tensor_arena = 0;
1403
2.30k
  if (compiled_data->apply_gradients.graph_exec_arena)
1404
21
    ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena);
1405
2.30k
  compiled_data->apply_gradients.graph_exec_arena = 0;
1406
2.30k
}
1407
1408
// Compile the graph to run ccv_cnnp_model_fit
1409
static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1410
8
{
1411
8
  int i, j;
1412
8
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1413
8
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE);
1414
8
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE;
1415
8
  const int parallel_count = ccv_max(model->parallel_count, 1);
1416
8
  assert(output_size == model->output_size * parallel_count);
1417
8
  assert(!fits || output_size == fit_size);
1418
8
  assert(output_size > 0);
1419
8
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1420
8
  {
1421
8
    _ccv_cnnp_model_set_rewindables(model);
1422
8
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1423
8
  } else 
if (0
compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES0
) {
1424
0
    _ccv_cnnp_model_rewind_graph(model);
1425
0
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1426
0
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1427
0
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1428
0
  }
1429
8
  const int tensors_init = !!compiled_data->tensors_init.v;
1430
8
  if (!tensors_init)
1431
4
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1432
4
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1433
  // Check if it is not fully allocated; if it is not, run init_1.
1434
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1435
8
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1436
8
  assert((input_size % parallel_count) == 0);
1437
8
  assert((output_size % parallel_count) == 0);
1438
8
  assert((fit_size % parallel_count) == 0);
1439
8
  const int input_size_per_p = input_size / parallel_count;
1440
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1441
8
  const int output_size_per_p = output_size / parallel_count;
1442
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1443
8
  const int fit_size_per_p = fit_size / parallel_count;
1444
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds);
1445
8
  const int parameter_size = compiled_data->parameters->rnum;
1446
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1447
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1448
8
  const int internal_size = compiled_data->internals->rnum;
1449
8
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1450
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1451
8
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1452
8
  ccv_array_free(tensor_binds);
1453
8
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1454
8
  if (tensors_init && 
parallel_count > 14
)
1455
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1456
  // If tensor is not init'ed, we need to init states first.
1457
8
  if (_ccv_cnnp_any_to_init(compiled_data))
1458
7
  {
1459
7
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1460
7
      .parallel_count = parallel_count,
1461
7
      .graph = model->graph,
1462
7
      .compiled_data = compiled_data,
1463
7
      .tensor_arena = compiled_data->tensor_arena
1464
7
    };
1465
7
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1466
7
  }
1467
8
  compiled_data->is_test = 0;
1468
8
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer);
1469
  // No need to set it because it defaults to training mode.
1470
  // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1471
105
  for (i = 0; i < saved_aux_size * parameter_size; 
i++97
)
1472
97
  {
1473
97
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
1474
5
      continue;
1475
92
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source);
1476
92
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1477
296
    for (j = 1; j < parallel_count; 
j++204
)
1478
204
    {
1479
204
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1480
204
      if (copy)
1481
204
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1482
204
    }
1483
92
  }
1484
8
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1485
8
  compiled_data->evaluate.to_op_size = 0;
1486
22
  for (i = 0; i < evaluate_to_size; 
i++14
)
1487
14
  {
1488
14
    ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1489
14
    if (to.graph)
1490
14
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to;
1491
14
  }
1492
8
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1493
8
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1494
8
}
1495
1496
ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model)
1497
0
{
1498
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1499
0
  if (!compiled_data || !compiled_data->graph)
1500
0
    return 0;
1501
0
  return ccv_nnc_graph_default_stream(compiled_data->graph);
1502
0
}
1503
1504
uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model)
1505
0
{
1506
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1507
0
  if (!compiled_data || !compiled_data->tensor_arena)
1508
0
    return 0;
1509
0
  return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena);
1510
0
}
1511
1512
static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1513
38.8k
{
1514
38.8k
  int i, j;
1515
114k
  for (i = 0; i < tensor_size; 
i++75.5k
)
1516
75.5k
  {
1517
75.5k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1518
75.5k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1519
0
      continue;
1520
75.5k
    if (graph)
1521
72.5k
    {
1522
72.5k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1523
72.5k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1524
0
        tensor_symbol = alias_to;
1525
72.5k
    }
1526
75.5k
    ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]);
1527
77.3k
    for (j = 1; j < parallel_count; 
j++1.77k
)
1528
1.77k
    {
1529
1.77k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1530
1.77k
      if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1531
1.77k
        ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]);
1532
1.77k
    }
1533
75.5k
  }
1534
38.8k
}
1535
1536
void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1537
2.54k
{
1538
2.54k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1539
2.54k
  assert(compiled_data);
1540
2.54k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1541
2.54k
  assert(output_size == model->output_size * parallel_count);
1542
2.54k
  assert(input_size == model->input_size * parallel_count);
1543
2.54k
  assert(!fits || fit_size == output_size);
1544
2.54k
  assert(model->graph);
1545
2.54k
  if (!compiled_data->graph || 
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE2.53k
)
1546
8
  {
1547
8
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1548
8
    _ccv_cnnp_compiled_data_backward_free(compiled_data);
1549
8
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
1550
    // Compile the symbolic graph down only when needed.
1551
8
    _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size);
1552
2.53k
  } else {
1553
2.53k
    assert((input_size % parallel_count) == 0);
1554
2.53k
    assert((output_size % parallel_count) == 0);
1555
2.53k
    assert((fit_size % parallel_count) == 0);
1556
2.53k
    const int input_size_per_p = input_size / parallel_count;
1557
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1558
2.53k
    const int output_size_per_p = output_size / parallel_count;
1559
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1560
2.53k
    const int fit_size_per_p = fit_size / parallel_count;
1561
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count);
1562
2.53k
  }
1563
2.54k
  if (compiled_data->is_test)
1564
0
  {
1565
0
    compiled_data->is_test = 0;
1566
0
    ccv_nnc_graph_exec_update_t update = {
1567
0
      .parallel_count = parallel_count,
1568
0
      .graph = model->graph,
1569
0
      .graph_exec_arena = compiled_data->graph_exec_arena,
1570
0
    };
1571
0
    ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1572
0
  }
1573
2.54k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1574
2.54k
}
1575
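
ccv_cnnp_model_fit above runs one training step: it JIT-compiles the fit graph on first use (or when the graph mode changed) and otherwise just rebinds the input, fit and output tensors before running the scheduled graph. A hedged usage sketch, assuming a model that was already compiled elsewhere and takes one input, one fit and one output; TENSOR_LIST comes from ccv_nnc_easy.h, and the trailing NULL tape and stream select the simple blocking path:

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

static void train_one_step(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const x, ccv_nnc_tensor_t* const y, ccv_nnc_tensor_t* const out)
{
    /* inputs, fits, outputs, then tensor tape and stream context (both 0 here). */
    ccv_cnnp_model_fit(model, TENSOR_LIST(x), TENSOR_LIST(y), TENSOR_LIST(out), 0, 0);
}
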
1576
// Compile the graph to run ccv_cnnp_model_evaluate with require_grad = false (MULTISTAGE_MODE_NO_GRAD).
1577
static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1578
52
{
1579
52
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1580
52
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD;
1581
52
  const int parallel_count = ccv_max(model->parallel_count, 1);
1582
52
  assert(output_size == model->output_size * parallel_count);
1583
52
  assert(output_size > 0);
1584
  // If the gradient is not initialized, continue to set up the parallel process. We don't init the gradient here; rather,
1585
  // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel.
1586
52
  if (parallel_count > 1 && 
compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE6
)
1587
6
  {
1588
6
    const int evaluate_to_size = compiled_data->evaluate.to_size;
1589
6
    compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1590
6
    _ccv_cnnp_model_set_rewindables(model);
1591
6
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1592
6
      0, 0,
1593
6
      0, 0, 0,
1594
6
      0, 0, 0,
1595
6
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1596
6
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1597
6
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1598
6
    int i, j;
1599
12
    for (i = 0; i < evaluate_to_size; 
i++6
)
1600
24
      
for (j = 1; 6
j < parallel_count;
j++18
)
1601
18
      {
1602
18
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1603
18
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1604
18
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1605
18
      }
1606
6
  }
1607
52
  const int tensors_init = !!compiled_data->tensors_init.v;
1608
52
  if (!tensors_init)
1609
31
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1610
21
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1611
  // Check if it is not fully allocated; if it is not, run init_1.
1612
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1613
52
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1614
52
  assert((input_size % parallel_count) == 0);
1615
52
  assert((output_size % parallel_count) == 0);
1616
52
  const int input_size_per_p = input_size / parallel_count;
1617
52
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1618
52
  const int output_size_per_p = output_size / parallel_count;
1619
52
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1620
52
  const int parameter_size = compiled_data->parameters->rnum;
1621
52
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1622
52
  const int internal_size = compiled_data->internals->rnum;
1623
52
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1624
52
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1625
  // If we generated gradients for the graph, only compile part of it because the rest is irrelevant for evaluation.
1626
52
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1627
52
  ccv_array_free(tensor_binds);
1628
52
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1629
  // If tensor is not init'ed, we need to init states first.
1630
52
  if (tensors_init && 
parallel_count > 121
)
1631
6
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1632
52
  if (_ccv_cnnp_any_to_init(compiled_data))
1633
16
  {
1634
16
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1635
16
      .parallel_count = parallel_count,
1636
16
      .graph = model->graph,
1637
16
      .compiled_data = compiled_data,
1638
16
      .tensor_arena = compiled_data->tensor_arena
1639
16
    };
1640
16
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1641
16
  }
1642
52
  compiled_data->is_test = 1;
1643
52
  ccv_nnc_graph_exec_update_t update = {
1644
52
    .parallel_count = parallel_count,
1645
52
    .graph = model->graph,
1646
52
    .graph_exec_arena = compiled_data->graph_exec_arena,
1647
52
  };
1648
52
  ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
1649
52
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1650
52
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1651
52
}
1652
1653
static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1654
28
{
1655
28
  assert(!compiled_data->tensors.gradients);
1656
28
  const int parameter_size = compiled_data->parameters->rnum;
1657
28
  const int parallel_count = ccv_max(model->parallel_count, 1);
1658
28
  compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count);
1659
28
  compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count;
1660
28
  int i, j;
1661
175
  for (i = 0; i < parameter_size; 
i++147
)
1662
147
  {
1663
147
    if (compiled_data->parameter_flags && 
!(compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))6
)
1664
2
    {
1665
2
      compiled_data->tensors.gradients[i] = 0;
1666
2
      compiled_data->tensors.accum_gradients[i] = 0;
1667
2
      for (j = 1; j < parallel_count; 
j++0
)
1668
0
      {
1669
0
        compiled_data->tensors.gradients[i + j * parameter_size] = 0;
1670
0
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1671
0
      }
1672
2
      continue;
1673
2
    }
1674
145
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1675
145
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1676
145
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1677
38
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1678
145
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1679
145
    compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0);
1680
145
    compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until when we need it.
1681
325
    for (j = 1; j < parallel_count; 
j++180
)
1682
180
    {
1683
180
      if (j != device_id)
1684
180
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1685
0
      else
1686
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1687
180
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1688
180
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1689
180
    }
1690
145
  }
1691
28
}
1692
1693
static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
1694
7.99k
{
1695
7.99k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
1696
15
    return 1;
1697
7.97k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
1698
7.97k
    return 0;
1699
7
  int i;
1700
7
  for (i = 0; i < input_size; 
i++0
)
1701
7
    if (!(disable_outgrad & ((uint64_t)1 << i)))
1702
7
      return 0;
1703
0
  return 1;
1704
7
}
1705
1706
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1707
// Particularly, this method compiles the evaluation and backprop graph (the main graph).
1708
static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1709
29
{
1710
29
  int i, j;
1711
29
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1712
29
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES1
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS28
;
1713
29
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode);
1714
29
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE;
1715
29
  const int parallel_count = ccv_max(model->parallel_count, 1);
1716
29
  assert(output_size == model->output_size * parallel_count);
1717
29
  assert(output_size > 0);
1718
  // There shouldn't be a loss function if we evaluate with multistage jit.
1719
29
  assert(compiled_data->loss.cmd == CCV_NNC_NOOP);
1720
29
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1721
27
  {
1722
27
    _ccv_cnnp_model_set_rewindables(model);
1723
27
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1724
27
  } else 
if (2
compiled_data->gradient_mode != target_gradient_mode2
) {
1725
2
    _ccv_cnnp_model_rewind_graph(model);
1726
2
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1727
2
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1728
2
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1729
2
  }
1730
29
  const int tensors_init = !!compiled_data->tensors_init.v;
1731
29
  if (!tensors_init)
1732
21
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1733
8
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1734
  // Check if it is not fully allocated; if it is not, run init_1.
1735
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1736
29
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1737
29
  assert((input_size % parallel_count) == 0);
1738
29
  assert((output_size % parallel_count) == 0);
1739
29
  const int input_size_per_p = input_size / parallel_count;
1740
29
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1741
29
  const int output_size_per_p = output_size / parallel_count;
1742
29
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1743
29
  const int parameter_size = compiled_data->parameters->rnum;
1744
29
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1745
29
  const int internal_size = compiled_data->internals->rnum;
1746
29
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1747
29
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1748
29
  if (!compiled_data->tensors.gradients)
1749
28
    _ccv_cnnp_model_gradient_tensors_init(model, compiled_data);
1750
29
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1751
29
  if (compiled_data->backward.to_size > 0)
1752
29
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1753
0
  else
1754
0
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1755
29
  ccv_array_free(tensor_binds);
1756
29
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1757
29
  if (tensors_init && 
parallel_count > 18
)
1758
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1759
  // If tensor is not init'ed, we need to init states first.
1760
29
  if (_ccv_cnnp_any_to_init(compiled_data))
1761
18
  {
1762
18
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1763
18
      .parallel_count = parallel_count,
1764
18
      .graph = model->graph,
1765
18
      .compiled_data = compiled_data,
1766
18
      .tensor_arena = compiled_data->tensor_arena
1767
18
    };
1768
18
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1769
18
  }
1770
29
  compiled_data->is_test = is_test;
1771
29
  ccv_nnc_graph_exec_update_t update = {
1772
29
    .parallel_count = parallel_count,
1773
29
    .graph = model->graph,
1774
29
    .graph_exec_arena = compiled_data->graph_exec_arena,
1775
29
  };
1776
29
  ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1777
29
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1778
29
  compiled_data->evaluate.to_op_size = 0;
1779
29
  ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0);
1780
76
  for (i = 0; i < evaluate_to_size; 
i++47
)
1781
47
  {
1782
47
    ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1783
47
    if (to_op.graph)
1784
47
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op;
1785
47
    const int* tos;
1786
47
    int to_size;
1787
47
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size);
1788
94
    for (j = 0; j < to_size; 
j++47
)
1789
47
    {
1790
47
      ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1791
47
        .d = tos[j],
1792
47
        .graph = model->graph
1793
47
      });
1794
47
      if (to_op.graph)
1795
47
        ccv_array_add_unique_int(backward_from, to_op.d);
1796
47
    }
1797
47
  }
1798
29
  assert(backward_from->rnum > 0);
1799
29
  compiled_data->backward.from_op_size = backward_from->rnum;
1800
29
  compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1801
76
  for (i = 0; i < backward_from->rnum; 
i++47
)
1802
47
    compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1803
47
      .d = *(int*)ccv_array_get(backward_from, i),
1804
47
      .graph = compiled_data->graph,
1805
47
    };
1806
  // If there are any set nodes (to set some tensors to 0) inserted through the backward pass, these won't be executed if we just do sources -> evaluate.to_ops, backward.from_ops -> destinations. We need this logic to find these nodes and explicitly add them to backward.from_ops.
1807
29
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(compiled_data->graph->exec_info, 0);
1808
29
  const int exec_info_size = compiled_data->graph->exec_info->rnum;
1809
29
  uint32_t* const visited = cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t));
1810
29
  const ccv_nnc_graph_exec_t* const sources = (ccv_nnc_graph_exec_t*)ccv_array_get(compiled_data->graph->sources, 0);
1811
29
  const int source_size = compiled_data->graph->sources->rnum;
1812
58
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new29
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size, 0);
1813
600
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1814
600
    visited[(idx >> 5)] |= (1u << (idx & 31));
1815
600
  } ccv_nnc_graph_visit_endfor
1816
58
  ccv_nnc_graph_visit_free(visit);
1817
58
  const ccv_nnc_graph_exec_t* const destinations = (ccv_nnc_graph_exec_t*)
ccv_array_get29
(compiled_data->graph->destinations, 0);
1818
58
  const int destination_size = compiled_data->graph->destinations->rnum;
1819
58
  visit = 
ccv_nnc_graph_visit_new29
(compiled_data->graph, exec_info, exec_info_size, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, destinations, destination_size, 0);
1820
654
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1821
654
    visited[(idx >> 5)] |= (1u << (idx & 31));
1822
654
  } ccv_nnc_graph_visit_endfor
1823
58
  ccv_nnc_graph_visit_free(visit);
1824
58
  visit = 
ccv_nnc_graph_visit_new29
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
1825
  // Find any missing nodes to be added as source. Right now, these are only set nodes.
1826
1.30k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1827
1.30k
    if (!(visited[(idx >> 5)] & (1u << (idx & 31))))
1828
47
    {
1829
47
      assert(exec_info[idx].cmd.cmd == CCV_NNC_SET_FORWARD);
1830
47
      if (exec_info[idx].cmd.info.blas.a[0] == 0) // Special-case the set commands that zero out a tensor, not the ones that set the gradient to 1.
1831
0
        ccv_array_add_unique_int(backward_from, idx);
1832
47
    }
1833
1.30k
  } ccv_nnc_graph_visit_endfor
1834
29
  ccv_nnc_graph_visit_free(visit);
1835
29
  ccfree(visited);
1836
29
  if (backward_from->rnum != compiled_data->backward.from_op_size) // If it doesn't match, need to redo this.
1837
0
  {
1838
0
    compiled_data->backward.from_op_size = backward_from->rnum;
1839
0
    compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccrealloc(compiled_data->backward.from_ops, sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1840
0
    for (i = 0; i < backward_from->rnum; i++)
1841
0
      compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1842
0
        .d = *(int*)ccv_array_get(backward_from, i),
1843
0
        .graph = compiled_data->graph,
1844
0
      };
1845
0
  }
1846
29
  ccv_array_free(backward_from);
1847
29
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1848
29
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1849
29
}
1850
1851
void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1852
7.96k
{
1853
7.96k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1854
7.96k
  assert(compiled_data);
1855
7.96k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1856
7.96k
  assert(output_size == model->output_size * parallel_count);
1857
7.96k
  assert(input_size == model->input_size * parallel_count);
1858
7.96k
  assert(model->graph);
1859
7.96k
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES14
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS7.95k
;
1860
7.96k
  const int mode_mismatch = (params.requires_grad && 
(7.82k
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE7.82k
||
compiled_data->gradient_mode != target_gradient_mode7.79k
||
compiled_data->disable_outgrad != params.disable_outgrad7.79k
));
1861
7.96k
  if (!compiled_data->graph || 
mode_mismatch7.88k
)
1862
81
  {
1863
81
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1864
81
    if (mode_mismatch) // If the mode mismatches, we need to redo the backward pass as well (no need to redo apply_gradients; it doesn't depend on target_gradient_mode or disable_outgrad).
1865
29
      _ccv_cnnp_compiled_data_backward_free(compiled_data);
1866
81
    if (params.requires_grad)
1867
29
      _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size);
1868
52
    else
1869
52
      _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size);
1870
7.88k
  } else {
1871
7.88k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena);
1872
7.88k
    assert((input_size % parallel_count) == 0);
1873
7.88k
    const int input_size_per_p = input_size / parallel_count;
1874
7.88k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1875
7.88k
    assert((output_size % parallel_count) == 0);
1876
7.88k
    const int output_size_per_p = output_size / parallel_count;
1877
7.88k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1878
7.88k
  }
1879
7.96k
  if (compiled_data->is_test != params.is_test)
1880
57
  {
1881
57
    compiled_data->is_test = params.is_test;
1882
57
    ccv_nnc_graph_exec_update_t update = {
1883
57
      .parallel_count = parallel_count,
1884
57
      .graph = model->graph,
1885
57
      .graph_exec_arena = compiled_data->graph_exec_arena,
1886
57
    };
1887
57
    ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1888
57
  }
1889
7.96k
}
1890
1891
void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1892
7.96k
{
1893
7.96k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1894
7.96k
  assert(compiled_data);
1895
7.96k
  ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size);
1896
7.96k
  if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
1897
65
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1898
7.90k
  else {
1899
7.90k
    if (!compiled_data->evaluate.schedule)
1900
34
      compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size);
1901
7.90k
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context);
1902
7.90k
  }
1903
7.96k
}
1904
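The two functions above are the forward half of the level-5 training API: ccv_cnnp_model_dry_run binds the input and output tensors and (re)compiles the graph when the mode changes, and ccv_cnnp_model_evaluate then runs it with either the no-grad or the multistage schedule. A minimal inference-only sketch, assuming the model was compiled elsewhere and that x and y are hypothetical tensors shaped for a single-input, single-output model:

  ccv_cnnp_evaluate_param_t params = {
    .requires_grad = 0, // take the MULTISTAGE_MODE_NO_GRAD path above
    .is_test = 1,       // inference behavior for layers that check is_test
    .disable_outgrad = 0,
  };
  // TENSOR_LIST expands to an array plus its size, so this passes one input and one output.
  ccv_cnnp_model_evaluate(model, params, TENSOR_LIST(x), TENSOR_LIST(y), 0, 0);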
1905
// Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1906
// Specifically, this method compiles the accumulator graph.
1907
static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model)
1908
5
{
1909
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1910
5
  assert(compiled_data);
1911
5
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1912
5
  ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new();
1913
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1914
5
  const int parameter_size = compiled_data->parameters->rnum;
1915
5
  int i, j;
1916
5
  compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3);
1917
5
  compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count;
1918
5
  compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count;
1919
20
  for (i = 0; i < parameter_size; 
i++15
)
1920
30
    
for (j = 0; 15
j < parallel_count;
j++15
)
1921
15
      if (compiled_data->tensors.gradients[i + j * parameter_size])
1922
15
      {
1923
15
        const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info;
1924
        // Now the old gradient becomes the accumulated gradient; set up new gradient tensors so we can collect the incoming ones.
1925
15
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size];
1926
15
        compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1927
15
        ccv_nnc_tensor_symbol_t inputs[2];
1928
15
        inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1929
15
        inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1930
15
        ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1931
15
        ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
1932
15
      } else {
1933
0
        compiled_data->backward.accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1934
0
        compiled_data->backward.gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1935
0
        compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1936
0
      }
1937
5
  ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1938
5
  if (ccv_nnc_symbolic_graph_source_size(accum) == 0)
1939
0
  {
1940
0
    ccv_nnc_symbolic_graph_free(accum);
1941
    // Create empty graph.
1942
0
    compiled_data->backward.accum = ccv_nnc_graph_new();
1943
0
    ccv_nnc_graph_topsort(compiled_data->backward.accum, 0, 0);
1944
0
    return;
1945
0
  }
1946
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1947
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1948
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds);
1949
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1950
5
  ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena);
1951
5
  ccv_nnc_symbolic_graph_free(accum);
1952
5
  ccv_array_free(tensor_binds);
1953
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type, model->max_stream_count);
1954
5
}
1955
1956
void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1957
7.88k
{
1958
7.88k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1959
7.88k
  assert(compiled_data);
1960
7.88k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1961
7.88k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1962
7.88k
  assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count);
1963
7.88k
  if (outgrad_size > 0)
1964
2.51k
    { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); }
1965
7.88k
  assert(model->graph);
1966
7.88k
  assert(compiled_data->graph);
1967
7.88k
  const int parameter_size = compiled_data->parameters->rnum;
1968
  // If we need to accumulate the gradients now, do jit on accumulator.
1969
7.88k
  if (compiled_data->backward.count > 0)
1970
1.71k
  {
1971
1.71k
    if (!compiled_data->backward.accum)
1972
5
      _ccv_cnnp_model_multistage_jit_1(model);
1973
1.71k
    else if (compiled_data->backward.count == 1) {
1974
      //  On this round, we need to switch accumulated gradients with gradients (so we can do accumulation properly).
1975
496
      int i;
1976
1.48k
      for (i = 0; i < parameter_size * parallel_count; 
i++986
)
1977
986
      {
1978
986
        ccv_nnc_tensor_t* tensor;
1979
986
        CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor);
1980
986
      }
1981
496
      if (compiled_data->backward.tensor_arena)
1982
496
      {
1983
496
        ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena);
1984
        // Rebind in case we messed up the binding (we switched accum_gradients and gradients).
1985
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1);
1986
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
1987
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
1988
496
      }
1989
496
    }
1990
1.71k
  }
1991
7.88k
  const int ingrad_size_per_p = model->output_size;
1992
7.88k
  const int outgrad_size_per_p = compiled_data->outgrad_size;
1993
7.88k
  int i, j;
1994
15.7k
  for (i = 0; i < ingrad_size_per_p; 
i++7.88k
)
1995
7.88k
  {
1996
7.88k
    const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1997
7.88k
    if (!ingrad_size || 
!ingrads3.79k
||
ingrads[i] == 03.79k
)
1998
4.19k
    {
1999
      // Set it to 1 if it is not specified.
2000
4.19k
      ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad);
2001
4.19k
      if (ingrad_tensor)
2002
4.19k
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2003
4.31k
      for (j = 1; j < parallel_count; 
j++120
)
2004
120
      {
2005
120
        ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j));
2006
120
        if (ingrad_tensor)
2007
120
          ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2008
120
      }
2009
4.19k
    } else {
2010
      // Make sure the length matches, in case it is an alias.
2011
3.69k
      assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad)));
2012
3.69k
      ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]);
2013
3.69k
      for (j = 1; j < parallel_count; 
j++6
)
2014
6
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]);
2015
3.69k
    }
2016
7.88k
  }
2017
7.88k
  if (outgrad_size > 0)
2018
2.51k
  {
2019
2.51k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad");
2020
5.14k
    
for (i = 0; 2.51k
i < outgrad_size_per_p;
i++2.62k
)
2021
2.62k
      if (outgrads[i])
2022
2.43k
      {
2023
2.43k
        const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i];
2024
2.43k
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]);
2025
2.43k
        for (j = 1; j < parallel_count; 
j++6
)
2026
6
          ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]);
2027
2.43k
      }
2028
5.37k
  } else {
2029
5.37k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES ||
2030
5.37k
      compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS);
2031
5.37k
  }
2032
  // We need to rebind here because in ccv_cnnp_model_evaluate we clear bindings, which resets all bindings for the gradients.
2033
  // For parameters and internals this is fine: when we clear bindings, they restore to the original bindings, which are those
2034
  // parameters and internals. The same cannot be said for gradients because of the accum_gradients switching.
2035
7.88k
  _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2036
7.88k
  if (!compiled_data->backward.schedule)
2037
25
    compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0);
2038
  // Run the backward pass.
2039
7.88k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context);
2040
  // If we need to run the accumulation round, do that now.
2041
7.88k
  if (compiled_data->backward.count > 0)
2042
1.71k
    ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context);
2043
  // Update the count; this determines whether we need to accumulate or not.
2044
7.88k
  ++compiled_data->backward.count;
2045
7.88k
}
2046
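ccv_cnnp_model_backward above also accepts ingrads and outgrads: passing ingrads as 0 makes the loop above seed the loss gradient with CMD_SET_FORWARD(1), while passing outgrads requests gradients with respect to the model inputs, which per the assert requires that evaluate ran with requires_grad and without disable_outgrad. A hedged sketch for a single-input model, with model, x and y assumed to be set up as in the previous example:

  ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
    .requires_grad = 1, // multistage mode; gradients for trainables and inputs
  }, TENSOR_LIST(x), TENSOR_LIST(y), 0, 0);
  // Allocate a tensor of the same shape as the input to receive d(loss)/d(x).
  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, x->info, 0);
  ccv_cnnp_model_backward(model, 0, 0, TENSOR_LIST(dx), 0, 0);
  ccv_cnnp_model_apply_gradients(model, 0);
  ccv_nnc_tensor_free(dx);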
2047
// Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE).
2048
// Specifically, this method compiles the parameter update graph.
2049
static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model)
2050
21
{
2051
21
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2052
21
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2053
21
  const int parallel_count = ccv_max(model->parallel_count, 1);
2054
21
  const int parameter_size = compiled_data->parameters->rnum;
2055
21
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
2056
21
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2057
21
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2058
  // Bind accumulated gradients.
2059
21
  if (compiled_data->backward.count > 1)
2060
4
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds);
2061
17
  else
2062
17
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
2063
21
  ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0);
2064
21
  int i, j;
2065
247
  for (i = 0; i < compiled_data->backward.to_size; 
i++226
)
2066
226
  {
2067
226
    const int* tos;
2068
226
    int to_size;
2069
226
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size);
2070
726
    for (j = 0; j < to_size; 
j++500
)
2071
500
    {
2072
      // Check if this already shows up in the backward graph; if that is the case, it won't be in the apply
2073
      // gradients graph.
2074
500
      const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
2075
500
        .d = tos[j],
2076
500
        .graph = model->graph,
2077
500
      });
2078
500
      if (!exec.graph)
2079
313
        ccv_array_add_unique_int(apply_gradients_from, tos[j]);
2080
500
    }
2081
226
  }
2082
21
  const int from_size = apply_gradients_from->rnum;
2083
21
  if (from_size == 0)
2084
0
  {
2085
0
    ccv_array_free(apply_gradients_from);
2086
0
    ccv_array_free(tensor_binds);
2087
0
    return;
2088
0
  }
2089
21
  ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size);
2090
154
  for (i = 0; i < from_size; 
i++133
)
2091
133
    froms[i] = (ccv_nnc_graph_exec_symbol_t){
2092
133
      .d = *(int*)ccv_array_get(apply_gradients_from, i),
2093
133
      .graph = model->graph
2094
133
    };
2095
21
  ccv_array_free(apply_gradients_from);
2096
  // It can only end with updates on the parameters.
2097
21
  ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0);
2098
154
  for (i = 0;  i < parameter_size; 
i++133
)
2099
133
  {
2100
133
    if (compiled_data->update_nodes[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
2101
0
      continue;
2102
133
    ccv_array_push(tos, &compiled_data->update_nodes[i]);
2103
313
    for (j = 1; j < parallel_count; 
j++180
)
2104
180
    {
2105
180
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j);
2106
180
      ccv_array_push(tos, &copy);
2107
180
    }
2108
133
  }
2109
21
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena);
2110
21
  ccv_array_free(tos);
2111
21
  ccv_array_free(tensor_binds);
2112
21
  ccfree(froms);
2113
21
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2114
213
  for (i = 0; i < max_saved_aux_size * parameter_size; 
i++192
)
2115
192
  {
2116
    // Skip on no tensor.
2117
192
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
2118
0
      continue;
2119
192
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source);
2120
192
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
2121
540
    for (j = 1; j < parallel_count; 
j++348
)
2122
348
    {
2123
348
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
2124
348
      if (copy)
2125
348
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
2126
348
    }
2127
192
  }
2128
21
  ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type, model->max_stream_count);
2129
21
}
2130
2131
void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context)
2132
7.81k
{
2133
7.81k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2134
7.81k
  assert(compiled_data);
2135
7.81k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2136
7.81k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2137
7.81k
  assert(model->graph);
2138
7.81k
  assert(compiled_data->graph);
2139
  // Skip if there is no backward pass.
2140
7.81k
  if (compiled_data->backward.count <= 0)
2141
1.65k
    return;
2142
  // Skip if there are no parameters.
2143
6.16k
  if (compiled_data->parameters->rnum == 0)
2144
3
  {
2145
3
    compiled_data->backward.count = 0;
2146
3
    return;
2147
3
  }
2148
6.16k
  if (!compiled_data->apply_gradients.graph)
2149
21
    _ccv_cnnp_model_multistage_jit_2(model);
2150
6.14k
  else {
2151
6.14k
    const int parameter_size = compiled_data->parameters->rnum;
2152
6.14k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena);
2153
    // Change to bind accum_gradients if we do gradient accumulation (run backward more than once).
2154
6.14k
    if (compiled_data->backward.count > 1)
2155
497
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count);
2156
5.64k
    else
2157
5.64k
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2158
6.14k
  }
2159
6.16k
  if (compiled_data->apply_gradients.graph)
2160
6.16k
    ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context);
2161
  // Reset backward count to 0.
2162
6.16k
  compiled_data->backward.count = 0;
2163
6.16k
}
2164
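Because ccv_cnnp_model_apply_gradients only consumes whatever gradients are bound and then resets backward.count, gradients can be accumulated over several micro-batches by calling ccv_cnnp_model_backward more than once before applying; the second and later backward calls go through the accumulator graph compiled by _ccv_cnnp_model_multistage_jit_1. A sketch under the same assumptions as the previous examples, with hypothetical micro-batch tensors x0/y0 and x1/y1:

  ccv_cnnp_evaluate_param_t train_params = { .requires_grad = 1 };
  // First micro-batch: gradients are written directly into tensors.gradients.
  ccv_cnnp_model_evaluate(model, train_params, TENSOR_LIST(x0), TENSOR_LIST(y0), 0, 0);
  ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
  // Second micro-batch: backward.count > 0, so CMD_EWSUM_FORWARD sums the new
  // gradients into the accumulated ones after the backward pass runs.
  ccv_cnnp_model_evaluate(model, train_params, TENSOR_LIST(x1), TENSOR_LIST(y1), 0, 0);
  ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
  // Apply once; with backward.count > 1 the accumulated gradients are bound instead.
  ccv_cnnp_model_apply_gradients(model, 0);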
2165
void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor)
2166
32
{
2167
32
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2168
32
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 18
:
parameter->param_sel24
;
2169
32
  assert(parameter->param_sel != 0);
2170
32
  const int tensors_init = !!compiled_data->tensors_init.v;
2171
32
  if (!tensors_init)
2172
17
    _ccv_cnnp_model_tensors_init(model, compiled_data);
2173
15
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
2174
  // Check whether it is fully allocated; if it is not, run init_1.
2175
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
2176
32
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2177
32
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2178
32
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 131
:
parameter->param_ref1
;
2179
32
  if (param_ref < 0)
2180
1
    { assert(parameter_indices->rnum == 1); }
2181
31
  else
2182
31
    { assert(param_ref < parameter_indices->rnum); }
2183
32
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2184
32
  ccv_array_free(parameter_indices);
2185
32
  const int parameter_size = compiled_data->parameters->rnum;
2186
32
  assert(d >= 0);
2187
32
  assert(d < parameter_size);
2188
32
  const int parallel_count = ccv_max(model->parallel_count, 1);
2189
32
  ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2190
32
  assert(dest);
2191
32
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0);
2192
32
  int i;
2193
32
  for (i = 1; i < parallel_count; 
i++0
)
2194
0
  {
2195
0
    ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d + i * parameter_size]);
2196
0
    if (copy_tensor)
2197
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2198
0
  }
2199
  // Mark this symbol as init'ed.
2200
32
  const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d;
2201
32
  uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2202
32
  init_v[s >> 5] |= (1u << (s & 0x1f));
2203
32
}
2204
2205
void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor)
2206
6
{
2207
6
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2208
6
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 13
:
parameter->param_sel3
;
2209
6
  assert(parameter->param_sel != 0);
2210
6
  assert(compiled_data->tensors.parameters);
2211
6
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2212
6
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2213
6
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 13
:
parameter->param_ref3
;
2214
6
  if (param_ref < 0)
2215
3
    { assert(parameter_indices->rnum == 1); }
2216
3
  else
2217
3
    { assert(param_ref < parameter_indices->rnum); }
2218
6
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2219
6
  ccv_array_free(parameter_indices);
2220
6
  const int parameter_size = compiled_data->parameters->rnum;
2221
6
  assert(d >= 0);
2222
6
  assert(d < parameter_size);
2223
  // We don't need to consider parallel_count; every parameter on each device is identical.
2224
6
  ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2225
6
  assert(src);
2226
6
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0);
2227
6
}
2228
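The parameter accessors here work off ccv_cnnp_model_io_t handles returned by ccv_cnnp_model_parameters. A sketch of reading back one parameter into a freshly allocated tensor, assuming the model is compiled and its parameters have already been initialized (e.g. by a prior evaluate):

  // Handle for parameter 0 across all selectors (-1). The handle is owned by the model.
  const ccv_cnnp_model_io_t weight = ccv_cnnp_model_parameters(model, -1, 0);
  const char* const name = ccv_cnnp_model_parameter_name(model, weight); // id string, e.g. for logging
  const ccv_nnc_tensor_param_t info = ccv_cnnp_model_parameter_tensor_params(model, weight);
  ccv_nnc_tensor_t* const snapshot = ccv_nnc_tensor_new(0, info, 0);
  ccv_cnnp_model_parameter_copy(model, weight, snapshot); // CMD_DATA_TRANSFER under the hood
  // ... inspect or serialize snapshot and name ...
  ccv_nnc_tensor_free(snapshot);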
2229
ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2230
1
{
2231
1
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2232
1
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 10
: parameter->param_sel;
2233
1
  assert(parameter->param_sel != 0);
2234
1
  assert(compiled_data->tensors.parameters);
2235
1
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2236
1
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2237
1
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 10
: parameter->param_ref;
2238
1
  if (param_ref < 0)
2239
1
    { assert(parameter_indices->rnum == 1); }
2240
0
  else
2241
0
    { assert(param_ref < parameter_indices->rnum); }
2242
1
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2243
1
  ccv_array_free(parameter_indices);
2244
1
  const int parameter_size = compiled_data->parameters->rnum;
2245
1
  assert(d >= 0);
2246
1
  assert(d < parameter_size);
2247
  // We don't need to consider parallel_count; every parameter on each device is identical.
2248
1
  ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2249
1
  assert(tensor);
2250
1
  return tensor->info;
2251
1
}
2252
2253
const char* ccv_cnnp_model_parameter_name(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2254
2
{
2255
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2256
2
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : 
parameter->param_sel0
;
2257
2
  assert(parameter->param_sel != 0);
2258
2
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2259
2
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2260
2
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : 
parameter->param_ref0
;
2261
2
  if (param_ref < 0)
2262
0
    { assert(parameter_indices->rnum == 1); }
2263
2
  else
2264
2
    { assert(param_ref < parameter_indices->rnum); }
2265
2
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2266
2
  ccv_array_free(parameter_indices);
2267
2
  const int parameter_size = compiled_data->parameters->rnum;
2268
2
  assert(d >= 0);
2269
2
  assert(d < parameter_size);
2270
2
  return *(char**)ccv_array_get(compiled_data->ids.parameters, d);
2271
2
}
2272
2273
int ccv_cnnp_model_parameter_count(ccv_cnnp_model_t* const model)
2274
0
{
2275
0
  assert(model->compiled_data);
2276
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2277
0
  return compiled_data->parameters->rnum;
2278
0
}
2279
2280
ccv_cnnp_model_io_t ccv_cnnp_model_parameter_first(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f first, void* const context)
2281
0
{
2282
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2283
0
  assert(compiled_data);
2284
0
  const int parameter_size = compiled_data->parameters->rnum;
2285
0
  int i;
2286
0
  for (i = 0; i < parameter_size; i++)
2287
0
  {
2288
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2289
0
    if (first(model, name, context))
2290
0
      return ccv_cnnp_model_parameters(model, -1, i);
2291
0
  }
2292
0
  return 0;
2293
0
}
2294
2295
ccv_array_t* ccv_cnnp_model_parameters_filter(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f filter, void* const context)
2296
0
{
2297
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2298
0
  assert(compiled_data);
2299
0
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 0, 0);
2300
0
  const int parameter_size = compiled_data->parameters->rnum;
2301
0
  int i;
2302
0
  for (i = 0; i < parameter_size; i++)
2303
0
  {
2304
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2305
0
    if (filter(model, name, context))
2306
0
    {
2307
0
      ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i);
2308
0
      ccv_array_push(parameters, &parameter);
2309
0
    }
2310
0
  }
2311
0
  return parameters;
2312
2313
0
}
2314
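ccv_cnnp_model_parameters_filter above calls filter(model, name, context) for every parameter id and collects the matching handles. A hedged sketch of a callback and its use; the exact callback prototype is assumed from that call site, and strstr needs <string.h>:

  // Select parameters whose generated id contains "bias"; return nonzero to include.
  static int select_bias(const ccv_cnnp_model_t* const model, const char* const name, void* const context)
  {
    return strstr(name, "bias") != 0;
  }

  // Later, with a compiled model:
  ccv_array_t* const biases = ccv_cnnp_model_parameters_filter(model, select_bias, 0);
  // biases holds ccv_cnnp_model_io_t entries owned by the model; only the array needs freeing.
  ccv_array_free(biases);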
2315
CCV_WARN_UNUSED(ccv_cnnp_model_io_t) ccv_cnnp_model_parameter_first_uninit(ccv_cnnp_model_t* const model)
2316
0
{
2317
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2318
0
  assert(compiled_data);
2319
0
  const int tensors_init = !!compiled_data->tensors_init.v;
2320
0
  if (!tensors_init) // If nothing initialized, we return parameter 0.
2321
0
    return ccv_cnnp_model_parameters(model, -1, 0);
2322
0
  const int parameter_size = compiled_data->parameters->rnum;
2323
0
  int i;
2324
0
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2325
0
  for (i = 0; i < parameter_size; i++)
2326
0
  {
2327
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2328
0
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
2329
0
      return ccv_cnnp_model_parameters(model, -1, i);
2330
0
  }
2331
0
  return 0;
2332
0
}
2333
2334
static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref)
2335
48
{
2336
48
  const int to_param_sel = parameters->param_sel > 0 ? 
parameters->param_sel - 10
: parameters->param_sel;
2337
48
  assert(parameters->param_sel != 0);
2338
48
  ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2339
48
  ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices);
2340
48
  *param_ref = parameters->param_ref > 0 ? 
parameters->param_ref - 10
: parameters->param_ref;
2341
48
  return to_parameter_indices;
2342
48
}
2343
2344
static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref, const int only_init_0)
2345
14
{
2346
  // If the model is not compiled yet, compile it now.
2347
14
  if (!model->graph)
2348
3
  {
2349
3
    model->graph = ccv_nnc_symbolic_graph_new();
2350
3
    assert(from_model->compiled_data);
2351
3
    const int input_size = from_model->input_size;
2352
3
    ccv_nnc_tensor_param_t input_params[input_size];
2353
3
    int i;
2354
9
    for (i = 0; i < input_size; 
i++6
)
2355
6
      input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]);
2356
3
    _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss);
2357
3
    model->parallel_count = from_model->parallel_count;
2358
3
    model->memory_compression = from_model->memory_compression;
2359
3
    model->memory_reduction = from_model->memory_reduction;
2360
3
    model->gradient_checkpointing = from_model->gradient_checkpointing;
2361
3
    model->compiled_data->stream_type = from_model->compiled_data->stream_type;
2362
3
    model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer;
2363
3
    model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size;
2364
3
  }
2365
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2366
14
  assert(to_compiled_data);
2367
14
  const int to_tensors_init = !!to_compiled_data->tensors_init.v;
2368
14
  if (!to_tensors_init)
2369
10
  {
2370
10
    if (only_init_0)
2371
1
      ccv_cnnp_model_tensors_init_0(model, to_compiled_data);
2372
9
    else
2373
9
      _ccv_cnnp_model_tensors_init(model, to_compiled_data);
2374
10
  } else 
if (4
!only_init_04
&&
(uintptr_t)to_compiled_data->tensors_init.v & (uintptr_t)13
)
2375
    // Check whether it is fully allocated; if it is not, run init_1.
2376
0
      ccv_cnnp_model_tensors_init_1(model, to_compiled_data);
2377
14
  assert(to_compiled_data->tensors.parameters);
2378
14
  *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref);
2379
14
  *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref);
2380
14
  if (*from_param_ref < 0 && *param_ref >= 0)
2381
0
    { assert((*from_parameter_indices)->rnum == 1); }
2382
14
  else if (*from_param_ref >= 0)
2383
0
    { assert(*from_param_ref < (*from_parameter_indices)->rnum); }
2384
14
  if (*param_ref < 0 && *from_param_ref >= 0)
2385
0
    { assert((*parameter_indices)->rnum == 1); }
2386
14
  else if (*param_ref >= 0)
2387
0
    { assert(*param_ref < (*parameter_indices)->rnum); }
2388
14
}
2389
2390
void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2391
9
{
2392
9
  ccv_array_t* to_parameter_indices;
2393
9
  int to_param_ref;
2394
9
  ccv_array_t* from_parameter_indices;
2395
9
  int from_param_ref;
2396
9
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2397
  // Should be exactly the same tensor.
2398
9
  if (to_param_ref < 0 && from_param_ref < 0)
2399
9
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2400
  // To models.
2401
9
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2402
9
  assert(to_compiled_data);
2403
  // From models.
2404
9
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2405
9
  const int parallel_count = ccv_max(model->parallel_count, 1);
2406
9
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2407
9
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2408
9
  int i, j;
2409
9
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2410
9
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2411
18
  for (i = 0; i < rnum; 
i++9
)
2412
9
  {
2413
9
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2414
9
    assert(src_d >= 0);
2415
9
    assert(src_d < from_compiled_data->parameters->rnum);
2416
9
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2417
    // If the original is not init'ed, we cannot copy from it.
2418
9
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2419
0
      continue;
2420
9
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2421
9
    assert(dest_d >= 0);
2422
9
    assert(dest_d < to_compiled_data->parameters->rnum);
2423
9
    ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2424
9
    assert(src);
2425
9
    ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2426
9
    assert(dest);
2427
9
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0);
2428
27
    for (j = 1; j < parallel_count; 
j++18
)
2429
18
    {
2430
18
      ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2431
18
      if (copy_tensor)
2432
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2433
18
    }
2434
    // Mark this symbol as init'ed.
2435
9
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2436
9
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2437
9
  }
2438
9
  ccv_array_free(to_parameter_indices);
2439
9
  ccv_array_free(from_parameter_indices);
2440
9
}
2441
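ccv_cnnp_model_set_parameters above copies parameter values between two compiled models with matching parameter lists (and broadcasts to every device when parallel_count > 1). A one-line sketch copying everything, where -1/-1 selects all parameters, assuming both models were compiled with the same topology:

  ccv_cnnp_model_set_parameters(model, ccv_cnnp_model_parameters(model, -1, -1),
    from_model, ccv_cnnp_model_parameters(from_model, -1, -1));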
2442
KHASH_MAP_INIT_STR(ccv_cnnp_parameter_id, int)
2443
2444
void ccv_cnnp_model_share_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_cnnp_model_parameters_renamer_f renamer, void* const context)
2445
2
{
2446
2
  ccv_array_t* to_parameter_indices;
2447
2
  int to_param_ref;
2448
2
  ccv_array_t* from_parameter_indices;
2449
2
  int from_param_ref;
2450
2
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 1);
2451
  // Should be exactly the same tensor.
2452
2
  if (renamer == 0 && 
to_param_ref < 01
&&
from_param_ref < 01
)
2453
1
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2454
  // To models.
2455
2
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2456
2
  assert(to_compiled_data);
2457
  // From models.
2458
2
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2459
2
  const int parallel_count = ccv_max(model->parallel_count, 1);
2460
2
  assert(parallel_count == ccv_max(from_model->parallel_count, 1)); // Must have the same parallel count to share parameters.
2461
2
  const int from_parameter_size = from_compiled_data->parameters->rnum;
2462
2
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2463
2
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2464
2
  int i, j;
2465
2
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2466
2
  char* updated_name = 0;
2467
2
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2468
2
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2469
8
  for (i = 0; i < rnum; 
i++6
)
2470
6
  {
2471
6
    int src_d = (from_param_ref >= 0 ? 
from_param_ref0
: i) < from_parameter_indices->rnum ?
*(int*)4
ccv_array_get4
(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i) :
from_parameter_size2
;
2472
    // Need to figure out how to use the renamer here.
2473
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2474
6
    assert(dest_d >= 0);
2475
6
    assert(dest_d < to_parameter_size);
2476
6
    if (renamer)
2477
3
    {
2478
3
      const char* const src_name = (src_d < from_parameter_size && 
src_d >= 01
) ?
*(char**)1
ccv_array_get1
(from_compiled_data->ids.parameters, src_d) :
02
;
2479
3
      const char* const dest_name = *(char**)ccv_array_get(to_compiled_data->ids.parameters, dest_d);
2480
3
      if (!updated_name)
2481
1
        updated_name = (char*)ccmalloc(1024);
2482
3
      const size_t src_name_len = src_name == 0 ? 
02
:
ccv_min1
(strnlen(src_name, 1023), 1023);
2483
3
      if (src_name_len > 0)
2484
1
        memcpy(updated_name, src_name, src_name_len);
2485
3
      updated_name[src_name_len] = 0;
2486
3
      if (renamer(context, dest_name, updated_name, 1024) != 0)
2487
0
        continue; // Skip this.
2488
3
      if (src_name != 0 && 
memcmp(updated_name, src_name, src_name_len) == 01
&&
strnlen(updated_name, 1023) == src_name_len0
)
2489
0
      {
2490
        // Nothing changed.
2491
3
      } else {
2492
3
        if (!id_map)
2493
1
        {
2494
1
          id_map = kh_init(ccv_cnnp_parameter_id);
2495
2
          for (j = 0; j < from_parameter_size; 
j++1
)
2496
1
          {
2497
1
            int ret;
2498
1
            const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, *(char**)ccv_array_get(from_compiled_data->ids.parameters, j), &ret);
2499
1
            assert(ret != 0);
2500
1
            kh_val(id_map, k) = j;
2501
1
          }
2502
1
        }
2503
3
        const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, updated_name);
2504
3
        if (k == kh_end(id_map)) // Cannot find the name, skip.
2505
2
          continue;
2506
1
        src_d = kh_val(id_map, k);
2507
1
        assert(src_d >= 0);
2508
1
        assert(src_d < from_parameter_size);
2509
1
      }
2510
3
    }
2511
6
    assert
(src_d >= 0)4
;
2512
4
    assert(src_d < from_parameter_size);
2513
4
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2514
    // If the original is not init'ed, we cannot share from it.
2515
4
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2516
0
      continue;
2517
8
    
for (j = 0; 4
j < parallel_count;
j++4
)
2518
4
    {
2519
4
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * from_parameter_size]);
2520
4
      assert(src);
2521
4
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2522
4
      if (dest && 
!((uintptr_t)dest & (uintptr_t)1)1
)
2523
1
        ccv_nnc_tensor_free(dest);
2524
4
      to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size] = (ccv_nnc_tensor_t*)((uintptr_t)src | (uintptr_t)1);
2525
4
    }
2526
    // Mark this symbol as init'ed.
2527
4
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2528
4
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2529
4
  }
2530
2
  ccv_array_free(to_parameter_indices);
2531
2
  ccv_array_free(from_parameter_indices);
2532
2
  if (id_map)
2533
1
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2534
2
  if (updated_name)
2535
1
    ccfree(updated_name);
2536
  // Mark it as incomplete so we will call init_1.
2537
2
  if (ccv_cnnp_model_tensors_any_to_alloc(model, to_compiled_data))
2538
0
    to_compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)to_compiled_data->tensors_init.v | (uintptr_t)1);
2539
2
  else // Remove the flag.
2540
2
    to_compiled_data->tensors_init.v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2541
2
}
2542
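ccv_cnnp_model_share_parameters pre-fills updated_name with the source id (when one exists), hands the destination id to the renamer, and then looks the rewritten name up in the from-model; returning nonzero skips that parameter. A hedged sketch of a renamer that strips a hypothetical "shared-" prefix from the destination name; the callback prototype is assumed from the renamer(context, dest_name, updated_name, 1024) call above, and <string.h>/<stdio.h> are needed for strlen/strncmp/snprintf:

  static int drop_shared_prefix(void* const context, const char* const dest_name, char* const updated_name, const size_t provided_size)
  {
    const char* const prefix = (const char*)context; // e.g. "shared-"
    const size_t prefix_len = strlen(prefix);
    if (strncmp(dest_name, prefix, prefix_len) != 0)
      return -1; // No match: skip, this parameter keeps its own storage.
    snprintf(updated_name, provided_size, "%s", dest_name + prefix_len);
    return 0; // Look up the rewritten name in from_model's ids.parameters.
  }

  // Share every matching parameter, reusing from_model's tensors in model:
  ccv_cnnp_model_share_parameters(model, ccv_cnnp_model_parameters(model, -1, -1),
    from_model, ccv_cnnp_model_parameters(from_model, -1, -1), drop_shared_prefix, (void*)"shared-");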
2543
ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type)
2544
24
{
2545
24
  if (!compiled_data->stream_map)
2546
4
    compiled_data->stream_map = kh_init(stream_map);
2547
24
  int ret = 0;
2548
24
  khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret);
2549
24
  assert(ret >= 0);
2550
24
  ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k);
2551
  // If ret == 0, the key already exists and we can return directly; otherwise, create and return.
2552
24
  if (ret != 0)
2553
16
  {
2554
16
    stream = ccv_nnc_stream_context_new(type);
2555
16
    kh_val(compiled_data->stream_map, k) = stream;
2556
16
  }
2557
24
  return stream;
2558
24
}
2559
2560
void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2561
3
{
2562
3
  ccv_array_t* to_parameter_indices;
2563
3
  int to_param_ref;
2564
3
  ccv_array_t* from_parameter_indices;
2565
3
  int from_param_ref;
2566
3
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2567
  // Should be exactly the same tensor.
2568
3
  if (to_param_ref < 0 && from_param_ref < 0)
2569
3
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2570
  // To models.
2571
3
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2572
3
  assert(to_compiled_data);
2573
  // From models.
2574
3
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2575
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
2576
3
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2577
3
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2578
3
  assert(aux_in_size >= 0);
2579
3
  assert(aux_out_size >= 0);
2580
3
  int i, j;
2581
3
  ccv_nnc_tensor_t* inputs[aux_in_size + 2];
2582
3
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2583
3
  for (i = 0; i < aux_in_size; 
i++0
)
2584
0
    inputs[i + 2] = aux_ins[i];
2585
3
  for (i = 0; i < aux_out_size; 
i++0
)
2586
0
    outputs[i + 1] = aux_outs[i];
2587
3
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2588
3
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2589
6
  for (i = 0; i < rnum; 
i++3
)
2590
3
  {
2591
3
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2592
3
    assert(src_d >= 0);
2593
3
    assert(src_d < from_compiled_data->parameters->rnum);
2594
3
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2595
    // If the original is not init'ed, we cannot copy from it.
2596
3
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2597
0
      continue;
2598
3
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2599
3
    assert(dest_d >= 0);
2600
3
    assert(dest_d < to_compiled_data->parameters->rnum);
2601
3
    if (parallel_count > 1)
2602
2
    {
2603
2
      ccv_nnc_stream_context_t* streams[parallel_count];
2604
2
      ccv_nnc_stream_signal_t* signal;
2605
2
      if (stream_context)
2606
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2607
10
      for (j = 0; j < parallel_count; 
j++8
)
2608
8
      {
2609
8
        ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * to_parameter_size]);
2610
8
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2611
8
        if (!dest || !src)
2612
0
        {
2613
0
          streams[j] = 0;
2614
0
          continue;
2615
0
        }
2616
        // At the moment, we can only handle them on the same device.
2617
8
        assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type));
2618
8
        assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type));
2619
8
        const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2620
8
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type);
2621
8
        int type = stream_type;
2622
8
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2623
8
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2624
        // Wait for the signal to finish.
2625
8
        if (stream_context)
2626
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2627
8
        inputs[0] = outputs[0] = dest;
2628
8
        inputs[1] = src;
2629
8
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0);
2630
8
        if (stream_context)
2631
4
        {
2632
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2633
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2634
4
        }
2635
8
        streams[j] = stream_0;
2636
8
      }
2637
      // If this should be blocking, block it.
2638
2
      if (!stream_context)
2639
5
        
for (j = 0; 1
j < parallel_count;
j++4
)
2640
4
          if (streams[j])
2641
4
            ccv_nnc_stream_context_wait(streams[j]);
2642
2
    } else {
2643
1
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2644
1
      assert(src);
2645
1
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2646
1
      assert(dest);
2647
1
      inputs[0] = outputs[0] = dest;
2648
1
      inputs[1] = src;
2649
1
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context);
2650
1
    }
2651
    // Mark this symbol as init'ed.
2652
3
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2653
3
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2654
3
  }
2655
3
  ccv_array_free(to_parameter_indices);
2656
3
  ccv_array_free(from_parameter_indices);
2657
3
}
2658
2659
void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2660
14
{
2661
14
  int to_param_ref;
2662
14
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2663
  // To models.
2664
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2665
14
  assert(to_compiled_data);
2666
  // The tensors have to be inited already.
2667
14
  assert(!!to_compiled_data->tensors_init.v);
2668
14
  assert(to_compiled_data->tensors.parameters);
2669
  // From models.
2670
14
  const int parallel_count = ccv_max(model->parallel_count, 1);
2671
14
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2672
14
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2673
14
  assert(aux_in_size >= 0);
2674
14
  assert(aux_out_size >= 0);
2675
14
  int i, j;
2676
14
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2677
14
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2678
14
  for (i = 0; i < aux_in_size; 
i++0
)
2679
0
    inputs[i + 1] = aux_ins[i];
2680
14
  for (i = 0; i < aux_out_size; 
i++0
)
2681
0
    outputs[i + 1] = aux_outs[i];
2682
28
  for (i = 0; i < rnum; 
i++14
)
2683
14
  {
2684
14
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2685
14
    assert(dest_d >= 0);
2686
14
    assert(dest_d < to_compiled_data->parameters->rnum);
2687
14
    if (parallel_count > 1)
2688
4
    {
2689
4
      ccv_nnc_stream_context_t* streams[parallel_count];
2690
4
      ccv_nnc_stream_signal_t* signal;
2691
4
      if (stream_context)
2692
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2693
20
      for (j = 0; j < parallel_count; 
j++16
)
2694
16
      {
2695
16
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2696
16
        if (!dest)
2697
0
        {
2698
0
          streams[j] = 0;
2699
0
          continue;
2700
0
        }
2701
16
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2702
16
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2703
16
        int type = stream_type;
2704
16
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2705
16
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2706
        // Wait for the signal to finish.
2707
16
        if (stream_context)
2708
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2709
16
        inputs[0] = outputs[0] = dest;
2710
16
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2711
16
        if (stream_context)
2712
4
        {
2713
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2714
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2715
4
        }
2716
16
        streams[j] = stream_0;
2717
16
      }
2718
      // If this should be blocking, block it.
2719
4
      if (!stream_context)
2720
15
        
for (j = 0; 3
j < parallel_count;
j++12
)
2721
12
          if (streams[j])
2722
12
            ccv_nnc_stream_context_wait(streams[j]);
2723
10
    } else {
2724
10
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2725
10
      assert(dest);
2726
10
      inputs[0] = outputs[0] = dest;
2727
10
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2728
10
    }
2729
    // No need to mark this symbol as init'ed; it already is.
2730
14
  }
2731
14
  ccv_array_free(to_parameter_indices);
2732
14
}
2733
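ccv_cnnp_model_parameters_map applies a command in place over every selected parameter (the parameter is both inputs[0] and outputs[0]), optionally with extra aux tensors, and blocks when stream_context is 0. A hedged sketch, assuming a compiled model whose parameters are initialized, that re-zeroes every parameter with the same CMD_SET_FORWARD(0) used elsewhere in this file; ccv_cnnp_model_parameters_zip_map works the same way but additionally feeds the matching parameter of a second model as inputs[1]:

  // For illustration only: overwrite every parameter of the model with zeros, in place,
  // across all devices, and wait for completion (stream_context == 0).
  ccv_cnnp_model_parameters_map(model, ccv_cnnp_model_parameters(model, -1, -1),
    CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);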
2734
void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2735
6
{
2736
6
  int to_param_ref;
2737
6
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2738
  // To models.
2739
6
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2740
6
  assert(to_compiled_data);
2741
  // The tensors have to be inited already.
2742
6
  assert(!!to_compiled_data->tensors_init.v);
2743
6
  ccv_nnc_tensor_t** tensor_gradients;
2744
6
  if (to_compiled_data->backward.count > 1)
2745
3
    tensor_gradients = to_compiled_data->tensors.accum_gradients;
2746
3
  else
2747
3
    tensor_gradients = to_compiled_data->tensors.gradients;
2748
6
  assert(tensor_gradients);
2749
  // From models.
2750
6
  const int parallel_count = ccv_max(model->parallel_count, 1);
2751
6
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2752
6
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2753
6
  assert(aux_in_size >= 0);
2754
6
  assert(aux_out_size >= 0);
2755
6
  int i, j;
2756
6
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2757
6
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2758
10
  for (i = 0; i < aux_in_size; 
i++4
)
2759
4
    inputs[i + 1] = aux_ins[i];
2760
14
  for (i = 0; i < aux_out_size; 
i++8
)
2761
8
    outputs[i + 1] = aux_outs[i];
2762
12
  for (i = 0; i < rnum; 
i++6
)
2763
6
  {
2764
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2765
6
    assert(dest_d >= 0);
2766
6
    assert(dest_d < to_compiled_data->parameters->rnum);
2767
6
    if (parallel_count > 1)
2768
0
    {
2769
0
      ccv_nnc_stream_context_t* streams[parallel_count];
2770
0
      ccv_nnc_stream_signal_t* signal;
2771
0
      if (stream_context)
2772
0
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2773
0
      for (j = 0; j < parallel_count; j++)
2774
0
      {
2775
0
        ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size];
2776
0
        if (!dest)
2777
0
        {
2778
0
          streams[j] = 0;
2779
0
          continue;
2780
0
        }
2781
0
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2782
0
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2783
0
        int type = stream_type;
2784
0
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2785
0
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2786
        // Wait for the signal to finish.
2787
0
        if (stream_context)
2788
0
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2789
0
        inputs[0] = outputs[0] = dest;
2790
0
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2791
0
        if (stream_context)
2792
0
        {
2793
0
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2794
0
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2795
0
        }
2796
0
        streams[j] = stream_0;
2797
0
      }
2798
      // If this should be blocking, block it.
2799
0
      if (!stream_context)
2800
0
        for (j = 0; j < parallel_count; j++)
2801
0
          if (streams[j])
2802
0
            ccv_nnc_stream_context_wait(streams[j]);
2803
6
    } else {
2804
6
      ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d];
2805
6
      if (!dest)
2806
0
        continue;
2807
6
      assert(dest);
2808
6
      inputs[0] = outputs[0] = dest;
2809
6
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2810
6
    }
2811
    // No need to mark this symbol as init'ed, it is already.
2812
6
  }
2813
6
  ccv_array_free(to_parameter_indices);
2814
6
}
2815
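For reference, a minimal caller-side sketch of how ccv_cnnp_model_parameter_gradients_map above can be used to clear every gradient tensor in place before a fresh accumulation pass. This is an illustrative assumption, not code from this file: ALL_PARAMETERS, CMD_SET_FORWARD and ccv_nnc_no_hint are assumed to be available from the ccv_nnc headers, and ccv_cnnp_model_parameters is assumed to return a selector covering all parameters of the model.

static void zero_all_gradients(ccv_cnnp_model_t* const model)
{
  // Select every parameter of the model (ALL_PARAMETERS for both selector and index).
  const ccv_cnnp_model_io_t all = ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS);
  // Map CMD_SET_FORWARD(0) over each gradient tensor: the tensor is passed as
  // both inputs[0] and outputs[0], exactly as the loop above does. No aux
  // tensors are needed, and a 0 stream context makes the call blocking.
  ccv_cnnp_model_parameter_gradients_map(model, all, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);
}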
2816
ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model)
2817
2.20k
{
2818
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2819
2.20k
  assert(compiled_data);
2820
2.20k
  return compiled_data->minimize.minimizer;
2821
2.20k
}
2822
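A small sketch of how the getter above pairs with ccv_nnc_minimizer_saved_aux_size (called a few lines below in this file) to inspect the optimizer a compiled model currently carries. The model handle is assumed to be compiled already; otherwise the assert in ccv_cnnp_model_minimizer fires.

static int current_saved_aux_size(ccv_cnnp_model_t* const model)
{
  // Read back the optimizer command currently attached to the compiled model.
  const ccv_nnc_cmd_t minimizer = ccv_cnnp_model_minimizer(model);
  // Ask how many auxiliary tensors (e.g. momentum buffers) it keeps per parameter.
  return ccv_nnc_minimizer_saved_aux_size(minimizer);
}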
2823
void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size)
2824
4.36k
{
2825
4.36k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2826
4.36k
  assert(compiled_data);
2827
4.36k
  const int parameter_size = compiled_data->parameters->rnum;
2828
4.36k
  if (parameter_size == 0)
2829
3
    return;
2830
4.35k
  if (reset)
2831
2.49k
    { assert(set_parameters == 0 && set_parameter_size == 0); }
2832
4.35k
  const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2833
4.35k
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
2834
4.35k
  if (saved_aux_size > compiled_data->minimize.max_saved_aux_size)
2835
7
    compiled_data->minimize.max_saved_aux_size = saved_aux_size;
2836
4.35k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2837
  // We update all parameters; at this point, we have one minimizer.
2838
4.35k
  if (set_parameters == 0 || 
set_parameter_size == 0301
)
2839
4.05k
    compiled_data->minimize.minimizer = minimizer;
2840
4.35k
  int i;
2841
4.35k
  if (set_parameters && 
set_parameter_size301
)
2842
301
  {
2843
    // We need to save which minimizer goes along with these parameters.
2844
301
    if (!compiled_data->minimize.parameters)
2845
5
      compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0);
2846
301
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t));
2847
301
    set_minimizer_for_parameter->minimizer = minimizer;
2848
301
    set_minimizer_for_parameter->parameter_size = set_parameter_size;
2849
301
    memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size);
2850
301
    ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter);
2851
301
  }
2852
  // If reset is true, clear the parameters array.
2853
4.35k
  if (reset && 
compiled_data->minimize.parameters2.49k
)
2854
291
  {
2855
582
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++291
)
2856
291
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
2857
291
    ccv_array_clear(compiled_data->minimize.parameters);
2858
291
  }
2859
4.35k
  if (!compiled_data->update_nodes)
2860
9
    return;
2861
4.34k
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
2862
4.34k
  assert(symbolic_graph);
2863
4.34k
  if (saved_aux_size > old_max_saved_aux_size)
2864
7
  {
2865
7
    assert(compiled_data->updated_parameters);
2866
    // Reallocate first, move them around later.
2867
7
    compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size);
2868
7
    compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
2869
7
    compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
2870
    // We need to do this from back to front: because saved_aux_size > old_saved_aux_size, the regions could overlap.
2871
7
    _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size);
2872
7
  }
2873
4.34k
  int flag = 0;
2874
4.34k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2875
4.34k
  if (set_parameters && 
set_parameter_size296
)
2876
296
  {
2877
296
    ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2878
592
    for (i = 0; i < set_parameter_size; 
i++296
)
2879
296
    {
2880
296
      const int param_sel = set_parameters[i]->param_sel > 0 ? 
set_parameters[i]->param_sel - 1291
:
set_parameters[i]->param_sel5
;
2881
296
      assert(set_parameters[i]->param_sel != 0);
2882
296
      const int old_rnum = parameter_indices->rnum;
2883
296
      ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices);
2884
296
      const int param_ref = set_parameters[i]->param_ref > 0 ? 
set_parameters[i]->param_ref - 10
: set_parameters[i]->param_ref;
2885
296
      assert(set_parameters[i]->param_ref != 0);
2886
296
      if (param_ref >= 0)
2887
0
      {
2888
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
2889
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
2890
0
        parameter_indices->rnum = old_rnum + 1;
2891
0
      }
2892
296
    }
2893
    // We may have duplicated indices, but that is OK; we will just set them twice.
2894
5.24k
    
for (i = 0; 296
i < parameter_indices->rnum;
i++4.95k
)
2895
4.95k
    {
2896
4.95k
      const int d = *(int*)ccv_array_get(parameter_indices, i);
2897
4.95k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d))
2898
0
        flag = 1;
2899
4.95k
    }
2900
296
    ccv_array_free(parameter_indices);
2901
4.05k
  } else {
2902
19.1k
    for (i = 0; i < parameter_size; 
i++15.0k
)
2903
15.0k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i))
2904
65
        flag = 1;
2905
4.05k
    if (compiled_data->minimize.parameters)
2906
291
      if (_ccv_cnnp_apply_parameters_with_minimizer(model))
2907
0
        flag = 1;
2908
4.05k
  }
2909
4.34k
  if (flag)
2910
7
  {
2911
    // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply gradients graph.
2912
7
    if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE)
2913
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
2914
7
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
2915
7
  }
2916
4.34k
}
2917
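A hedged usage sketch for ccv_cnnp_model_set_minimizer above: apply one optimizer to the whole model with reset = 1, then record a per-parameter override with reset = 0. The sub-model handle and the two ccv_nnc_cmd_t values are caller-supplied assumptions (sub_model is assumed to be composed into model), and ALL_PARAMETERS is assumed from the ccv_nnc headers.

static void set_optimizers(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const sub_model, const ccv_nnc_cmd_t base_minimizer, const ccv_nnc_cmd_t sub_minimizer)
{
  // reset = 1 clears previously recorded per-parameter overrides; the function
  // asserts that no parameter list is passed in this mode.
  ccv_cnnp_model_set_minimizer(model, base_minimizer, 1, 0, 0);
  // reset = 0 with an explicit selector stores the override in
  // compiled_data->minimize.parameters so it can be re-applied later; the
  // global minimizer is left untouched in this branch.
  const ccv_cnnp_model_io_t params = ccv_cnnp_model_parameters(sub_model, ALL_PARAMETERS, ALL_PARAMETERS);
  ccv_cnnp_model_set_minimizer(model, sub_minimizer, 0, &params, 1);
}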
2918
void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params)
2919
0
{
2920
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2921
0
  assert(compiled_data);
2922
0
  compiled_data->compile_params = compile_params;
2923
0
}
2924
2925
void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size)
2926
45
{
2927
45
  if (model->graph && 
out_size > 044
)
2928
44
    ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]);
2929
45
  if (model->compiled_data && 
model->compiled_data->graph44
&&
out_size > 116
)
2930
0
    ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]);
2931
45
  if (model->compiled_data && 
model->compiled_data->backward.accum44
&&
out_size > 20
)
2932
0
    ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]);
2933
45
  if (model->compiled_data && 
model->compiled_data->apply_gradients.graph44
&&
out_size > 33
)
2934
0
    ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]);
2935
45
}
2936
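A sketch of driving ccv_cnnp_model_dot from calling code. As read from the function above, out_size gates which graphs are written: the symbolic graph, the run graph, the gradient accumulation graph and the gradient application graph, in that order. CCV_NNC_LONG_DOT_GRAPH is assumed to be the verbose dot flag from the ccv_nnc headers, and <stdio.h> is assumed to be included; file names are illustrative.

static void dump_model_graphs(const ccv_cnnp_model_t* const model)
{
  const char* const names[4] = { "symbolic.dot", "graph.dot", "accum.dot", "apply_gradients.dot" };
  FILE* outs[4];
  int i;
  for (i = 0; i < 4; i++)
    if (!(outs[i] = fopen(names[i], "w+")))
    {
      // Close anything already opened and bail; error handling kept minimal.
      while (--i >= 0)
        fclose(outs[i]);
      return;
    }
  // Graphs that do not exist yet are skipped inside ccv_cnnp_model_dot, so
  // handing it all four outputs is fine both before and after compilation.
  ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, outs, 4);
  for (i = 0; i < 4; i++)
    fclose(outs[i]);
}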
2937
void ccv_cnnp_model_format(const ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_format_f format_fn, void* const context)
2938
0
{
2939
0
  if (model->graph)
2940
0
    ccv_nnc_symbolic_graph_format(model->graph, 0, 0, 0, 0, format_fn, context);
2941
0
}
2942
2943
static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
2944
2.29k
{
2945
2.29k
  int i;
2946
2.29k
  const int parameter_size = compiled_data->parameters->rnum;
2947
2.29k
  ccv_array_free(compiled_data->parameters);
2948
2.29k
  if (compiled_data->parameter_flags)
2949
8
    ccfree(compiled_data->parameter_flags);
2950
2.29k
  const int internal_size = compiled_data->internals->rnum;
2951
2.29k
  ccv_array_free(compiled_data->internals);
2952
2.29k
  assert(compiled_data->ids.parameters->rnum == parameter_size);
2953
2.29k
  assert(compiled_data->ids.internals->rnum == internal_size);
2954
5.24k
  
for (i = 0; 2.29k
i < parameter_size;
i++2.94k
)
2955
2.94k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i));
2956
2.29k
  ccv_array_free(compiled_data->ids.parameters);
2957
2.45k
  for (i = 0; i < internal_size; 
i++161
)
2958
161
    ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i));
2959
2.29k
  ccv_array_free(compiled_data->ids.internals);
2960
2.29k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2961
2.29k
  if (compiled_data->tensors.parameters)
2962
84
  {
2963
781
    for (i = 0; i < parameter_size * parallel_count; 
i++697
)
2964
      // If it is not marked as not belonging, we can free it.
2965
697
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
2966
693
        if (compiled_data->tensors.parameters[i])
2967
693
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
2968
239
    for (i = 0; i < internal_size * parallel_count; 
i++155
)
2969
155
      if (compiled_data->tensors.internals[i])
2970
155
        ccv_nnc_tensor_free(compiled_data->tensors.internals[i]);
2971
84
    ccfree(compiled_data->tensors.parameters);
2972
84
  }
2973
2.29k
  if (compiled_data->tensors.gradients)
2974
28
  {
2975
355
    for (i = 0; i < parameter_size * parallel_count; 
i++327
)
2976
327
    {
2977
327
      if (compiled_data->tensors.gradients[i])
2978
325
        ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]);
2979
327
      if (compiled_data->tensors.accum_gradients[i])
2980
15
        ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]);
2981
327
    }
2982
28
    ccfree(compiled_data->tensors.gradients);
2983
28
  }
2984
2.29k
  if (compiled_data->minimize.parameters)
2985
5
  {
2986
15
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++10
)
2987
10
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
2988
5
    ccv_array_free(compiled_data->minimize.parameters);
2989
5
  }
2990
2.29k
  if (compiled_data->rewindables)
2991
41
    ccv_array_free(compiled_data->rewindables);
2992
2.29k
  if (compiled_data->tensors_init.v)
2993
84
    ccfree(CCV_NNC_INIT_V(compiled_data->tensors_init.v));
2994
2.29k
  if (compiled_data->evaluate.tos)
2995
2.29k
    ccfree(compiled_data->evaluate.tos);
2996
2.29k
  compiled_data->evaluate.tos = 0;
2997
2.29k
  if (compiled_data->stream_map)
2998
4
  {
2999
4
    khiter_t k;
3000
36
    for (k = 
kh_begin4
(compiled_data->stream_map); k != kh_end(compiled_data->stream_map);
++k32
)
3001
32
    {
3002
32
      if (!kh_exist(compiled_data->stream_map, k))
3003
16
        continue;
3004
16
      ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k);
3005
16
      ccv_nnc_stream_context_free(stream);
3006
16
    }
3007
4
    kh_destroy(stream_map, compiled_data->stream_map);
3008
4
  }
3009
2.29k
  _ccv_cnnp_compiled_data_graph_free(compiled_data);
3010
2.29k
  _ccv_cnnp_compiled_data_gradient_free(compiled_data);
3011
2.29k
  _ccv_cnnp_compiled_data_backward_free(compiled_data);
3012
2.29k
  _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3013
2.29k
  if (compiled_data->gradient_checkpoints)
3014
2
  {
3015
4
    for (i = 0; i < compiled_data->gradient_checkpoints->rnum; 
i++2
)
3016
2
    {
3017
2
      ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(compiled_data->gradient_checkpoints, i);
3018
2
      assert(checkpoint->inputs);
3019
2
      ccfree(checkpoint->inputs);
3020
2
      ccv_array_free(checkpoint->tensor_symbols);
3021
2
    }
3022
2
    ccv_array_free(compiled_data->gradient_checkpoints);
3023
2
  }
3024
2.29k
  ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc);
3025
2.29k
  ccfree(compiled_data);
3026
2.29k
}
3027
3028
void ccv_cnnp_model_free(ccv_cnnp_model_t* const model)
3029
5.40k
{
3030
5.40k
  if (model->isa->deinit)
3031
1.37k
    model->isa->deinit(model);
3032
5.40k
  if (model->io)
3033
771
  {
3034
771
    int i;
3035
1.90k
    for (i = 0; i < model->io->rnum; 
i++1.13k
)
3036
1.13k
    {
3037
1.13k
      ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i);
3038
1.13k
      if (model_io->outgoings)
3039
634
        ccv_array_free(model_io->outgoings);
3040
1.13k
      if (model_io->incomings)
3041
579
        ccv_array_free(model_io->incomings);
3042
1.13k
      if (model_io->dependencies)
3043
2
        ccv_array_free(model_io->dependencies);
3044
1.13k
      ccfree(model_io);
3045
1.13k
    }
3046
771
    ccv_array_free(model->io);
3047
771
  }
3048
5.40k
  if (model->parameter_indices)
3049
2.52k
    ccv_array_free(model->parameter_indices);
3050
5.40k
  if (model->inputs)
3051
2.29k
    ccfree(model->inputs);
3052
5.40k
  if (model->graph)
3053
2.29k
    ccv_nnc_symbolic_graph_free(model->graph);
3054
5.40k
  if (model->compiled_data)
3055
2.29k
    _ccv_cnnp_compiled_data_free(model, model->compiled_data);
3056
5.40k
  if (model->name)
3057
198
    ccfree(model->name);
3058
5.40k
  ccfree(model);
3059
5.40k
}
3060
3061
void ccv_cnnp_model_cancel(ccv_cnnp_model_t* const model)
3062
0
{
3063
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3064
0
  if (!compiled_data)
3065
0
    return;
3066
0
  if (compiled_data->graph)
3067
0
    ccv_nnc_graph_cancel(compiled_data->graph);
3068
0
  if (compiled_data->apply_gradients.graph)
3069
0
    ccv_nnc_graph_cancel(compiled_data->apply_gradients.graph);
3070
0
}
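
Finally, a minimal sketch around ccv_cnnp_model_cancel above, assuming the caller owns the threading: because it only forwards to ccv_nnc_graph_cancel on whichever graphs exist, it can be invoked while another thread is inside an evaluate / fit run, and it is a no-op before the model is compiled.

static void request_cancel(ccv_cnnp_model_t* const model)
{
  // Safe to call at any time: returns immediately when there is no compiled
  // data, otherwise asks the run graph and the apply-gradients graph to stop.
  ccv_cnnp_model_cancel(model);
}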