Coverage Report

Created: 2025-05-05 12:03

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_model.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_cnnp_model.h"
6
#include "_ccv_nnc_graph.h"
7
8
// MARK - Level-5 API
9
10
ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size)
11
545
{
12
545
  if (!model->io)
13
536
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
14
545
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size);
15
545
  model_io->param_ref = 0;
16
545
  model_io->param_sel = 0;
17
545
  model_io->visit = 0;
18
545
  model_io->model = model;
19
545
  model_io->dependencies = 0;
20
545
  model_io->dependents = 0;
21
545
  model_io->outgoings = 0;
22
545
  model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1);
23
545
  ccv_array_push(model->io, &model_io);
24
545
  if (input_size > 0)
25
542
  {
26
542
    model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), input_size, 0);
27
542
    ccv_array_resize(model_io->incomings, input_size);
28
542
    int i;
29
542
    memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size);
30
1.22k
    for (i = 0; i < input_size; i++)
31
680
    {
32
680
      if (!inputs[i]->outgoings)
33
592
        inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
34
680
      ccv_array_push(inputs[i]->outgoings, &model_io);
35
680
    }
36
542
  } else {
37
3
    model_io->incomings = 0;
38
3
  }
39
545
  return model_io;
40
545
}
41
42
void ccv_cnnp_model_add_dependencies(ccv_cnnp_model_io_t model_io, const ccv_cnnp_model_io_t* const dependencies, const int dependency_size)
43
2
{
44
2
  assert(dependency_size > 0);
45
2
  if (!model_io->dependencies)
46
2
    model_io->dependencies = ccv_array_new(sizeof(ccv_cnnp_model_io_t), dependency_size, 0);
47
2
  int i, j;
48
5
  for (i = 0; i < dependency_size; i++)
49
3
  {
50
3
    int flag = 0;
51
    // Check whether it already exists.
52
4
    for (j = 0; !flag && j < model_io->dependencies->rnum; j++)
53
1
      if (*(ccv_cnnp_model_io_t*)ccv_array_get(model_io->dependencies, j) == dependencies[i])
54
0
        flag = 1;
55
3
    if (flag)
56
0
      continue;
57
3
    ccv_array_push(model_io->dependencies, dependencies + i);
58
3
    ++dependencies[i]->dependents;
59
3
  }
60
2
}
61
62
int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model)
63
0
{
64
0
  return model->output_size;
65
0
}
66
67
int ccv_cnnp_model_is_trainable(const ccv_cnnp_model_t* const model)
68
16
{
69
  // If the model is compiled, it defaults to trainable (1) unless explicitly set otherwise.
70
16
  if (model->compiled_data)
71
4
    return model->is_trainable >= 0 ? model->is_trainable : 1;
72
12
  return model->is_trainable;
73
16
}
74
75
ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index)
76
392
{
77
392
  if (!model->io)
78
37
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
79
392
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s));
80
392
  model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS;
81
392
  model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS;
82
392
  model_io->visit = 0;
83
392
  model_io->model = model;
84
392
  model_io->outputs = 0;
85
392
  model_io->dependencies = 0;
86
392
  model_io->dependents = 0;
87
392
  model_io->incomings = 0;
88
392
  model_io->outgoings = 0;
89
392
  ccv_array_push(model->io, &model_io);
90
392
  return model_io;
91
392
}
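
Note: param_ref and param_sel above use a small optional-index encoding: a valid index is stored as index + 1 so that 0 can mean "unset", and a negative sentinel selects everything; the consumer decodes it with - 1 (see the param_sel / param_ref handling later in this file). A minimal sketch of that encoding with hypothetical helper names (encode_index, decode_index, ALL_INDICES), not part of the library:

#include <assert.h>

#define ALL_INDICES (-1)

static int encode_index(const int index) /* A negative input selects everything. */
{
  return index >= 0 ? index + 1 : ALL_INDICES;
}

static int decode_index(const int encoded) /* Returns the 0-based index, or a negative value for "all". */
{
  assert(encoded != 0); /* 0 means "unset" and should never be decoded. */
  return encoded > 0 ? encoded - 1 : encoded;
}
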
92
93
void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context)
94
3
{
95
3
  model->notify_hook.func = func;
96
3
  model->notify_hook.context = context;
97
3
}
98
99
void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload)
100
14
{
101
14
  if (model->notify_hook.func)
102
3
    model->notify_hook.func(model, tag, payload, model->notify_hook.context);
103
14
  if (model->isa->notify)
104
1
    model->isa->notify(model, tag, payload);
105
14
}
106
107
static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size)
108
2.24k
{
109
2.24k
  int i, j;
110
4.84k
  for (i = 0; i < graph_exec_symbol_size; i++)
111
2.60k
  {
112
2.60k
    ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i;
113
    // Check whether this tensor symbol has any duplicate.
114
23.2k
    for (j = i + 1; j < graph_exec_symbol_size;)
115
20.6k
    {
116
20.6k
      ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j;
117
      // If there is a same tensor symbol, remove it.
118
20.6k
      if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph)
119
2.70k
      {
120
2.70k
        if (j + 1 < graph_exec_symbol_size)
121
436
          *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1];
122
2.70k
        --graph_exec_symbol_size;
123
2.70k
        continue;
124
2.70k
      }
125
17.9k
      ++j;
126
17.9k
    }
127
2.60k
  }
128
2.24k
  return graph_exec_symbol_size;
129
2.24k
}
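
Note: _ccv_nnc_array_dedup_graph_exec_symbols above removes duplicates in place by overwriting each duplicate with the current last element and shrinking the logical size, trading element order for O(1) extra memory. A minimal sketch of the same swap-with-last idiom on plain ints (dedup_ints is a hypothetical helper, not part of the library):

#include <stdio.h>

static int dedup_ints(int* const values, int size)
{
  int i, j;
  for (i = 0; i < size; i++)
    for (j = i + 1; j < size;)
    {
      if (values[j] == values[i])
      {
        /* Overwrite the duplicate with the last element and shrink;
         * do not advance j, the swapped-in value still needs checking. */
        if (j + 1 < size)
          values[j] = values[size - 1];
        --size;
        continue;
      }
      ++j;
    }
  return size; /* New logical size; element order is not preserved. */
}

int main(void)
{
  int values[] = { 3, 1, 3, 2, 1 };
  printf("%d unique\n", dedup_ints(values, 5)); /* Prints "3 unique". */
  return 0;
}
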
130
131
void ccv_cnnp_model_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol, const int is_trainable)
132
3.14k
{
133
3.14k
  ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context;
134
3.14k
  ccv_cnnp_model_t* const model = add_to_array_context->sequence->model;
135
3.14k
  int i;
136
3.14k
  if (add_to_array_context->add_parameter_indices && !model->parameter_indices)
137
2.52k
    model->parameter_indices = ccv_array_new(sizeof(int), 0, 0);
138
37.1k
  for (i = 0; i < add_to_array_context->symbols->rnum; i++)
139
33.9k
  {
140
33.9k
    const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i);
141
33.9k
    if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph)
142
24
    {
143
      // Only add to parameter_indices if it is trainable.
144
24
      if (add_to_array_context->add_parameter_indices)
145
15
        ccv_array_add_unique_int(model->parameter_indices, i);
146
      // Found it, return, don't add it.
147
24
      return;
148
24
    }
149
33.9k
  }
150
  // Only add to parameter_indices if it is trainable.
151
3.12k
  if (add_to_array_context->add_parameter_indices)
152
2.95k
    ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum);
153
  // This is a new one, no need to add_unique_int, it is unique.
154
3.12k
  ccv_array_push(add_to_array_context->symbols, &symbol);
155
3.12k
  if (add_to_array_context->trainables)
156
2.96k
    ccv_array_push(add_to_array_context->trainables, &is_trainable);
157
3.12k
  char id[2048];
158
3.12k
  id[0] = add_to_array_context->prefix;
159
3.12k
  id[1] = '-';
160
3.12k
  int total_len = 2;
161
6.47k
  for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++)
162
3.35k
  {
163
3.35k
    const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i);
164
3.35k
    int len;
165
3.35k
    if (name->name && name->name[0] != '\0')
166
348
      len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence);
167
3.00k
    else
168
3.00k
      len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence);
169
3.35k
    total_len += len;
170
3.35k
    if (total_len >= 2047)
171
0
      break;
172
3.35k
  }
173
3.12k
  if (total_len < 2047)
174
3.12k
    total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it);
175
3.12k
  assert(total_len < 2048);
176
3.12k
  char *heap_id = (char*)ccmalloc(total_len + 1);
177
3.12k
  memcpy(heap_id, id, total_len + 1);
178
3.12k
  ccv_array_push(add_to_array_context->ids, &heap_id);
179
3.12k
  ++add_to_array_context->sequence->it;
180
3.12k
}
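
Note: ccv_cnnp_model_add_to_array above builds each id by chaining bounded snprintf calls into a fixed 2048-byte stack buffer, tracking the accumulated length, and only then copying the result to the heap. A minimal sketch of that pattern with a hypothetical build_id helper (simplified to name segments only, not part of the library):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char* build_id(const char prefix, const char* const* const names, const int count, const int it)
{
  char id[2048];
  id[0] = prefix;
  id[1] = '-';
  int total_len = 2, i;
  for (i = 0; i < count; i++)
  {
    /* snprintf never writes past the remaining space; it returns the length it wanted. */
    total_len += snprintf(id + total_len, 2048 - total_len, "%s-", names[i]);
    if (total_len >= 2047) /* Out of room: the buffer stays NUL-terminated, stop appending. */
    {
      total_len = 2047;
      break;
    }
  }
  if (total_len < 2047)
    total_len += snprintf(id + total_len, 2048 - total_len, "%d", it);
  if (total_len > 2047)
    total_len = 2047;
  char* const heap_id = (char*)malloc(total_len + 1);
  memcpy(heap_id, id, total_len);
  heap_id[total_len] = '\0';
  return heap_id; /* Caller frees. */
}
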
181
182
static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size, ccv_array_t* const gradient_checkpoints)
183
2.29k
{
184
2.29k
  compiled_data->f = compiled_data->fits + output_size;
185
2.29k
  compiled_data->xpu_alloc.mp_hdr = -1;
186
2.29k
  compiled_data->xpu_alloc.freed = kh_init(dy_str);
187
2.29k
  compiled_data->xpu_alloc.allocd = kh_init(dy_alloc);
188
2.29k
  compiled_data->gradient_checkpoints = gradient_checkpoints;
189
2.29k
}
190
191
typedef struct {
192
  void* old_graph_exec_symbol_new_hook_context;
193
  ccv_nnc_graph_exec_symbol_new_hook_f old_graph_exec_symbol_new_hook;
194
  ccv_nnc_symbolic_graph_t* graph;
195
  ccv_cnnp_model_build_data_t* build_data;
196
} ccv_cnnp_model_set_exec_flags_context_t;
197
198
static void _ccv_cnnp_model_set_exec_flags(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
199
2.90k
{
200
2.90k
  ccv_cnnp_model_set_exec_flags_context_t* flags_context = (ccv_cnnp_model_set_exec_flags_context_t*)context;
201
2.90k
  if (flags_context->build_data->exec_flags)
202
0
    ccv_nnc_graph_exec_symbol_set_flags(flags_context->graph, symbol, flags_context->build_data->exec_flags);
203
2.90k
  if (flags_context->old_graph_exec_symbol_new_hook)
204
2.20k
    flags_context->old_graph_exec_symbol_new_hook(flags_context->old_graph_exec_symbol_new_hook_context, symbol, cmd, inputs, input_size, outputs, output_size, name);
205
2.90k
}
206
207
static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss)
208
2.29k
{
209
2.29k
  assert(model->graph);
210
2.29k
  model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size);
211
2.29k
  int i;
212
4.66k
  for (i = 0; i < input_size; i++)
213
2.36k
    model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0);
214
2.29k
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
215
2.29k
  ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0);
216
2.29k
  ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0);
217
2.29k
  ccv_cnnp_model_sequence_t model_sequence = {
218
2.29k
    .bank = kh_init(ccv_cnnp_model_name_bank)
219
2.29k
  };
220
2.29k
  ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = {
221
2.29k
    .add_parameter_indices = 1,
222
2.29k
    .prefix = 't',
223
2.29k
    .sequence = &model_sequence,
224
2.29k
    .symbols = parameters,
225
2.29k
    .ids = parameter_ids,
226
2.29k
    .trainables = parameter_trainables,
227
2.29k
  };
228
2.29k
  ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
229
2.29k
  ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0);
230
2.29k
  ccv_cnnp_model_add_to_array_context_t add_to_output_context = {
231
2.29k
    .add_parameter_indices = 0,
232
2.29k
    .prefix = 'r',
233
2.29k
    .sequence = &model_sequence,
234
2.29k
    .symbols = internals,
235
2.29k
    .ids = internal_ids,
236
2.29k
    .trainables = 0,
237
2.29k
  };
238
2.29k
  ccv_cnnp_model_build_data_t build_data = {
239
2.29k
    .exec_flags = 0,
240
2.29k
    .is_trainable = model->is_trainable >= 0 ? model->is_trainable : 1,
241
2.29k
    .model_sequence = &model_sequence,
242
2.29k
    .add_to_array = ccv_cnnp_model_add_to_array,
243
2.29k
    .parameters = parameters,
244
2.29k
    .context = {
245
2.29k
      .add_to_parameter = &add_to_parameter_context,
246
2.29k
      .add_to_output = &add_to_output_context,
247
2.29k
    },
248
2.29k
    .gradient_checkpoints = 0,
249
2.29k
  };
250
2.29k
  model->data = &build_data;
251
2.29k
  ccv_cnnp_model_set_exec_flags_context_t flags_context = {
252
2.29k
    .graph = model->graph,
253
2.29k
    .build_data = &build_data,
254
2.29k
    .old_graph_exec_symbol_new_hook = 0,
255
2.29k
    .old_graph_exec_symbol_new_hook_context = 0
256
2.29k
  };
257
2.29k
  flags_context.old_graph_exec_symbol_new_hook_context = ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_set_exec_flags, &flags_context, &flags_context.old_graph_exec_symbol_new_hook);
258
2.29k
  ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0);
259
  // Reset back to previous hook.
260
2.29k
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, flags_context.old_graph_exec_symbol_new_hook, flags_context.old_graph_exec_symbol_new_hook_context, 0);
261
4.61k
  for (i = 0; i < model->output_size; i++)
262
2.31k
  {
263
2.31k
    const ccv_nnc_tensor_symbol_t output = model->outputs[i];
264
2.31k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, output);
265
2.31k
    if (alias_to.d == CCV_NNC_NO_TENSOR_SYMBOL)
266
1.30k
      continue;
267
    // If the output is an alias, insert a data transform regardless, for result correctness (we cannot bind an alias). See the ccv_nnc_tensor_bind_symbol method:
268
    // we can correctly bind a tensor that has aliases derived from it, but we cannot correctly bind an alias tensor itself (this is expected, more or less,
269
    // because we cannot handle the case where an alias covers part of the original tensor but is bound differently).
270
1.00k
    const ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(model->graph, output);
271
1.00k
    model->outputs[i] = ccv_nnc_tensor_symbol_new(model->graph, output_params, 0);
272
1.00k
    ccv_nnc_graph_exec_symbol_t make_contiguous = ccv_nnc_graph_exec_symbol_new(model->graph, CMD_FORMAT_TRANSFORM_FORWARD(), &output, 1, model->outputs + i, 1, "contiguous");
273
1.00k
    ccv_nnc_graph_exec_symbol_set_flags(model->graph, make_contiguous, CCV_NNC_GRAPH_EXEC_DISABLE_OPT);
274
1.00k
  }
275
2.29k
  model->data = 0;
276
2.29k
  kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank);
277
2.29k
  if (model_sequence.sequences)
278
2.28k
    ccv_array_free(model_sequence.sequences);
279
  // Check if there are parameters that are not trainables. If there are, we will allocate uint64 bitmap to record that.
280
2.29k
  int not_trainables = 0;
281
  // Assert no parameter is alias.
282
5.25k
  for (i = 0; i < parameters->rnum; i++)
283
2.95k
  {
284
2.95k
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i);
285
2.95k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter);
286
2.95k
    assert(alias_to.graph == 0); // Cannot find the one alias to.
287
2.95k
    if (*(int*)ccv_array_get(parameter_trainables, i) == 0)
288
14
      not_trainables = 1;
289
2.95k
  }
290
2.29k
  assert(parameters->rnum == parameter_trainables->rnum);
291
2.29k
  uint64_t* parameter_flags = 0;
292
2.29k
  if (not_trainables)
293
10
  {
294
10
    parameter_flags = (uint64_t*)cccalloc(((parameters->rnum + 63) >> 6), sizeof(uint64_t));
295
44
    for (i = 0; i < parameter_trainables->rnum; i++)
296
34
      if (*(int*)ccv_array_get(parameter_trainables, i))
297
20
        parameter_flags[i >> 6] |= ((uint64_t)1 << (i & 63));
298
10
  }
299
2.29k
  ccv_array_free(parameter_trainables);
300
  // Assert no internal is alias.
301
2.46k
  for (i = 0; i < internals->rnum; i++)
302
161
  {
303
161
    const ccv_nnc_tensor_symbol_t internal = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i);
304
161
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(internal.graph, internal);
305
161
    assert(alias_to.graph == 0); // Cannot find the one alias to.
306
161
  }
307
2.29k
  const int output_size = model->output_size;
308
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
309
2.29k
  const int parameters_rnum = parameters->rnum;
310
2.29k
  if (input_size > 0)
311
2.29k
  {
312
2.29k
    ccv_array_resize(parameters, parameters_rnum + input_size);
313
2.29k
    memcpy(ccv_array_get(parameters, parameters_rnum), model->inputs, input_size * sizeof(ccv_nnc_tensor_symbol_t));
314
2.29k
  }
315
2.29k
  ccv_nnc_symbolic_graph_simplify(model->graph,
316
2.29k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,
317
2.29k
      CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,
318
2.29k
      CCV_NNC_SIMPLIFY_OPS_FUSION,
319
2.29k
      CCV_NNC_SIMPLIFY_GRAPH_PRUNING),
320
2.29k
    ccv_array_get(parameters, 0), parameters_rnum + input_size,
321
2.29k
    model->outputs, output_size,
322
2.29k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
323
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
324
  // Size it down.
325
2.29k
  parameters->rnum = parameters_rnum;
326
2.29k
  ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1));
327
2.29k
  _ccv_cnnp_compiled_data_init(compiled_data, output_size, build_data.gradient_checkpoints);
328
2.29k
  const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph);
329
2.29k
  assert(evaluate_to_size > 0);
330
2.29k
  compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
331
2.29k
  memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
332
2.29k
  compiled_data->loss = loss;
333
2.29k
  if (loss.cmd == CCV_NNC_NOOP)
334
2.28k
  {
335
    // If no loss function is provided, there are no fits.
336
4.59k
    for (i = 0; i < output_size; i++)
337
2.30k
    {
338
2.30k
      compiled_data->fits[i] = NO_TENSOR_SYMBOL;
339
2.30k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]);
340
2.30k
      if (alias_to.d < 0)
341
2.30k
        compiled_data->f[i] = model->outputs[i];
342
0
      else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original.
343
0
        int ofs[CCV_NNC_MAX_DIM_ALLOC];
344
0
        int inc[CCV_NNC_MAX_DIM_ALLOC];
345
0
        ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc);
346
0
        int j;
347
0
        for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
348
0
          { assert(ofs[j] == 0); } // There is no ofs.
349
0
        compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet.
350
0
      }
351
2.30k
    }
352
2.28k
  } else {
353
20
    for (i = 0; i < output_size; i++)
354
10
    {
355
10
      const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]);
356
10
      const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0);
357
10
      compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0);
358
10
      ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0);
359
10
    }
360
10
  }
361
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
362
2.29k
  ccv_nnc_symbolic_graph_simplify(model->graph,
363
2.29k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function.
364
2.29k
    0, 0, // No need to provide binds at this point.
365
2.29k
    compiled_data->f, model->output_size,
366
2.29k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
367
2.29k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
368
  // If inputs are from GPU, stream type is GPU.
369
2.29k
  compiled_data->parameters = parameters;
370
2.29k
  compiled_data->parameter_flags = parameter_flags;
371
2.29k
  compiled_data->internals = internals;
372
2.29k
  compiled_data->ids.parameters = parameter_ids;
373
2.29k
  compiled_data->ids.internals = internal_ids;
374
2.29k
  ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(compiled_data, model->graph);
375
2.29k
}
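
Note: inside the compile step above, when some parameters are frozen (is_trainable == 0), the trainable/frozen state is recorded in a packed uint64 bitmap, one bit per parameter, using the i >> 6 word index and i & 63 bit index. A minimal sketch of that bit bookkeeping with hypothetical helper names (make_trainable_bitmap, bit_is_set), not part of the library:

#include <stdint.h>
#include <stdlib.h>

static uint64_t* make_trainable_bitmap(const int* const trainables, const int count)
{
  /* (count + 63) >> 6 uint64 words are enough to hold count bits. */
  uint64_t* const flags = (uint64_t*)calloc((count + 63) >> 6, sizeof(uint64_t));
  int i;
  for (i = 0; i < count; i++)
    if (trainables[i]) /* A set bit means the parameter is trainable. */
      flags[i >> 6] |= ((uint64_t)1 << (i & 63));
  return flags; /* Caller frees. */
}

static int bit_is_set(const uint64_t* const flags, const int i)
{
  return !!(flags[i >> 6] & ((uint64_t)1 << (i & 63)));
}
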
376
377
static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
378
8.82k
{
379
8.82k
  ccv_array_t* const stack = (ccv_array_t*)context;
380
8.82k
  ccv_array_push(stack, &symbol.d);
381
8.82k
}
382
383
static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
384
38.5k
{
385
38.5k
  const ccv_nnc_tensor_symbol_t src_symbol = {
386
38.5k
    .d = src_index,
387
38.5k
    .graph = src_graph
388
38.5k
  };
389
38.5k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
390
38.5k
    .d = dest_index,
391
38.5k
    .graph = dest_graph
392
38.5k
  };
393
38.5k
  const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
394
38.5k
  ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params);
395
38.5k
  int ofs[CCV_NNC_MAX_DIM_ALLOC];
396
38.5k
  int inc[CCV_NNC_MAX_DIM_ALLOC];
397
38.5k
  if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc))
398
2.00k
    ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc);
399
38.5k
}
400
401
static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
402
2.41k
{
403
2.41k
  const ccv_nnc_tensor_symbol_t src_symbol = {
404
2.41k
    .d = src_index,
405
2.41k
    .graph = src_graph
406
2.41k
  };
407
2.41k
  const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
408
2.41k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
409
2.41k
    .d = dest_index,
410
2.41k
    .graph = dest_graph
411
2.41k
  };
412
2.41k
  const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol);
413
2.41k
  return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0;
414
2.41k
}
415
416
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size);
417
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data);
418
419
typedef struct {
420
  int parallel_count;
421
  ccv_nnc_symbolic_graph_t* graph;
422
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
423
} ccv_nnc_graph_exec_update_t;
424
425
static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint)
426
58
{
427
58
  ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context;
428
58
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena;
429
58
  ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
430
58
  ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd);
431
58
  ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint);
432
58
  const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph;
433
58
  const int parallel_count = graph_exec_update->parallel_count;
434
58
  int i;
435
178
  for (i = 1; i < parallel_count; i++)
436
120
  {
437
120
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i));
438
120
    if (!CCV_NO_GRAPH_EXEC(copy))
439
120
    {
440
120
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
441
120
      ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint);
442
120
    }
443
120
  }
444
58
}
445
446
void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size)
447
2.20k
{
448
2.20k
  assert(model->graph);
449
2.20k
  assert(model->compiled_data);
450
2.20k
  assert(!init->graph);
451
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
452
2.20k
  init->graph = ccv_nnc_symbolic_graph_new();
453
2.20k
  ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0);
454
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack, 0);
455
2.20k
  _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss);
456
2.20k
  init->parallel_count = model->parallel_count;
457
2.20k
  init->memory_compression = model->memory_compression;
458
2.20k
  init->memory_reduction = model->memory_reduction;
459
2.20k
  init->gradient_checkpointing = model->gradient_checkpointing;
460
2.20k
  init->compiled_data->stream_type = model->compiled_data->stream_type;
461
2.20k
  init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer;
462
2.20k
  init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size;
463
2.20k
  if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
464
2.20k
    _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0);
465
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0, 0);
466
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL);
467
2.20k
  int i, j;
468
  // Verify parameters, internals and saved_aux in both graphs have the same dimensionality.
469
4.61k
  for (i = 0; i < compiled_data->parameters->rnum; i++)
470
2.41k
  {
471
2.41k
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
472
2.41k
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
473
2.41k
  }
474
2.20k
  for (i = 0; i < compiled_data->internals->rnum; i++)
475
0
  {
476
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
477
0
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
478
0
  }
479
  // Update inputs.
480
2.20k
  assert(model->input_size == init->input_size);
481
4.40k
  for (i = 0; i < model->input_size; i++)
482
2.20k
    if (model->inputs[i].d >= 0)
483
2.20k
    {
484
2.20k
      assert(init->inputs[i].d >= 0);
485
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d);
486
2.20k
    }
487
  // Update outputs.
488
2.20k
  assert(model->output_size == init->output_size);
489
4.40k
  for (i = 0; i < model->output_size; i++)
490
2.20k
  {
491
2.20k
    if (model->outputs[i].d >= 0)
492
2.20k
    {
493
2.20k
      assert(init->outputs[i].d >= 0);
494
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d);
495
2.20k
    }
496
2.20k
    if (model->outputs[i].d != model->compiled_data->f[i].d)
497
0
    {
498
0
      assert(init->outputs[i].d != init->compiled_data->f[i].d);
499
0
      if (model->compiled_data->f[i].d >= 0)
500
0
      {
501
0
        assert(init->compiled_data->f[i].d >= 0);
502
0
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d);
503
0
      }
504
0
    }
505
2.20k
  }
506
  // Go through the graph to set tensor on matching symbols
507
11.0k
  for (i = 0; i < stack->rnum; i++)
508
8.82k
  {
509
8.82k
    const int d = *(int*)ccv_array_get(stack, i);
510
    // If exceed range, skip.
511
8.82k
    if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) ||
512
8.82k
      d >= ccv_nnc_graph_exec_symbol_count(model->graph))
513
0
      continue;
514
8.82k
    const ccv_nnc_graph_exec_symbol_t src_symbol = {
515
8.82k
      .d = d,
516
8.82k
      .graph = init->graph
517
8.82k
    };
518
8.82k
    const ccv_nnc_graph_exec_symbol_t dest_symbol = {
519
8.82k
      .d = d,
520
8.82k
      .graph = model->graph
521
8.82k
    };
522
8.82k
    const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol);
523
8.82k
    const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol);
524
    // If the command doesn't match, skip.
525
8.82k
    if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP)
526
0
      continue;
527
    // Now get all the inputs and outputs, if matches, set them.
528
8.82k
    const int* src_inputs;
529
8.82k
    int src_input_size;
530
8.82k
    const int* src_outputs;
531
8.82k
    int src_output_size;
532
8.82k
    ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size);
533
8.82k
    const int* dest_inputs;
534
8.82k
    int dest_input_size;
535
8.82k
    const int* dest_outputs;
536
8.82k
    int dest_output_size;
537
8.82k
    ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size);
538
    // We may have unmatched input / output size because this is the minimizer and it has
539
    // different saved_aux (for example, when we shrunk with CMD_NOOP).
540
8.82k
    if (src_input_size != dest_input_size)
541
0
      continue;
542
8.82k
    if (src_output_size != dest_output_size)
543
0
      continue;
544
8.82k
    ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd);
545
    // There may be mismatches between the source tensor symbols and destination tensor symbols. The reason is that
546
    // the minimizer may be passed in later, so tensors for the minimizer may be allocated later in the original
547
    // graph, whereas in the newly created graph it is streamlined (the minimizer exists from the beginning). That
548
    // makes the order of tensor symbol creation different, so exactly which tensor is which gets mixed up as
549
    // well. However, setting a new minimizer won't change the exec symbol ordering, because we never create new exec
550
    // symbols after the gradient init step. Changing the minimizer just updates that exec symbol's settings; it is not
551
    // a new exec symbol.
552
33.7k
    for (j = 0; j < src_input_size; j++)
553
24.8k
      if (src_inputs[j] >= 0)
554
20.4k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]);
555
22.4k
    for (j = 0; j < src_output_size; j++)
556
13.6k
      if (src_outputs[j] >= 0)
557
13.6k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]);
558
8.82k
  }
559
2.20k
  ccv_array_free(stack);
560
  // After this, we get all tensors in the model graph resolved through tensor_auto.
561
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL);
562
  // Verify the symbols we get match.
563
2.20k
  const int parameter_size = compiled_data->parameters->rnum;
564
4.61k
  for (i = 0; i < parameter_size; i++)
565
2.41k
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); }
566
2.20k
  const int internal_size = compiled_data->internals->rnum;
567
2.20k
  for (i = 0; i < internal_size; i++)
568
0
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); }
569
  // Go through compiled data.
570
2.20k
  if (compiled_data->tensor_arena)
571
2.20k
  {
572
2.20k
    const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph);
573
2.20k
    if (flag == 0 && compiled_data->graph_exec_arena)
574
2.20k
    {
575
2.20k
      ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph);
576
      // Since we will reinit, if we previously set is_test, we need to set it again.
577
2.20k
      if (compiled_data->is_test)
578
1
      {
579
1
        const int parallel_count = ccv_max(model->parallel_count, 1);
580
1
        ccv_nnc_graph_exec_update_t update = {
581
1
          .parallel_count = parallel_count,
582
1
          .graph = model->graph,
583
1
          .graph_exec_arena = compiled_data->graph_exec_arena,
584
1
        };
585
1
        ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
586
1
      }
587
2.20k
    } else
588
      // Free-up tensor arena & graph exec arena.
589
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
590
2.20k
  }
591
  // There are other compiled graphs, for accum and apply gradients.
592
  // However, the main conclusion is, these absorb operations shouldn't impact parameters.
593
  // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we
594
  // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot
595
  // be changed, otherwise the parameters' shape would be meaningless. The same goes for internals.
596
  // That is why we don't update these compiled graphs at all at this point.
597
  // Free the model, we've already "absorbed" it.
598
2.20k
  ccv_cnnp_model_free(init);
599
2.20k
}
600
601
void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss)
602
2.29k
{
603
2.29k
  assert(input_size == model->input_size || model->input_size == 0);
604
2.29k
  if (model->input_size == 0)
605
10
    model->input_size = input_size;
606
2.29k
  if (!model->graph) // The graph is not compiled yet.
607
93
  {
608
93
    model->graph = ccv_nnc_symbolic_graph_new();
609
93
    _ccv_cnnp_model_compile(model, inputs, input_size, loss);
610
93
    assert(model->compiled_data);
611
93
    int i, flag = 0;
612
234
    for (i = 0; !flag && i < input_size; i++)
613
141
      flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY);
614
    // If inputs are from GPU, stream type is GPU.
615
93
    model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
616
93
    model->compiled_data->minimize.minimizer = minimizer;
617
93
    model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
618
2.20k
  } else {
619
    // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model.
620
    // And then absorb the "new model" into the old one.
621
2.20k
    ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model, model->is_trainable);
622
2.20k
    ccv_cnnp_model_absorb(model, init, inputs, input_size);
623
    // Reset minimizer.
624
2.20k
    ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0);
625
2.20k
  }
626
2.29k
}
627
628
ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model, const int is_trainable)
629
2.20k
{
630
2.20k
  ccv_cnnp_model_t* const new_model = _ccv_cnnp_model_copy(model, 0);
631
2.20k
  new_model->is_trainable = is_trainable;
632
2.20k
  return new_model;
633
2.20k
}
634
635
void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size)
636
4.44k
{
637
4.44k
  assert(model->graph);
638
4.44k
  assert(output_size == model->output_size);
639
4.44k
  ccv_nnc_symbolic_graph_t* const graph = model->graph;
640
4.44k
  ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL);
641
4.44k
  int i;
642
8.89k
  for (i = 0; i < output_size; i++)
643
4.44k
  {
644
4.44k
    assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL);
645
4.44k
    outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]);
646
4.44k
  }
647
4.44k
}
648
649
void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size)
650
3
{
651
3
  if (workspace_size == model->workspace_size)
652
0
    return;
653
3
  model->workspace_size = workspace_size;
654
3
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
655
3
  if (compiled_data && compiled_data->graph)
656
0
    ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL);
657
3
}
658
659
size_t ccv_cnnp_model_workspace_size(ccv_cnnp_model_t* const model)
660
0
{
661
0
  return model->workspace_size;
662
0
}
663
664
void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel)
665
15
{
666
15
  if (parallel == 0)
667
0
    model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
668
15
  else
669
15
    model->parallel_count = parallel;
670
15
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
671
15
  if (compiled_data)
672
11
    { assert(!compiled_data->graph); }
673
15
}
674
675
void ccv_cnnp_model_set_max_concurrency(ccv_cnnp_model_t* const model, const int max_stream_count)
676
0
{
677
0
  model->max_stream_count = max_stream_count;
678
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
679
0
  if (compiled_data)
680
0
    { assert(!compiled_data->graph); }
681
0
}
682
683
void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression)
684
0
{
685
0
  model->memory_compression = memory_compression;
686
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
687
0
  if (compiled_data)
688
0
    { assert(!compiled_data->graph); }
689
0
}
690
691
void ccv_cnnp_model_set_memory_reduction(ccv_cnnp_model_t* const model, const int memory_reduction)
692
0
{
693
0
  model->memory_reduction = memory_reduction;
694
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
695
0
  if (compiled_data)
696
0
    { assert(!compiled_data->graph); }
697
0
}
698
699
void ccv_cnnp_model_set_gradient_checkpointing(ccv_cnnp_model_t* const model, const int gradient_checkpointing)
700
2
{
701
2
  model->gradient_checkpointing = gradient_checkpointing;
702
2
}
703
704
int ccv_cnnp_model_gradient_checkpointing(ccv_cnnp_model_t* const model)
705
0
{
706
0
  return model->gradient_checkpointing;
707
0
}
708
709
typedef struct {
710
  int parallel_count;
711
  ccv_nnc_symbolic_graph_t* graph;
712
  ccv_cnnp_compiled_data_t* compiled_data;
713
  ccv_nnc_tensor_arena_t* tensor_arena;
714
} ccv_nnc_tensor_init_states_t;
715
716
static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data)
717
95
{
718
95
  int i;
719
95
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
720
174
  for (i = 0; i < compiled_data->parameters->rnum; i++)
721
115
  {
722
115
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
723
115
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
724
36
      return 1;
725
115
  }
726
59
  for (i = 0; i < compiled_data->internals->rnum; i++)
727
5
  {
728
5
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
729
5
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
730
5
      return 1;
731
5
  }
732
54
  return 0;
733
59
}
734
735
static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol)
736
329
{
737
329
  ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context;
738
329
  ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena;
739
329
  ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
740
329
  if (!output_tensor)
741
0
    return;
742
329
  const int d = output_symbol.d;
743
329
  assert(d < tensor_init_states->compiled_data->tensors_init.size);
744
329
  uint32_t* const init_v = CCV_NNC_INIT_V(tensor_init_states->compiled_data->tensors_init.v);
745
329
  if (init_v[d >> 5] & (1u << (d & 0x1f)))
746
29
    return;
747
300
  init_v[d >> 5] |= (1u << (d & 0x1f));
748
300
  ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0);
749
300
  const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph;
750
300
  const int parallel_count = tensor_init_states->parallel_count;
751
300
  int i;
752
780
  for (i = 1; i < parallel_count; i++)
753
480
  {
754
480
    ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i));
755
480
    if (copy)
756
480
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, &copy, 1, 0);
757
480
  }
758
300
}
759
760
// This method can only handle cases where we added new tensors and execs, never deleted. This invariant holds because
761
// we set up everything (including calling the simplify method) in the ccv_cnnp_model_compile method, before this rewind setup.
762
static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model)
763
2
{
764
2
  assert(model->graph);
765
2
  assert(model->compiled_data);
766
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
767
2
  assert(compiled_data->rewindables);
768
2
  int i;
769
51
  for (i = 0; i < compiled_data->rewindables->rnum; i++)
770
49
  {
771
49
    const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i);
772
49
    if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC)
773
16
      ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec);
774
33
    else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR)
775
33
      ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor);
776
49
  }
777
2
  ccv_array_clear(compiled_data->rewindables);
778
2
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
779
2
}
780
781
static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name)
782
6.09k
{
783
6.09k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
784
6.09k
    .type = CCV_CNNP_REWIND_TENSOR,
785
6.09k
    .tensor = symbol
786
6.09k
  };
787
6.09k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
788
6.09k
  ccv_array_push(rewind_symbols, &rewind_symbol);
789
6.09k
}
790
791
static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name)
792
475
{
793
475
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
794
475
    .type = CCV_CNNP_REWIND_TENSOR,
795
475
    .tensor = symbol
796
475
  };
797
475
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
798
475
  ccv_array_push(rewind_symbols, &rewind_symbol);
799
475
}
800
801
static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
802
2.32k
{
803
2.32k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
804
2.32k
    .type = CCV_CNNP_REWIND_GRAPH_EXEC,
805
2.32k
    .graph_exec = symbol
806
2.32k
  };
807
2.32k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
808
2.32k
  ccv_array_push(rewind_symbols, &rewind_symbol);
809
2.32k
}
810
811
static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph)
812
35.0k
{
813
35.0k
  ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
814
35.0k
  if (!CCV_NO_GRAPH_EXEC(update_exec))
815
19.9k
    ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd);
816
35.0k
  int i;
817
49.9k
  for (i = 1; i < parallel_count; i++)
818
14.8k
  {
819
14.8k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
820
14.8k
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol);
821
14.8k
    if (!CCV_NO_GRAPH_EXEC(copy))
822
14.6k
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
823
14.8k
  }
824
35.0k
}
825
826
static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd)
827
20.0k
{
828
20.0k
  assert(compiled_data);
829
20.0k
  assert(symbolic_graph);
830
20.0k
  ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd);
831
20.0k
  int i;
832
35.0k
  for (i = 1; i < parallel_count; i++)
833
14.9k
  {
834
14.9k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
835
14.9k
    if (copy_symbol.graph)
836
14.8k
      ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd);
837
14.9k
  }
838
20.0k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena;
839
20.0k
  if (graph_exec_arena)
840
20.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
841
  // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph)
842
20.0k
  ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena;
843
20.0k
  if (gradient_graph_exec_arena)
844
15.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
845
20.0k
}
846
847
static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice)
848
20.0k
{
849
20.0k
  int this_parameter_flag = 0;
850
20.0k
  if (update_nodes[parameter_indice].d == CCV_NNC_NO_TENSOR_SYMBOL)
851
0
    return this_parameter_flag;
852
20.0k
  const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]);
853
20.0k
  int j, k;
854
  // For no-op, we can preserve previous saved_aux_size.
855
20.0k
  if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP)
856
67
  {
857
    // If the old minimizer is a noop, then the old_saved_aux_size should be whatever its previous
858
    // saved_aux_size is, otherwise we will reinit the saved_aux repeatedly if you switch between
859
    // noop and a minimizer. We don't want that because we do that in high-level frameworks to
860
    // make sure some model parameters don't update if we don't want them to.
861
67
    int old_saved_aux_size;
862
67
    if (old_minimizer.cmd == CCV_NNC_NOOP)
863
67
    {
864
67
      int input_size;
865
67
      ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0);
866
67
      if (input_size < 2) // This is not legit.
867
0
        old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
868
67
      else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters.
869
67
        old_saved_aux_size = input_size - 2;
870
67
    } else
871
0
      old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
872
67
    if (old_saved_aux_size != saved_aux_size)
873
65
    {
874
65
      this_parameter_flag = 1;
875
65
      if (saved_aux_size > old_saved_aux_size)
876
65
      {
877
        // Allocate new tensor symbols.
878
65
        const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]);
879
189
        for (j = old_saved_aux_size; j < saved_aux_size; j++)
880
124
        {
881
124
          saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0);
882
124
          saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0);
883
124
          const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
884
460
          for (k = 1; k < parallel_count; k++)
885
336
          {
886
336
            ccv_nnc_tensor_param_t dev_info = info;
887
336
            if (k != device_id)
888
336
              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k);
889
0
            else
890
0
              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, 0);
891
336
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
892
336
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
893
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy);
894
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy);
895
336
          }
896
124
        }
897
65
      } else {
898
0
        for (j = saved_aux_size; j < old_saved_aux_size; j++)
899
0
        {
900
0
          for (k = 1; k < parallel_count; k++)
901
0
          {
902
0
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
903
0
            if (src_copy.d >= 0)
904
0
            {
905
0
              ccv_nnc_tensor_symbol_free(graph, src_copy);
906
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL);
907
0
            }
908
0
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
909
0
            if (dest_copy.d >= 0)
910
0
            {
911
0
              ccv_nnc_tensor_symbol_free(graph, dest_copy);
912
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL);
913
0
            }
914
0
          }
915
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source);
916
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination);
917
0
          saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
918
0
        }
919
0
      }
920
65
    }
921
67
  }
922
20.0k
  _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer);
923
20.0k
  if (this_parameter_flag)
924
65
  {
925
65
    ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2];
926
65
    ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1];
927
65
    const int* inputs = 0;
928
65
    int input_size = 0;
929
65
    ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0);
930
65
    assert(input_size >= 1);
931
65
    update_inputs[0].d = inputs[0];
932
65
    update_inputs[0].graph = graph;
933
65
    update_inputs[1].d = inputs[1];
934
65
    update_inputs[1].graph = graph;
935
65
    update_outputs[0] = updated_parameters[parameter_indice];
936
189
    for (j = 0; j < saved_aux_size; j++)
937
124
    {
938
124
      update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source;
939
124
      update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination;
940
124
    }
941
65
    ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
942
233
    for (k = 1; k < parallel_count; k++)
943
168
    {
944
168
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k);
945
168
      assert(copy.d >= 0);
946
168
      ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0);
947
168
      assert(input_size >= 1);
948
168
      update_inputs[0].d = inputs[0];
949
168
      update_inputs[0].graph = graph;
950
168
      update_inputs[1].d = inputs[1];
951
168
      update_inputs[1].graph = graph;
952
168
      update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k);
953
504
      for (j = 0; j < saved_aux_size; j++)
954
336
      {
955
336
        update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
956
336
        update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
957
336
      }
958
168
      ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
959
168
    }
960
65
  }
961
20.0k
  return this_parameter_flag;
962
20.0k
}
963
964
typedef struct {
965
  int parameter_size;
966
  ccv_nnc_cmd_t minimizer;
967
  ccv_cnnp_model_io_t parameters[1];
968
} ccv_cnnp_set_minimizer_for_parameter_t;
969
970
static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model)
971
296
{
972
296
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
973
296
  assert(compiled_data);
974
296
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
975
  // We update all parameters, at this point, we have one minimizer.
976
296
  const int parameter_size = compiled_data->parameters->rnum;
977
296
  ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes;
978
296
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
979
296
  assert(symbolic_graph);
980
296
  const int parallel_count = ccv_max(model->parallel_count, 1);
981
296
  ccv_array_t* const parameters = compiled_data->minimize.parameters;
982
296
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
983
296
  int i, j, flag = 0;
984
301
  for (i = 0; i < parameters->rnum; i++)
985
5
  {
986
5
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i);
987
10
    for (j = 0; j < set_minimizer_for_parameter->parameter_size; j++)
988
5
    {
989
5
      const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? set_minimizer_for_parameter->parameters[j]->param_sel - 1 : set_minimizer_for_parameter->parameters[j]->param_sel;
990
5
      assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0);
991
5
      const int old_rnum = parameter_indices->rnum;
992
5
      ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices);
993
5
      const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? set_minimizer_for_parameter->parameters[j]->param_ref - 1 : set_minimizer_for_parameter->parameters[j]->param_ref;
994
5
      assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0);
995
5
      if (param_ref >= 0)
996
0
      {
997
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
998
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
999
0
        parameter_indices->rnum = old_rnum + 1;
1000
0
      }
1001
5
    }
1002
5
    const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer);
1003
    // We may have duplicated indices, but that is OK: we will simply set the minimizer twice.
1004
58
    for (j = 0; j < parameter_indices->rnum; 
j++53
)
1005
53
    {
1006
53
      const int d = *(int*)ccv_array_get(parameter_indices, j);
1007
53
      assert(d <= parameter_size);
1008
53
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d))
1009
0
        flag = 1;
1010
53
    }
1011
5
    ccv_array_clear(parameter_indices);
1012
5
  }
1013
296
  ccv_array_free(parameter_indices);
1014
296
  return flag;
1015
296
}
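Note on the decode above: param_sel and param_ref follow a shift-by-one convention, where 0 is never a valid stored value (hence the asserts), positive values mean index + 1, and a negative value selects everything. A minimal standalone sketch of that convention, with ALL_PARAMETERS assumed to be -1 here purely for illustration:

#include <assert.h>

#define ALL_PARAMETERS (-1) /* assumed sentinel for this sketch */

/* Store an index shifted by +1 so that 0 can never appear. */
static int encode_param(const int index)
{
  return index >= 0 ? index + 1 : ALL_PARAMETERS;
}

/* Reverse of the shift, mirroring the `> 0 ? x - 1 : x` decode above. */
static int decode_param(const int stored)
{
  assert(stored != 0);
  return stored > 0 ? stored - 1 : stored;
}

int main(void)
{
  assert(decode_param(encode_param(3)) == 3);
  assert(decode_param(encode_param(-1)) == ALL_PARAMETERS);
  return 0;
}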
1016
1017
static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size)
1018
2.24k
{
1019
2.24k
  if (new_saved_aux_size == old_saved_aux_size)
1020
2.24k
    return;
1021
2.24k
  assert
(new_saved_aux_size > old_saved_aux_size)7
;
1022
7
  int i, j;
1023
72
  for (i = parameter_size - 1; i >= 0; 
i--65
)
1024
65
  {
1025
189
    for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; 
j--124
)
1026
124
      saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
1027
65
    for (j = old_saved_aux_size - 1; j >= 0; 
j--0
)
1028
0
      saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j];
1029
65
  }
1030
7
}
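A standalone sketch of the scatter above, with the ccv_nnc_tensor_symbol_map_t entries replaced by plain ints for illustration: entries packed with stride old_saved_aux_size are redistributed in place to stride new_saved_aux_size, iterating backwards so nothing is overwritten, and the newly exposed aux slots are cleared (NO_SYM stands in for NO_TENSOR_SYMBOL):

#include <assert.h>
#include <stdio.h>

#define NO_SYM (-1)

/* Same index arithmetic as _ccv_cnnp_scatter_saved_aux, on plain ints. */
static void scatter(int* const a, const int n, const int old_size, const int new_size)
{
  if (new_size == old_size)
    return;
  assert(new_size > old_size);
  int i, j;
  for (i = n - 1; i >= 0; i--)
  {
    for (j = new_size - 1; j >= old_size; j--)
      a[i * new_size + j] = NO_SYM; /* new aux slots start empty */
    for (j = old_size - 1; j >= 0; j--)
      a[i * new_size + j] = a[i * old_size + j]; /* move packed entries */
  }
}

int main(void)
{
  /* 3 parameters with 1 saved aux each, growing to stride 2. */
  int i, a[6] = { 10, 20, 30, 0, 0, 0 };
  scatter(a, 3, 1, 2);
  /* Expect: 10 -1 20 -1 30 -1 */
  for (i = 0; i < 6; i++)
    printf("%d ", a[i]);
  printf("\n");
  return 0;
}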
1031
1032
static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model)
1033
41
{
1034
41
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1035
41
  assert(compiled_data);
1036
41
  if (!compiled_data->rewindables)
1037
41
    compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0);
1038
41
  ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables, 0);
1039
41
  ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables, 0);
1040
41
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables, 0);
1041
41
}
1042
1043
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size)
1044
2.24k
{
1045
2.24k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1046
2.24k
  assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1047
2.24k
  assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1048
2.24k
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1049
2.24k
  assert(evaluate_to_size > 0);
1050
2.24k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1051
2.24k
  compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1052
2.24k
  compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count);
1053
2.24k
  int i, j;
1054
2.24k
  const int output_size = model->output_size;
1055
2.24k
  assert(!fits || fit_size == output_size * parallel_count);
1056
2.24k
  if (fits)
1057
12
    
for (i = 0; 6
i < output_size;
i++6
)
1058
6
      ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info);
1059
2.24k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
1060
2.24k
  const int parameter_size = compiled_data->parameters->rnum;
1061
2.24k
  compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size);
1062
2.24k
  compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
1063
2.24k
  compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
1064
2.24k
  int parameter_size_maybe_more = parameter_size;
1065
2.24k
  compiled_data->disable_outgrad = disable_outgrad;
1066
2.24k
  int outgrad_size;
1067
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1068
9
    outgrad_size = 0;
1069
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1070
2.22k
    outgrad_size = model->input_size;
1071
3
  else {
1072
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
1073
3
    outgrad_size = 0;
1074
10
    for (i = 0; i < model->input_size; 
i++7
)
1075
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1076
3
        ++outgrad_size;
1077
3
  }
1078
2.24k
  compiled_data->outgrad_size = outgrad_size;
1079
2.24k
  parameter_size_maybe_more += outgrad_size;
1080
2.24k
  compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count);
1081
2.24k
  compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? 
compiled_data->gradients + parameter_size2.23k
:
09
;
1082
2.24k
  compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more);
1083
2.24k
  compiled_data->backward.to_size = parameter_size_maybe_more;
1084
2.24k
  ccv_nnc_tensor_symbol_t* parameters = (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0);
1085
2.24k
  if (compiled_data->parameter_flags)
1086
4
  {
1087
4
    parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size);
1088
25
    for (i = 0; i < parameter_size; 
i++21
)
1089
21
      if (compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))
1090
14
        parameters[i] = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1091
7
      else
1092
7
        parameters[i] = NO_TENSOR_SYMBOL;
1093
4
  }
1094
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1095
9
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1096
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1097
2.22k
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1098
3
  else { // Compute minimize with gradients including selected inputs.
1099
3
    assert(model->input_size > 0);
1100
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
1101
3
    assert(outgrad_size > 0);
1102
3
    ccv_nnc_tensor_symbol_t outgrads[outgrad_size];
1103
3
    j = 0;
1104
10
    for (i = 0; i < model->input_size; 
i++7
)
1105
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1106
3
        outgrads[j++] = model->inputs[i];
1107
3
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1108
3
  }
1109
2.24k
  if (compiled_data->parameter_flags)
1110
4
    ccfree(parameters);
1111
2.24k
  _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size);
1112
2.24k
  if (compiled_data->minimize.parameters)
1113
5
    _ccv_cnnp_apply_parameters_with_minimizer(model);
1114
  // Go through gradient checkpoints to generate the tensor inputs needed by the backward pass right before it executes.
1115
2.24k
  ccv_cnnp_model_apply_gradient_checkpoints(compiled_data, model->graph);
1116
4.48k
  for (i = 0; i < output_size; 
i++2.24k
)
1117
2.24k
  {
1118
2.24k
    const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1119
    // Init this to 1 so we can backprop.
1120
2.24k
    ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES);
1121
2.24k
  }
1122
2.24k
  compiled_data->backward.to_size = 0;
1123
7.14k
  for (i = 0; i < parameter_size_maybe_more; 
i++4.90k
)
1124
4.90k
    if (compiled_data->gradients[i].d != CCV_NNC_NO_TENSOR_SYMBOL)
1125
4.90k
      compiled_data->backward.tos[compiled_data->backward.to_size++] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]);
1126
2.24k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS);
1127
2.24k
  ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size);
1128
4.48k
  for (i = 0; i < parameter_size_maybe_more - parameter_size; 
i++2.24k
)
1129
2.24k
  {
1130
2.24k
    if (compiled_data->outgrads[i].d < 0) // When we go through the inputs, we might find zero-length ones, and for these we cannot have any outgrads.
1131
0
      continue;
1132
2.24k
    const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]);
1133
2.24k
    const int* tos;
1134
2.24k
    int to_size;
1135
2.24k
    ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size);
1136
2.24k
    if (to_size == 0) // If this is the end (no minimizers afterwards), we need to attach this as a destination; otherwise it is already covered by update_nodes.
1137
9
    {
1138
9
      const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph);
1139
9
      const int destination_count = ccv_nnc_symbolic_graph_destination_size(model->graph);
1140
9
      int flag = 0;
1141
9
      const int outgrad_destination_start = ccv_max(0, destination_count - i);
1142
11
      for (j = i - 1; !flag && 
j >= 09
;
j--2
)
1143
2
        if (j + outgrad_destination_start < destination_count)
1144
2
          flag = (destinations[j + outgrad_destination_start].d == outgrad.d);
1145
9
      if (!flag) // Only add it if we cannot find it.
1146
7
        ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad);
1147
9
    }
1148
2.24k
  }
1149
2.24k
  if (parallel_count > 1)
1150
8
  {
1151
8
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1152
8
      0, 0,
1153
8
      compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */,
1154
8
      compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */,
1155
8
      0, 0, 0,
1156
8
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1157
8
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1158
8
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1159
16
    for (i = 0; i < evaluate_to_size; 
i++8
)
1160
32
      
for (j = 1; 8
j < parallel_count;
j++24
)
1161
24
      {
1162
24
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1163
24
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1164
24
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1165
24
      }
1166
8
    const int backward_to_size = compiled_data->backward.to_size;
1167
146
    for (i = 0; i < backward_to_size; 
i++138
)
1168
552
      
for (j = 1; 138
j < parallel_count;
j++414
)
1169
414
      {
1170
414
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j);
1171
414
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1172
414
          compiled_data->backward.tos[compiled_data->backward.to_size++] = copy;
1173
414
      }
1174
8
  }
1175
  // Only use memory compression or memory reduction if we are in a parameter-gradient mode.
1176
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS2.23k
)
1177
2.24k
  {
1178
2.24k
    if (model->memory_compression)
1179
0
      ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1180
2.24k
    if (model->memory_reduction)
1181
0
      ccv_nnc_symbolic_graph_memory_reduction(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1182
2.24k
  }
1183
2.24k
  compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size);
1184
2.24k
  compiled_data->gradient_mode = gradient_mode;
1185
2.24k
}
1186
1187
void ccv_cnnp_model_tensors_init_0(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1188
90
{
1189
90
  assert(!compiled_data->tensors.parameters);
1190
90
  const int parameter_size = compiled_data->parameters->rnum;
1191
90
  const int parallel_count = ccv_max(model->parallel_count, 1);
1192
90
  const int internal_size = compiled_data->internals->rnum;
1193
90
  compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph);
1194
90
  compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t));
1195
90
  compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)cccalloc((parameter_size + internal_size) * parallel_count, sizeof(ccv_nnc_tensor_t*));
1196
90
  compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count;
1197
90
}
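The allocation above keeps parameters and internals in one flat block of tensor pointers, with device j's copy of entry i living at index i + j * count. A plain-int sketch of that layout; the sizes and values are illustrative only:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  const int parameter_size = 3, internal_size = 2, parallel_count = 2;
  /* One flat block: [parameters x parallel_count][internals x parallel_count]. */
  int* const parameters = calloc((parameter_size + internal_size) * parallel_count, sizeof(int));
  int* const internals = parameters + parameter_size * parallel_count;
  int i, j;
  for (i = 0; i < parameter_size; i++)
    for (j = 0; j < parallel_count; j++)
      parameters[i + j * parameter_size] = 100 * j + i; /* device j's copy of parameter i */
  for (i = 0; i < internal_size; i++)
    for (j = 0; j < parallel_count; j++)
      internals[i + j * internal_size] = 100 * j + 50 + i; /* device j's copy of internal i */
  printf("parameter 1 on device 1: %d\n", parameters[1 + 1 * parameter_size]); /* 101 */
  printf("internal 0 on device 1: %d\n", internals[0 + 1 * internal_size]); /* 150 */
  free(parameters);
  return 0;
}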
1198
1199
int ccv_cnnp_model_tensors_any_to_alloc(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1200
3
{
1201
3
  int i, j;
1202
3
  const int parameter_size = compiled_data->parameters->rnum;
1203
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
1204
3
  const int internal_size = compiled_data->internals->rnum;
1205
19
  for (i = 0; i < parameter_size; 
i++16
)
1206
16
  {
1207
    // Parameters have to be allocated all together.
1208
16
    if (compiled_data->tensors.parameters[i])
1209
16
    {
1210
16
      for (j = 1; j < parallel_count; 
j++0
)
1211
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1212
16
      continue;
1213
16
    }
1214
0
    return 1;
1215
16
  }
1216
3
  for (i = 0; i < internal_size; 
i++0
)
1217
0
  {
1218
0
    if (!compiled_data->tensors.internals[i])
1219
0
      return 1;
1220
0
    for (j = 1; j < parallel_count; j++)
1221
0
      if (!compiled_data->tensors.internals[i + j * internal_size])
1222
0
        return 1;
1223
0
  }
1224
3
  return 0;
1225
3
}
1226
1227
void ccv_cnnp_model_tensors_init_1(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1228
88
{
1229
88
  int i, j;
1230
88
  const int parameter_size = compiled_data->parameters->rnum;
1231
88
  const int parallel_count = ccv_max(model->parallel_count, 1);
1232
88
  const int internal_size = compiled_data->internals->rnum;
1233
373
  for (i = 0; i < parameter_size; 
i++285
)
1234
285
  {
1235
    // Parameters have to be allocated all together.
1236
285
    if (compiled_data->tensors.parameters[i])
1237
0
    {
1238
0
      for (j = 1; j < parallel_count; j++)
1239
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1240
0
      continue;
1241
0
    }
1242
285
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1243
285
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1244
285
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1245
104
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1246
285
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1247
285
    compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
1248
687
    for (j = 1; j < parallel_count; 
j++402
)
1249
402
    {
1250
402
      if (j != device_id)
1251
402
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1252
0
      else
1253
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1254
402
      compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1255
402
    }
1256
285
  }
1257
88
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1258
147
  for (i = 0; i < internal_size; 
i++59
)
1259
59
  {
1260
59
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i);
1261
59
    const int d = retained.d;
1262
59
    if (init_v[d >> 5] & (1u << (d & 0x1f)))
1263
0
      continue;
1264
59
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained);
1265
59
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1266
7
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1267
59
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1268
59
    if (!compiled_data->tensors.internals[i])
1269
59
      compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0);
1270
155
    for (j = 1; j < parallel_count; 
j++96
)
1271
96
    {
1272
96
      if (j != device_id)
1273
96
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1274
0
      else
1275
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1276
96
      if (!compiled_data->tensors.internals[i + j * internal_size])
1277
96
        compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0);
1278
96
    }
1279
59
  }
1280
88
  compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); // Remove 1 if any.
1281
88
}
1282
1283
static void _ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1284
88
{
1285
88
  ccv_cnnp_model_tensors_init_0(model, compiled_data);
1286
88
  ccv_cnnp_model_tensors_init_1(model, compiled_data);
1287
88
}
1288
1289
static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1290
6
{
1291
6
  assert(parallel_count > 0);
1292
6
  int i, j;
1293
12
  for (i = 0; i < tensor_size; 
i++6
)
1294
6
  {
1295
6
    if (!tensors[i])
1296
0
      continue;
1297
6
    const int d = tensor_symbols[i].d;
1298
6
    if (!(tensors_init[d >> 5] & (1u << (d & 0x1f))))
1299
0
      continue;
1300
24
    
for (j = 1; 6
j < parallel_count;
j++18
)
1301
18
      if (tensors[i + j * tensor_size])
1302
18
      {
1303
18
        ccv_nnc_tensor_t* const input = CCV_NNC_TENSOR(tensors[i]);
1304
18
        ccv_nnc_tensor_t* const output = CCV_NNC_TENSOR(tensors[i + j * tensor_size]);
1305
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &input, 1, &output, 1, 0);
1306
18
      }
1307
6
  }
1308
6
}
1309
1310
static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count)
1311
95
{
1312
95
  assert(parallel_count > 0);
1313
95
  int i, j;
1314
154
  for (i = 0; i < tensor_size; 
i++59
)
1315
59
  {
1316
59
    const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1317
155
    for (j = 1; j < parallel_count; 
j++96
)
1318
96
    {
1319
96
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1320
96
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1321
96
      if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
1322
0
      { // We shouldn't allocate this, free it up.
1323
0
        ccv_nnc_tensor_free(tensors[i + j * tensor_size]);
1324
0
        tensors[i + j * tensor_size] = 0;
1325
0
      }
1326
96
    }
1327
59
  }
1328
95
}
1329
1330
static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds)
1331
503
{
1332
503
  assert(parallel_count > 0);
1333
503
  int i, j;
1334
1.85k
  for (i = 0; i < tensor_size; 
i++1.35k
)
1335
1.35k
  {
1336
1.35k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1337
1.35k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1338
7
      continue;
1339
1.34k
    if (graph)
1340
1.34k
    {
1341
1.34k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1342
1.34k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1343
0
        tensor_symbol = alias_to;
1344
1.34k
    }
1345
1.34k
    ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(tensors[i]);
1346
1.34k
    if (tensor && tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL)
1347
1.34k
    {
1348
1.34k
      const ccv_nnc_tensor_bind_t retained_bind = {
1349
1.34k
        .symbol = tensor_symbol,
1350
1.34k
        .tensor = tensor
1351
1.34k
      };
1352
1.34k
      ccv_array_push(tensor_binds, &retained_bind);
1353
1.34k
    }
1354
2.89k
    for (j = 1; j < parallel_count; 
j++1.54k
)
1355
1.54k
    {
1356
1.54k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1357
1.54k
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1358
1.54k
      if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1359
1.54k
      {
1360
1.54k
        const ccv_nnc_tensor_bind_t bind = {
1361
1.54k
          .symbol = copy,
1362
1.54k
          .tensor = tensors[i + j * tensor_size]
1363
1.54k
        };
1364
1.54k
        ccv_array_push(tensor_binds, &bind);
1365
1.54k
      }
1366
1.54k
    }
1367
1.34k
  }
1368
503
}
1369
1370
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data)
1371
2.39k
{
1372
2.39k
  if (compiled_data->graph)
1373
95
    ccv_nnc_graph_free(compiled_data->graph);
1374
2.39k
  compiled_data->graph = 0;
1375
2.39k
  compiled_data->is_test = 0;
1376
2.39k
  if (compiled_data->tensor_arena)
1377
95
    ccv_nnc_tensor_arena_free(compiled_data->tensor_arena);
1378
2.39k
  compiled_data->tensor_arena = 0;
1379
2.39k
  if (compiled_data->graph_exec_arena)
1380
95
    ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena);
1381
2.39k
  compiled_data->graph_exec_arena = 0;
1382
2.39k
  if (compiled_data->backward.from_ops)
1383
29
    ccfree(compiled_data->backward.from_ops);
1384
2.39k
  compiled_data->backward.from_ops = 0;
1385
2.39k
  if (compiled_data->evaluate.schedule)
1386
34
    ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule);
1387
2.39k
  compiled_data->evaluate.schedule = 0;
1388
2.39k
  if (compiled_data->backward.schedule)
1389
25
    ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule);
1390
2.39k
  compiled_data->backward.schedule = 0;
1391
2.39k
}
1392
1393
static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data)
1394
2.30k
{
1395
2.30k
  if (compiled_data->gradients)
1396
2.24k
    ccfree(compiled_data->gradients);
1397
2.30k
  compiled_data->gradients = 0;
1398
2.30k
  if (compiled_data->updated_parameters)
1399
2.24k
    ccfree(compiled_data->updated_parameters);
1400
2.30k
  compiled_data->updated_parameters = 0;
1401
2.30k
  compiled_data->update_nodes = 0;
1402
2.30k
  compiled_data->saved_aux = 0;
1403
2.30k
}
1404
1405
static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data)
1406
2.33k
{
1407
2.33k
  if (compiled_data->backward.gradients)
1408
5
    ccfree(compiled_data->backward.gradients);
1409
2.33k
  compiled_data->backward.gradients = 0;
1410
2.33k
  if (compiled_data->backward.accum)
1411
5
    ccv_nnc_graph_free(compiled_data->backward.accum);
1412
2.33k
  compiled_data->backward.accum = 0;
1413
2.33k
  if (compiled_data->backward.tensor_arena)
1414
5
    ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena);
1415
2.33k
  compiled_data->backward.tensor_arena = 0;
1416
2.33k
  if (compiled_data->backward.graph_exec_arena)
1417
5
    ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena);
1418
2.33k
  compiled_data->backward.graph_exec_arena = 0;
1419
2.33k
}
1420
1421
static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data)
1422
2.31k
{
1423
2.31k
  if (compiled_data->apply_gradients.graph)
1424
21
    ccv_nnc_graph_free(compiled_data->apply_gradients.graph);
1425
2.31k
  compiled_data->apply_gradients.graph = 0;
1426
2.31k
  if (compiled_data->apply_gradients.tensor_arena)
1427
21
    ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena);
1428
2.31k
  compiled_data->apply_gradients.tensor_arena = 0;
1429
2.31k
  if (compiled_data->apply_gradients.graph_exec_arena)
1430
21
    ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena);
1431
2.31k
  compiled_data->apply_gradients.graph_exec_arena = 0;
1432
2.31k
}
1433
1434
// Compile the graph to run ccv_cnnp_model_fit
1435
static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1436
8
{
1437
8
  int i, j;
1438
8
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1439
8
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE);
1440
8
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE;
1441
8
  const int parallel_count = ccv_max(model->parallel_count, 1);
1442
8
  assert(output_size == model->output_size * parallel_count);
1443
8
  assert(!fits || output_size == fit_size);
1444
8
  assert(output_size > 0);
1445
8
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1446
8
  {
1447
8
    _ccv_cnnp_model_set_rewindables(model);
1448
8
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1449
8
  } else 
if (0
compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES0
) {
1450
0
    _ccv_cnnp_model_rewind_graph(model);
1451
0
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1452
0
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1453
0
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1454
0
  }
1455
8
  const int tensors_init = !!compiled_data->tensors_init.v;
1456
8
  if (!tensors_init)
1457
4
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1458
4
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1459
  // Check whether it is fully allocated; if not, run init_1.
1460
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1461
8
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1462
8
  assert((input_size % parallel_count) == 0);
1463
8
  assert((output_size % parallel_count) == 0);
1464
8
  assert((fit_size % parallel_count) == 0);
1465
8
  const int input_size_per_p = input_size / parallel_count;
1466
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1467
8
  const int output_size_per_p = output_size / parallel_count;
1468
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1469
8
  const int fit_size_per_p = fit_size / parallel_count;
1470
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds);
1471
8
  const int parameter_size = compiled_data->parameters->rnum;
1472
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1473
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1474
8
  const int internal_size = compiled_data->internals->rnum;
1475
8
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1476
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1477
8
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1478
8
  ccv_array_free(tensor_binds);
1479
8
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1480
8
  if (tensors_init && 
parallel_count > 14
)
1481
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1482
  // If tensors are not init'ed, we need to init states first.
1483
8
  if (_ccv_cnnp_any_to_init(compiled_data))
1484
7
  {
1485
7
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1486
7
      .parallel_count = parallel_count,
1487
7
      .graph = model->graph,
1488
7
      .compiled_data = compiled_data,
1489
7
      .tensor_arena = compiled_data->tensor_arena
1490
7
    };
1491
7
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1492
7
  }
1493
8
  compiled_data->is_test = 0;
1494
8
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer);
1495
  // No need to set because it defaults to training mode.
1496
  // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1497
105
  for (i = 0; i < saved_aux_size * parameter_size; 
i++97
)
1498
97
  {
1499
97
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
1500
5
      continue;
1501
92
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source);
1502
92
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1503
296
    for (j = 1; j < parallel_count; 
j++204
)
1504
204
    {
1505
204
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1506
204
      if (copy)
1507
204
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1508
204
    }
1509
92
  }
1510
8
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1511
8
  compiled_data->evaluate.to_op_size = 0;
1512
22
  for (i = 0; i < evaluate_to_size; 
i++14
)
1513
14
  {
1514
14
    ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1515
14
    if (to.graph)
1516
14
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to;
1517
14
  }
1518
8
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1519
8
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1520
8
}
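The loop above resets every saved aux tensor (momentum and the like) by running CMD_SET_FORWARD(0) over it. A minimal standalone sketch of that single step on a CPU tensor; the include paths and the ccv_nnc_init() call reflect a typical installed layout and are assumptions that may differ in your build:

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>
#include <stdio.h>

int main(void)
{
  ccv_nnc_init();
  /* A small CPU tensor standing in for one saved aux (e.g. SGD momentum). */
  ccv_nnc_tensor_t* const aux = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
  /* Zero it out, exactly like the loop over saved_aux above. */
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &aux, 1, 0);
  printf("%f %f %f %f\n", aux->data.f32[0], aux->data.f32[1], aux->data.f32[2], aux->data.f32[3]);
  ccv_nnc_tensor_free(aux);
  return 0;
}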
1521
1522
ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model)
1523
0
{
1524
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1525
0
  if (!compiled_data || !compiled_data->graph)
1526
0
    return 0;
1527
0
  return ccv_nnc_graph_default_stream(compiled_data->graph);
1528
0
}
1529
1530
uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model)
1531
0
{
1532
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1533
0
  if (!compiled_data || !compiled_data->tensor_arena)
1534
0
    return 0;
1535
0
  return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena);
1536
0
}
1537
1538
static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1539
38.8k
{
1540
38.8k
  int i, j;
1541
114k
  for (i = 0; i < tensor_size; 
i++75.5k
)
1542
75.5k
  {
1543
75.5k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1544
75.5k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1545
0
      continue;
1546
75.5k
    if (graph)
1547
72.5k
    {
1548
72.5k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1549
72.5k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1550
0
        tensor_symbol = alias_to;
1551
72.5k
    }
1552
75.5k
    ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]);
1553
77.3k
    for (j = 1; j < parallel_count; 
j++1.77k
)
1554
1.77k
    {
1555
1.77k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1556
1.77k
      if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1557
1.77k
        ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]);
1558
1.77k
    }
1559
75.5k
  }
1560
38.8k
}
1561
1562
void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1563
2.54k
{
1564
2.54k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1565
2.54k
  assert(compiled_data);
1566
2.54k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1567
2.54k
  assert(output_size == model->output_size * parallel_count);
1568
2.54k
  assert(input_size == model->input_size * parallel_count);
1569
2.54k
  assert(!fits || fit_size == output_size);
1570
2.54k
  assert(model->graph);
1571
2.54k
  if (!compiled_data->graph || 
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE2.53k
)
1572
8
  {
1573
8
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1574
8
    _ccv_cnnp_compiled_data_backward_free(compiled_data);
1575
8
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
1576
    // Compile the symbolic graph down only when needed.
1577
8
    _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size);
1578
2.53k
  } else {
1579
2.53k
    assert((input_size % parallel_count) == 0);
1580
2.53k
    assert((output_size % parallel_count) == 0);
1581
2.53k
    assert((fit_size % parallel_count) == 0);
1582
2.53k
    const int input_size_per_p = input_size / parallel_count;
1583
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1584
2.53k
    const int output_size_per_p = output_size / parallel_count;
1585
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1586
2.53k
    const int fit_size_per_p = fit_size / parallel_count;
1587
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count);
1588
2.53k
  }
1589
2.54k
  if (compiled_data->is_test)
1590
0
  {
1591
0
    compiled_data->is_test = 0;
1592
0
    ccv_nnc_graph_exec_update_t update = {
1593
0
      .parallel_count = parallel_count,
1594
0
      .graph = model->graph,
1595
0
      .graph_exec_arena = compiled_data->graph_exec_arena,
1596
0
    };
1597
0
    ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1598
0
  }
1599
2.54k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1600
2.54k
}
1601
1602
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = false (MULTISTAGE_MODE_NO_GRAD).
1603
static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1604
58
{
1605
58
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1606
58
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD;
1607
58
  const int parallel_count = ccv_max(model->parallel_count, 1);
1608
58
  assert(output_size == model->output_size * parallel_count);
1609
58
  assert(output_size > 0);
1610
  // If the gradient is not initialized, continue to set up the parallel process. We don't init the gradient here; rather,
1611
  // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel.
1612
58
  if (parallel_count > 1 && 
compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE6
)
1613
6
  {
1614
6
    const int evaluate_to_size = compiled_data->evaluate.to_size;
1615
6
    compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1616
6
    _ccv_cnnp_model_set_rewindables(model);
1617
6
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1618
6
      0, 0,
1619
6
      0, 0, 0,
1620
6
      0, 0, 0,
1621
6
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1622
6
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1623
6
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1624
6
    int i, j;
1625
12
    for (i = 0; i < evaluate_to_size; 
i++6
)
1626
24
      
for (j = 1; 6
j < parallel_count;
j++18
)
1627
18
      {
1628
18
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1629
18
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1630
18
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1631
18
      }
1632
6
  }
1633
58
  const int tensors_init = !!compiled_data->tensors_init.v;
1634
58
  if (!tensors_init)
1635
35
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1636
23
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1637
  // Check whether it is fully allocated; if not, run init_1.
1638
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1639
58
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1640
58
  assert((input_size % parallel_count) == 0);
1641
58
  assert((output_size % parallel_count) == 0);
1642
58
  const int input_size_per_p = input_size / parallel_count;
1643
58
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1644
58
  const int output_size_per_p = output_size / parallel_count;
1645
58
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1646
58
  const int parameter_size = compiled_data->parameters->rnum;
1647
58
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1648
58
  const int internal_size = compiled_data->internals->rnum;
1649
58
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1650
58
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1651
  // If we generated gradients for the graph, only compile part of the graph because the rest is irrelevant for evaluation.
1652
58
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1653
58
  ccv_array_free(tensor_binds);
1654
58
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1655
  // If tensors are not init'ed, we need to init states first.
1656
58
  if (tensors_init && 
parallel_count > 123
)
1657
6
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1658
58
  if (_ccv_cnnp_any_to_init(compiled_data))
1659
16
  {
1660
16
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1661
16
      .parallel_count = parallel_count,
1662
16
      .graph = model->graph,
1663
16
      .compiled_data = compiled_data,
1664
16
      .tensor_arena = compiled_data->tensor_arena
1665
16
    };
1666
16
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1667
16
  }
1668
58
  compiled_data->is_test = 1;
1669
58
  ccv_nnc_graph_exec_update_t update = {
1670
58
    .parallel_count = parallel_count,
1671
58
    .graph = model->graph,
1672
58
    .graph_exec_arena = compiled_data->graph_exec_arena,
1673
58
  };
1674
58
  ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
1675
58
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1676
58
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1677
58
}
1678
1679
static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1680
28
{
1681
28
  assert(!compiled_data->tensors.gradients);
1682
28
  const int parameter_size = compiled_data->parameters->rnum;
1683
28
  const int parallel_count = ccv_max(model->parallel_count, 1);
1684
28
  compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count);
1685
28
  compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count;
1686
28
  int i, j;
1687
175
  for (i = 0; i < parameter_size; 
i++147
)
1688
147
  {
1689
147
    if (compiled_data->parameter_flags && 
!(compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))6
)
1690
2
    {
1691
2
      compiled_data->tensors.gradients[i] = 0;
1692
2
      compiled_data->tensors.accum_gradients[i] = 0;
1693
2
      for (j = 1; j < parallel_count; 
j++0
)
1694
0
      {
1695
0
        compiled_data->tensors.gradients[i + j * parameter_size] = 0;
1696
0
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1697
0
      }
1698
2
      continue;
1699
2
    }
1700
145
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1701
145
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1702
145
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1703
38
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1704
145
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1705
145
    compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0);
1706
145
    compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until when we need it.
1707
325
    for (j = 1; j < parallel_count; 
j++180
)
1708
180
    {
1709
180
      if (j != device_id)
1710
180
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1711
0
      else
1712
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1713
180
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1714
180
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1715
180
    }
1716
145
  }
1717
28
}
1718
1719
static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
1720
8.00k
{
1721
8.00k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
1722
15
    return 1;
1723
7.98k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
1724
7.97k
    return 0;
1725
7
  int i;
1726
7
  for (i = 0; i < input_size; 
i++0
)
1727
7
    if (!(disable_outgrad & ((uint64_t)1 << i)))
1728
7
      return 0;
1729
0
  return 1;
1730
7
}
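disable_outgrad is a per-input bitmask: bit i set means the gradient with respect to input i is not computed, with the NONE / ALL sentinels short-circuited above. A small standalone sketch of counting the enabled outgrads, mirroring the loop in _ccv_cnnp_model_gradient_init; the sentinel values here are assumptions for the sketch:

#include <assert.h>
#include <stdint.h>

/* Assumed sentinel values for the sketch (no bit set / all bits set). */
#define DISABLE_OUTGRAD_NONE ((uint64_t)0)
#define DISABLE_OUTGRAD_ALL (~(uint64_t)0)

/* Count how many inputs still get an outgrad under the given mask. */
static int outgrad_count(const uint64_t disable_outgrad, const int input_size)
{
  if (disable_outgrad == DISABLE_OUTGRAD_ALL)
    return 0;
  if (disable_outgrad == DISABLE_OUTGRAD_NONE)
    return input_size;
  int i, count = 0;
  for (i = 0; i < input_size; i++)
    if (!(disable_outgrad & ((uint64_t)1 << i)))
      ++count;
  return count;
}

int main(void)
{
  assert(outgrad_count(DISABLE_OUTGRAD_NONE, 3) == 3);
  assert(outgrad_count(DISABLE_OUTGRAD_ALL, 3) == 0);
  assert(outgrad_count((uint64_t)1 << 1, 3) == 2); /* skip input 1 only */
  return 0;
}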
1731
1732
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1733
// Particularly, this method compiles the evaluation and backprop graph (the main graph).
1734
static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1735
29
{
1736
29
  int i, j;
1737
29
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1738
29
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES1
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS28
;
1739
29
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode);
1740
29
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE;
1741
29
  const int parallel_count = ccv_max(model->parallel_count, 1);
1742
29
  assert(output_size == model->output_size * parallel_count);
1743
29
  assert(output_size > 0);
1744
  // There shouldn't be a loss function if we evaluate with multistage jit.
1745
29
  assert(compiled_data->loss.cmd == CCV_NNC_NOOP);
1746
29
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1747
27
  {
1748
27
    _ccv_cnnp_model_set_rewindables(model);
1749
27
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1750
27
  } else 
if (2
compiled_data->gradient_mode != target_gradient_mode2
) {
1751
2
    _ccv_cnnp_model_rewind_graph(model);
1752
2
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1753
2
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1754
2
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1755
2
  }
1756
29
  const int tensors_init = !!compiled_data->tensors_init.v;
1757
29
  if (!tensors_init)
1758
21
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1759
8
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1760
  // Check whether it is fully allocated; if not, run init_1.
1761
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1762
29
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1763
29
  assert((input_size % parallel_count) == 0);
1764
29
  assert((output_size % parallel_count) == 0);
1765
29
  const int input_size_per_p = input_size / parallel_count;
1766
29
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1767
29
  const int output_size_per_p = output_size / parallel_count;
1768
29
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1769
29
  const int parameter_size = compiled_data->parameters->rnum;
1770
29
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1771
29
  const int internal_size = compiled_data->internals->rnum;
1772
29
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1773
29
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1774
29
  if (!compiled_data->tensors.gradients)
1775
28
    _ccv_cnnp_model_gradient_tensors_init(model, compiled_data);
1776
29
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1777
29
  if (compiled_data->backward.to_size > 0)
1778
29
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1779
0
  else
1780
0
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1781
29
  ccv_array_free(tensor_binds);
1782
29
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1783
29
  if (tensors_init && 
parallel_count > 18
)
1784
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1785
  // If tensors are not init'ed, we need to init states first.
1786
29
  if (_ccv_cnnp_any_to_init(compiled_data))
1787
18
  {
1788
18
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1789
18
      .parallel_count = parallel_count,
1790
18
      .graph = model->graph,
1791
18
      .compiled_data = compiled_data,
1792
18
      .tensor_arena = compiled_data->tensor_arena
1793
18
    };
1794
18
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1795
18
  }
1796
29
  compiled_data->is_test = is_test;
1797
29
  ccv_nnc_graph_exec_update_t update = {
1798
29
    .parallel_count = parallel_count,
1799
29
    .graph = model->graph,
1800
29
    .graph_exec_arena = compiled_data->graph_exec_arena,
1801
29
  };
1802
29
  ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1803
29
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1804
29
  compiled_data->evaluate.to_op_size = 0;
1805
29
  ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0);
1806
76
  for (i = 0; i < evaluate_to_size; 
i++47
)
1807
47
  {
1808
47
    ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1809
47
    if (to_op.graph)
1810
47
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op;
1811
47
    const int* tos;
1812
47
    int to_size;
1813
47
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size);
1814
94
    for (j = 0; j < to_size; 
j++47
)
1815
47
    {
1816
47
      ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1817
47
        .d = tos[j],
1818
47
        .graph = model->graph
1819
47
      });
1820
47
      if (to_op.graph)
1821
47
        ccv_array_add_unique_int(backward_from, to_op.d);
1822
47
    }
1823
47
  }
1824
29
  assert(backward_from->rnum > 0);
1825
29
  compiled_data->backward.from_op_size = backward_from->rnum;
1826
29
  compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1827
76
  for (i = 0; i < backward_from->rnum; 
i++47
)
1828
47
    compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1829
47
      .d = *(int*)ccv_array_get(backward_from, i),
1830
47
      .graph = compiled_data->graph,
1831
47
    };
1832
  // If there are any set nodes (to set some tensors to 0) inserted by the backward pass, these won't be executed if we just do sources -> evaluate.to_ops, backward.from_ops -> destinations. We need this logic to find these nodes and explicitly add them to backward.from_ops.
1833
29
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(compiled_data->graph->exec_info, 0);
1834
29
  const int exec_info_size = compiled_data->graph->exec_info->rnum;
1835
29
  uint32_t* const visited = cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t));
1836
29
  const ccv_nnc_graph_exec_t* const sources = (ccv_nnc_graph_exec_t*)ccv_array_get(compiled_data->graph->sources, 0);
1837
29
  const int source_size = compiled_data->graph->sources->rnum;
1838
58
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new29
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size, 0);
1839
600
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1840
600
    visited[(idx >> 5)] |= (1u << (idx & 31));
1841
600
  } ccv_nnc_graph_visit_endfor
1842
58
  ccv_nnc_graph_visit_free(visit);
1843
58
  const ccv_nnc_graph_exec_t* const destinations = (ccv_nnc_graph_exec_t*)
ccv_array_get29
(compiled_data->graph->destinations, 0);
1844
58
  const int destination_size = compiled_data->graph->destinations->rnum;
1845
58
  visit = 
ccv_nnc_graph_visit_new29
(compiled_data->graph, exec_info, exec_info_size, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, destinations, destination_size, 0);
1846
654
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1847
654
    visited[(idx >> 5)] |= (1u << (idx & 31));
1848
654
  } ccv_nnc_graph_visit_endfor
1849
58
  ccv_nnc_graph_visit_free(visit);
1850
58
  visit = 
ccv_nnc_graph_visit_new29
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
1851
  // Find any missing nodes to be added as sources. Right now, these are only set nodes.
1852
1.30k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1853
1.30k
    if (!(visited[(idx >> 5)] & (1u << (idx & 31))))
1854
47
    {
1855
47
      assert(exec_info[idx].cmd.cmd == CCV_NNC_SET_FORWARD);
1856
47
      if (exec_info[idx].cmd.info.blas.a[0] == 0) // Special-case the set function that zeroes out a tensor, not the one that sets the gradient to 1.
1857
0
        ccv_array_add_unique_int(backward_from, idx);
1858
47
    }
1859
1.30k
  } ccv_nnc_graph_visit_endfor
1860
29
  ccv_nnc_graph_visit_free(visit);
1861
29
  ccfree(visited);
1862
29
  if (backward_from->rnum != compiled_data->backward.from_op_size) // If it doesn't match, need to redo this.
1863
0
  {
1864
0
    compiled_data->backward.from_op_size = backward_from->rnum;
1865
0
    compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccrealloc(compiled_data->backward.from_ops, sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1866
0
    for (i = 0; i < backward_from->rnum; i++)
1867
0
      compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1868
0
        .d = *(int*)ccv_array_get(backward_from, i),
1869
0
        .graph = compiled_data->graph,
1870
0
      };
1871
0
  }
1872
29
  ccv_array_free(backward_from);
1873
29
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1874
29
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1875
29
}
1876
1877
void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1878
7.97k
{
1879
7.97k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1880
7.97k
  assert(compiled_data);
1881
7.97k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1882
7.97k
  assert(output_size == model->output_size * parallel_count);
1883
7.97k
  assert(input_size == model->input_size * parallel_count);
1884
7.97k
  assert(model->graph);
1885
7.97k
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES14
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS7.95k
;
1886
7.97k
  const int mode_mismatch = (params.requires_grad && 
(7.82k
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE7.82k
||
compiled_data->gradient_mode != target_gradient_mode7.79k
||
compiled_data->disable_outgrad != params.disable_outgrad7.79k
));
1887
7.97k
  if (!compiled_data->graph || 
mode_mismatch7.88k
)
1888
87
  {
1889
87
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1890
87
    if (mode_mismatch) // If the mode mismatches, we need to redo the backward pass as well (no need to redo apply_gradients; it doesn't depend on target_gradient_mode or disable_outgrad).
1891
29
      _ccv_cnnp_compiled_data_backward_free(compiled_data);
1892
87
    if (params.requires_grad)
1893
29
      _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size);
1894
58
    else
1895
58
      _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size);
1896
7.88k
  } else {
1897
7.88k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena);
1898
7.88k
    assert((input_size % parallel_count) == 0);
1899
7.88k
    const int input_size_per_p = input_size / parallel_count;
1900
7.88k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1901
7.88k
    assert((output_size % parallel_count) == 0);
1902
7.88k
    const int output_size_per_p = output_size / parallel_count;
1903
7.88k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1904
7.88k
  }
1905
7.97k
  if (compiled_data->is_test != params.is_test)
1906
63
  {
1907
63
    compiled_data->is_test = params.is_test;
1908
63
    ccv_nnc_graph_exec_update_t update = {
1909
63
      .parallel_count = parallel_count,
1910
63
      .graph = model->graph,
1911
63
      .graph_exec_arena = compiled_data->graph_exec_arena,
1912
63
    };
1913
63
    ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1914
63
  }
1915
7.97k
}
1916
1917
void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1918
7.97k
{
1919
7.97k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1920
7.97k
  assert(compiled_data);
1921
7.97k
  ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size);
1922
7.97k
  if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
1923
71
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1924
7.90k
  else {
1925
7.90k
    if (!compiled_data->evaluate.schedule)
1926
34
      compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size);
1927
7.90k
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context);
1928
7.90k
  }
1929
7.97k
}
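As a usage reference for the evaluate path above, here is a minimal sketch of an inference call with requires_grad off (the MULTISTAGE_MODE_NO_GRAD branch). The model handle and tensor shapes are hypothetical; it assumes a compiled single-input, single-output CPU model.

// Minimal sketch, assuming a compiled single-input / single-output CPU model named model.
ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 28, 28, 1), 0);
ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 10), 0);
// ... fill x with input data ...
ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
  .requires_grad = 0, // No gradient graph; uses the no-grad JIT path above.
  .is_test = 1,
}, TENSOR_LIST(x), TENSOR_LIST(y), 0, 0);
ccv_nnc_tensor_free(x);
ccv_nnc_tensor_free(y);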
1930
1931
// Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1932
// Particularly, this method compiles the accumulator graph.
1933
static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model)
1934
5
{
1935
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1936
5
  assert(compiled_data);
1937
5
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1938
5
  ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new();
1939
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1940
5
  const int parameter_size = compiled_data->parameters->rnum;
1941
5
  int i, j;
1942
5
  compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3);
1943
5
  compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count;
1944
5
  compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count;
1945
20
  for (i = 0; i < parameter_size; 
i++15
)
1946
30
    
for (j = 0; 15
j < parallel_count;
j++15
)
1947
15
      if (compiled_data->tensors.gradients[i + j * parameter_size])
1948
15
      {
1949
15
        const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info;
1950
        // Now the old gradient is the accumulated gradient; set up new gradient tensors so we can collect them.
1951
15
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size];
1952
15
        compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1953
15
        ccv_nnc_tensor_symbol_t inputs[2];
1954
15
        inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1955
15
        inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1956
15
        ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1957
15
        ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
1958
15
      } else {
1959
0
        compiled_data->backward.accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1960
0
        compiled_data->backward.gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1961
0
        compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1962
0
      }
1963
5
  ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1964
5
  if (ccv_nnc_symbolic_graph_source_size(accum) == 0)
1965
0
  {
1966
0
    ccv_nnc_symbolic_graph_free(accum);
1967
    // Create empty graph.
1968
0
    compiled_data->backward.accum = ccv_nnc_graph_new();
1969
0
    ccv_nnc_graph_topsort(compiled_data->backward.accum, 0, 0);
1970
0
    return;
1971
0
  }
1972
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1973
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1974
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds);
1975
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1976
5
  ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena);
1977
5
  ccv_nnc_symbolic_graph_free(accum);
1978
5
  ccv_array_free(tensor_binds);
1979
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type, model->max_stream_count);
1980
5
}
1981
1982
void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1983
7.88k
{
1984
7.88k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1985
7.88k
  assert(compiled_data);
1986
7.88k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1987
7.88k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1988
7.88k
  assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count);
1989
7.88k
  if (outgrad_size > 0)
1990
2.51k
    { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); }
1991
7.88k
  assert(model->graph);
1992
7.88k
  assert(compiled_data->graph);
1993
7.88k
  const int parameter_size = compiled_data->parameters->rnum;
1994
  // If we need to accumulate the gradients now, do JIT on the accumulator.
1995
7.88k
  if (compiled_data->backward.count > 0)
1996
1.71k
  {
1997
1.71k
    if (!compiled_data->backward.accum)
1998
5
      _ccv_cnnp_model_multistage_jit_1(model);
1999
1.71k
    else if (compiled_data->backward.count == 1) {
2000
      // On this round, we need to swap the accumulated gradients with the gradients (so we can do accumulation properly).
2001
496
      int i;
2002
1.48k
      for (i = 0; i < parameter_size * parallel_count; 
i++986
)
2003
986
      {
2004
986
        ccv_nnc_tensor_t* tensor;
2005
986
        CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor);
2006
986
      }
2007
496
      if (compiled_data->backward.tensor_arena)
2008
496
      {
2009
496
        ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena);
2010
        // Do rebind in case we messed up the binding (we switch accum_gradients and gradients).
2011
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1);
2012
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
2013
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
2014
496
      }
2015
496
    }
2016
1.71k
  }
2017
7.88k
  const int ingrad_size_per_p = model->output_size;
2018
7.88k
  const int outgrad_size_per_p = compiled_data->outgrad_size;
2019
7.88k
  int i, j;
2020
15.7k
  for (i = 0; i < ingrad_size_per_p; 
i++7.88k
)
2021
7.88k
  {
2022
7.88k
    const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
2023
7.88k
    if (!ingrad_size || 
!ingrads3.79k
||
ingrads[i] == 03.79k
)
2024
4.19k
    {
2025
      // Set it to 1 if it is not specified.
2026
4.19k
      ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad);
2027
4.19k
      if (ingrad_tensor)
2028
4.19k
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2029
4.31k
      for (j = 1; j < parallel_count; 
j++120
)
2030
120
      {
2031
120
        ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j));
2032
120
        if (ingrad_tensor)
2033
120
          ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2034
120
      }
2035
4.19k
    } else {
2036
      // Make sure the length matches, in case it is an alias.
2037
3.69k
      assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad)));
2038
3.69k
      ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]);
2039
3.69k
      for (j = 1; j < parallel_count; 
j++6
)
2040
6
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]);
2041
3.69k
    }
2042
7.88k
  }
2043
7.88k
  if (outgrad_size > 0)
2044
2.51k
  {
2045
2.51k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad");
2046
5.14k
    
for (i = 0; 2.51k
i < outgrad_size_per_p;
i++2.62k
)
2047
2.62k
      if (outgrads[i])
2048
2.43k
      {
2049
2.43k
        const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i];
2050
2.43k
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]);
2051
2.43k
        for (j = 1; j < parallel_count; 
j++6
)
2052
6
          ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]);
2053
2.43k
      }
2054
5.37k
  } else {
2055
5.37k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES ||
2056
5.37k
      compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS);
2057
5.37k
  }
2058
  // We need to rebind here because in ccv_cnnp_evaluate we clear bindings, which resets all bindings for the gradients.
2059
  // For parameters and internals this is fine because clearing bindings restores the original bindings, which are these
2060
  // parameters and internals. The same cannot be said for gradients due to the accum_gradients switching.
2061
7.88k
  _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2062
7.88k
  if (!compiled_data->backward.schedule)
2063
25
    compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0);
2064
  // Run the backward pass.
2065
7.88k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context);
2066
  // If we need to run accumulation round, do that now.
2067
7.88k
  if (compiled_data->backward.count > 0)
2068
1.71k
    ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context);
2069
  // Update the count, this determines whether we need to accumulate or not.
2070
7.88k
  ++compiled_data->backward.count;
2071
7.88k
}
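To illustrate the accumulation logic above (backward.count and the accumulator graph compiled by _ccv_cnnp_model_multistage_jit_1), here is a hedged sketch that accumulates gradients over two micro-batches before applying them. The tensors x[i] and loss[i] are hypothetical, and the model is assumed to output its loss directly.

// Sketch: run backward twice, then apply once; the second backward triggers the EWSUM accumulator.
int i;
for (i = 0; i < 2; i++)
{
  ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
    .requires_grad = 1,
    .is_test = 0,
  }, TENSOR_LIST(x[i]), TENSOR_LIST(loss[i]), 0, 0);
  // Passing 0 for ingrads sets the loss gradient to 1 (the SET_FORWARD(1) path above).
  ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
}
// backward.count is now 2, so apply_gradients will bind tensors.accum_gradients.
ccv_cnnp_model_apply_gradients(model, 0);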
2072
2073
// Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE).
2074
// Particularly, this method compiles the parameter update graph.
2075
static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model)
2076
21
{
2077
21
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2078
21
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2079
21
  const int parallel_count = ccv_max(model->parallel_count, 1);
2080
21
  const int parameter_size = compiled_data->parameters->rnum;
2081
21
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
2082
21
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2083
21
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2084
  // Bind accumulated gradients.
2085
21
  if (compiled_data->backward.count > 1)
2086
4
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds);
2087
17
  else
2088
17
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
2089
21
  ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0);
2090
21
  int i, j;
2091
247
  for (i = 0; i < compiled_data->backward.to_size; 
i++226
)
2092
226
  {
2093
226
    const int* tos;
2094
226
    int to_size;
2095
226
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size);
2096
726
    for (j = 0; j < to_size; 
j++500
)
2097
500
    {
2098
      // Check if this already shows up in the backward graph; if that is the case, it won't be in the apply
2099
      // gradients graph.
2100
500
      const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
2101
500
        .d = tos[j],
2102
500
        .graph = model->graph,
2103
500
      });
2104
500
      if (!exec.graph)
2105
313
        ccv_array_add_unique_int(apply_gradients_from, tos[j]);
2106
500
    }
2107
226
  }
2108
21
  const int from_size = apply_gradients_from->rnum;
2109
21
  if (from_size == 0)
2110
0
  {
2111
0
    ccv_array_free(apply_gradients_from);
2112
0
    ccv_array_free(tensor_binds);
2113
0
    return;
2114
0
  }
2115
21
  ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size);
2116
154
  for (i = 0; i < from_size; 
i++133
)
2117
133
    froms[i] = (ccv_nnc_graph_exec_symbol_t){
2118
133
      .d = *(int*)ccv_array_get(apply_gradients_from, i),
2119
133
      .graph = model->graph
2120
133
    };
2121
21
  ccv_array_free(apply_gradients_from);
2122
  // It can only end with updates on the parameters.
2123
21
  ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0);
2124
154
  for (i = 0;  i < parameter_size; 
i++133
)
2125
133
  {
2126
133
    if (compiled_data->update_nodes[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
2127
0
      continue;
2128
133
    ccv_array_push(tos, &compiled_data->update_nodes[i]);
2129
313
    for (j = 1; j < parallel_count; 
j++180
)
2130
180
    {
2131
180
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j);
2132
180
      ccv_array_push(tos, &copy);
2133
180
    }
2134
133
  }
2135
21
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena);
2136
21
  ccv_array_free(tos);
2137
21
  ccv_array_free(tensor_binds);
2138
21
  ccfree(froms);
2139
21
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2140
213
  for (i = 0; i < max_saved_aux_size * parameter_size; 
i++192
)
2141
192
  {
2142
    // Skip on no tensor.
2143
192
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
2144
0
      continue;
2145
192
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source);
2146
192
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
2147
540
    for (j = 1; j < parallel_count; 
j++348
)
2148
348
    {
2149
348
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
2150
348
      if (copy)
2151
348
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
2152
348
    }
2153
192
  }
2154
21
  ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type, model->max_stream_count);
2155
21
}
2156
2157
void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context)
2158
7.81k
{
2159
7.81k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2160
7.81k
  assert(compiled_data);
2161
7.81k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2162
7.81k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2163
7.81k
  assert(model->graph);
2164
7.81k
  assert(compiled_data->graph);
2165
  // Skip if there is no backward pass.
2166
7.81k
  if (compiled_data->backward.count <= 0)
2167
1.65k
    return;
2168
  // Skip if there are no parameters.
2169
6.16k
  if (compiled_data->parameters->rnum == 0)
2170
3
  {
2171
3
    compiled_data->backward.count = 0;
2172
3
    return;
2173
3
  }
2174
6.16k
  if (!compiled_data->apply_gradients.graph)
2175
21
    _ccv_cnnp_model_multistage_jit_2(model);
2176
6.14k
  else {
2177
6.14k
    const int parameter_size = compiled_data->parameters->rnum;
2178
6.14k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena);
2179
    // Change to bind accum_gradients if we do gradient accumulation (run backward more than once).
2180
6.14k
    if (compiled_data->backward.count > 1)
2181
497
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count);
2182
5.64k
    else
2183
5.64k
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2184
6.14k
  }
2185
6.16k
  if (compiled_data->apply_gradients.graph)
2186
6.16k
    ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context);
2187
  // Reset backward count to 0.
2188
6.16k
  compiled_data->backward.count = 0;
2189
6.16k
}
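Putting the three stages together, a minimal single-step training sketch: evaluate with gradients, backward, then apply. The model, input x, and loss output tensor are assumed to exist and the minimizer is assumed to have been set at compile time.

// One training step (hedged sketch; error handling and data loading omitted).
ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
  .requires_grad = 1,
  .is_test = 0,
}, TENSOR_LIST(x), TENSOR_LIST(loss), 0, 0);
ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
// Runs the graph compiled by _ccv_cnnp_model_multistage_jit_2 and resets backward.count to 0.
ccv_cnnp_model_apply_gradients(model, 0);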
2190
2191
void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor)
2192
35
{
2193
35
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2194
35
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 18
:
parameter->param_sel27
;
2195
35
  assert(parameter->param_sel != 0);
2196
35
  const int tensors_init = !!compiled_data->tensors_init.v;
2197
35
  if (!tensors_init)
2198
19
    _ccv_cnnp_model_tensors_init(model, compiled_data);
2199
16
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
2200
  // Check if it is not fully allocated; if it is not, run init_1.
2201
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
2202
35
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2203
35
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2204
35
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 134
:
parameter->param_ref1
;
2205
35
  if (param_ref < 0)
2206
1
    { assert(parameter_indices->rnum == 1); }
2207
34
  else
2208
34
    { assert(param_ref < parameter_indices->rnum); }
2209
35
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2210
35
  ccv_array_free(parameter_indices);
2211
35
  const int parameter_size = compiled_data->parameters->rnum;
2212
35
  assert(d >= 0);
2213
35
  assert(d < parameter_size);
2214
35
  const int parallel_count = ccv_max(model->parallel_count, 1);
2215
35
  ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2216
35
  assert(dest);
2217
35
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0);
2218
35
  int i;
2219
35
  for (i = 1; i < parallel_count; 
i++0
)
2220
0
  {
2221
0
    ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d + i * parameter_size]);
2222
0
    if (copy_tensor)
2223
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2224
0
  }
2225
  // Mark this symbol as init'ed.
2226
35
  const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d;
2227
35
  uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2228
35
  init_v[s >> 5] |= (1u << (s & 0x1f));
2229
35
}
2230
2231
void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor)
2232
6
{
2233
6
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2234
6
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 13
:
parameter->param_sel3
;
2235
6
  assert(parameter->param_sel != 0);
2236
6
  assert(compiled_data->tensors.parameters);
2237
6
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2238
6
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2239
6
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 13
:
parameter->param_ref3
;
2240
6
  if (param_ref < 0)
2241
3
    { assert(parameter_indices->rnum == 1); }
2242
3
  else
2243
3
    { assert(param_ref < parameter_indices->rnum); }
2244
6
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2245
6
  ccv_array_free(parameter_indices);
2246
6
  const int parameter_size = compiled_data->parameters->rnum;
2247
6
  assert(d >= 0);
2248
6
  assert(d < parameter_size);
2249
  // We don't need to consider parallel_count, every parameter on each device is identical.
2250
6
  ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2251
6
  assert(src);
2252
6
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0);
2253
6
}
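A hedged sketch of writing one parameter and reading it back with the two functions above. ccv_cnnp_model_parameters(model, -1, 0) selects parameter 0 across all selectors; the 10x128 shape is purely illustrative and must match the actual parameter.

// Sketch: overwrite parameter 0, then copy it back out to a host tensor.
ccv_nnc_tensor_t* const w_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
// ... fill w_in with the desired values ...
ccv_cnnp_model_set_parameter(model, ccv_cnnp_model_parameters(model, -1, 0), w_in);
ccv_nnc_tensor_t* const w_out = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
ccv_cnnp_model_parameter_copy(model, ccv_cnnp_model_parameters(model, -1, 0), w_out);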
2254
2255
ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2256
1
{
2257
1
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2258
1
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 10
: parameter->param_sel;
2259
1
  assert(parameter->param_sel != 0);
2260
1
  assert(compiled_data->tensors.parameters);
2261
1
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2262
1
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2263
1
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 10
: parameter->param_ref;
2264
1
  if (param_ref < 0)
2265
1
    { assert(parameter_indices->rnum == 1); }
2266
0
  else
2267
0
    { assert(param_ref < parameter_indices->rnum); }
2268
1
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2269
1
  ccv_array_free(parameter_indices);
2270
1
  const int parameter_size = compiled_data->parameters->rnum;
2271
1
  assert(d >= 0);
2272
1
  assert(d < parameter_size);
2273
  // We don't need to consider parallel_count, every parameter on each device is identical.
2274
1
  ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2275
1
  assert(tensor);
2276
1
  return tensor->info;
2277
1
}
2278
2279
const char* ccv_cnnp_model_parameter_name(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2280
2
{
2281
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2282
2
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : 
parameter->param_sel0
;
2283
2
  assert(parameter->param_sel != 0);
2284
2
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2285
2
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2286
2
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : 
parameter->param_ref0
;
2287
2
  if (param_ref < 0)
2288
0
    { assert(parameter_indices->rnum == 1); }
2289
2
  else
2290
2
    { assert(param_ref < parameter_indices->rnum); }
2291
2
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2292
2
  ccv_array_free(parameter_indices);
2293
2
  const int parameter_size = compiled_data->parameters->rnum;
2294
2
  assert(d >= 0);
2295
2
  assert(d < parameter_size);
2296
2
  return *(char**)ccv_array_get(compiled_data->ids.parameters, d);
2297
2
}
2298
2299
int ccv_cnnp_model_parameter_count(ccv_cnnp_model_t* const model)
2300
0
{
2301
0
  assert(model->compiled_data);
2302
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2303
0
  return compiled_data->parameters->rnum;
2304
0
}
2305
2306
uint64_t ccv_cnnp_model_parameters_size(ccv_cnnp_model_t* const model)
2307
0
{
2308
0
  assert(model->compiled_data);
2309
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2310
0
  const int parameter_size = compiled_data->parameters->rnum;
2311
0
  int i;
2312
0
  const ccv_nnc_symbolic_graph_t* const graph = model->graph;
2313
0
  uint64_t size = 0;
2314
0
  for (i = 0; i < parameter_size; i++)
2315
0
  {
2316
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2317
0
    ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(graph, (ccv_nnc_tensor_symbol_t){
2318
0
      .graph = graph,
2319
0
      .d = d
2320
0
    });
2321
0
    size += ccv_nnc_tensor_data_size(params);
2322
0
  }
2323
0
  return size;
2324
0
}
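A hedged sketch that enumerates parameters using the accessors above; it assumes the model's parameter tensors have already been initialized (for example after a first evaluate) and that stdio.h is available for printf.

// Sketch: list parameter names, a couple of leading dimensions, and the total byte size.
const int count = ccv_cnnp_model_parameter_count(model);
int i;
for (i = 0; i < count; i++)
{
  const ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i);
  const char* const name = ccv_cnnp_model_parameter_name(model, parameter);
  const ccv_nnc_tensor_param_t info = ccv_cnnp_model_parameter_tensor_params(model, parameter);
  printf("%s: dim[0]=%d dim[1]=%d\n", name, info.dim[0], info.dim[1]);
}
printf("total parameter bytes: %llu\n", (unsigned long long)ccv_cnnp_model_parameters_size(model));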
2325
2326
int ccv_cnnp_model_parameters_move(ccv_cnnp_model_t* const model, char** const names, ccv_nnc_tensor_t** const tensors, const int count, int type)
2327
0
{
2328
0
  assert(model->compiled_data);
2329
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2330
0
  if (count != compiled_data->parameters->rnum)
2331
0
    return 0;
2332
0
  if (CCV_TENSOR_GET_DEVICE(type) == CCV_COMPUTE_DEVICE_ANY)
2333
0
    CCV_TENSOR_SET_DEVICE_ID(type, 0);
2334
0
  int i;
2335
  // We don't need to consider parallel_count, every parameter on each device is identical.
2336
0
  for (i = 0; i < count; i++)
2337
0
  {
2338
0
    ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i];
2339
0
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned, we don't do anything.
2340
0
    {
2341
0
      tensors[i] = 0;
2342
0
      continue;
2343
0
    }
2344
0
    tensor = CCV_NNC_TENSOR(tensor);
2345
0
    if (tensor->info.type == type)
2346
0
      tensors[i] = tensor;
2347
0
    else {
2348
0
      ccv_nnc_tensor_param_t info = tensor->info;
2349
0
      info.type = type;
2350
0
      tensors[i] = ccv_nnc_tensor_new(0, info, 0); // Create this tensor, don't initiate copy yet.
2351
0
    }
2352
0
  }
2353
0
  for (i = 0; i < count; i++)
2354
0
  {
2355
0
    ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i];
2356
0
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned, we don't do anything.
2357
0
      continue;
2358
0
    tensor = CCV_NNC_TENSOR(tensor);
2359
    // Now initiate the transfer. We should do this on a stream.
2360
0
    if (tensor->info.type != type)
2361
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(tensors[i]), 0);
2362
0
  }
2363
  // Copy names and remove parameters.
2364
0
  for (i = 0; i < count; i++)
2365
0
  {
2366
0
    ccv_nnc_tensor_t* const tensor = compiled_data->tensors.parameters[i];
2367
0
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned, we don't do anything.
2368
0
    {
2369
0
      names[i] = 0;
2370
0
      continue;
2371
0
    }
2372
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2373
0
    const size_t name_len = ccv_min(strnlen(name, 1023), 1023);
2374
0
    names[i] = ccmalloc(name_len + 1);
2375
0
    names[i][name_len] = 0;
2376
0
    memcpy(names[i], name, name_len);
2377
0
    compiled_data->tensors.parameters[i] = 0;
2378
0
  }
2379
0
  return 1;
2380
0
}
2381
2382
KHASH_MAP_INIT_STR(ccv_cnnp_parameter_id, int)
2383
2384
void ccv_cnnp_model_set_parameters_from_key_values(ccv_cnnp_model_t* const model, const char* const* const names, ccv_nnc_tensor_t** const tensors, const int count, const int invalidates)
2385
0
{
2386
0
  assert(model->compiled_data);
2387
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2388
0
  int i;
2389
0
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2390
0
  if (count != compiled_data->parameters->rnum)
2391
0
  {
2392
0
    id_map = kh_init(ccv_cnnp_parameter_id);
2393
    // Build the map between name and the index.
2394
0
    for (i = 0; i < count; i++)
2395
0
    {
2396
0
      int ret;
2397
0
      const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[i], &ret);
2398
0
      assert(ret != 0);
2399
0
      kh_val(id_map, k) = i;
2400
0
    }
2401
0
  }
2402
0
  const int parameter_size = compiled_data->parameters->rnum;
2403
0
  int* copy_back = 0;
2404
0
  for (i = 0; i < parameter_size; i++)
2405
0
  {
2406
0
    int j = i;
2407
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, ccv_min(count - 1, i));
2408
0
    if (strncmp(name, names[i], 1023) != 0)
2409
0
    {
2410
      // Build the map.
2411
0
      if (id_map == 0)
2412
0
      {
2413
0
        id_map = kh_init(ccv_cnnp_parameter_id);
2414
0
        for (j = 0; j < count; j++)
2415
0
        {
2416
0
          int ret;
2417
0
          const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[j], &ret);
2418
0
          assert(ret != 0);
2419
0
          kh_val(id_map, k) = j;
2420
0
        }
2421
0
      }
2422
0
      const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, name);
2423
0
      if (k == kh_end(id_map)) // Cannot find the name, skip.
2424
0
        continue;
2425
0
      j = kh_val(id_map, k);
2426
0
    }
2427
0
    if (compiled_data->tensors.parameters[i]) // Cannot be a shared parameter to read.
2428
0
      { assert(!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)); }
2429
0
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
2430
0
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
2431
0
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
2432
0
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2433
0
    if (info.type == tensors[j]->info.type) // Can move.
2434
0
    {
2435
      // Deallocate it if needed.
2436
0
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
2437
0
        if (compiled_data->tensors.parameters[i])
2438
0
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
2439
0
      compiled_data->tensors.parameters[i] = tensors[j];
2440
0
      tensors[j] = 0;
2441
0
    } else if (!compiled_data->tensors.parameters[i]) { // Not allocated, to allocate first.
2442
      // Create a new one; make sure we create it with the right parameters.
2443
0
      const int type = info.type;
2444
0
      info = tensors[j]->info;
2445
0
      info.type = type; // Revert back the type.
2446
0
      compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
2447
0
      if (!copy_back)
2448
0
        copy_back = (int*)cccalloc(parameter_size, sizeof(int));
2449
0
      copy_back[i] = j + 1;
2450
0
    }
2451
0
  }
2452
0
  if (id_map)
2453
0
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2454
  // Now do the transfer.
2455
0
  if (copy_back)
2456
0
  {
2457
0
    for (i = 0; i < parameter_size; i++)
2458
0
    {
2459
0
      ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[i]);
2460
0
      if (copy_back[i] == 0)
2461
0
        continue;
2462
0
      const int j = copy_back[i] - 1;
2463
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensors[j]), TENSOR_LIST(tensor), 0);
2464
0
    }
2465
0
    ccfree(copy_back);
2466
0
  }
2467
0
}
2468
2469
ccv_cnnp_model_io_t ccv_cnnp_model_parameter_first(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f first, void* const context)
2470
0
{
2471
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2472
0
  assert(compiled_data);
2473
0
  const int parameter_size = compiled_data->parameters->rnum;
2474
0
  int i;
2475
0
  for (i = 0; i < parameter_size; i++)
2476
0
  {
2477
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2478
0
    if (first(model, name, context))
2479
0
      return ccv_cnnp_model_parameters(model, -1, i);
2480
0
  }
2481
0
  return 0;
2482
0
}
2483
2484
ccv_array_t* ccv_cnnp_model_parameters_filter(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f filter, void* const context)
2485
0
{
2486
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2487
0
  assert(compiled_data);
2488
0
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 0, 0);
2489
0
  const int parameter_size = compiled_data->parameters->rnum;
2490
0
  int i;
2491
0
  for (i = 0; i < parameter_size; i++)
2492
0
  {
2493
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2494
0
    if (filter(model, name, context))
2495
0
    {
2496
0
      ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i);
2497
0
      ccv_array_push(parameters, &parameter);
2498
0
    }
2499
0
  }
2500
0
  return parameters;
2501
2502
0
}
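A hedged sketch of a filter callback for ccv_cnnp_model_parameters_filter. The callback shape (model, name, context) returning non-zero to select is inferred from the call sites above; the "-bias" suffix convention is only illustrative.

// Assumed callback shape, matching filter(model, name, context) above.
static int _select_bias(const ccv_cnnp_model_t* const model, const char* const name, void* const context)
{
  const size_t len = strlen(name); // Assumes string.h.
  return len >= 5 && strcmp(name + len - 5, "-bias") == 0; // Illustrative suffix only.
}
// Inside some function, after the model is compiled:
ccv_array_t* const biases = ccv_cnnp_model_parameters_filter(model, _select_bias, 0);
// ... use the ccv_cnnp_model_io_t entries stored in biases ...
ccv_array_free(biases);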
2503
2504
CCV_WARN_UNUSED(ccv_cnnp_model_io_t) ccv_cnnp_model_parameter_first_uninit(ccv_cnnp_model_t* const model)
2505
0
{
2506
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2507
0
  assert(compiled_data);
2508
0
  const int tensors_init = !!compiled_data->tensors_init.v;
2509
0
  if (!tensors_init) // If nothing initialized, we return parameter 0.
2510
0
    return ccv_cnnp_model_parameters(model, -1, 0);
2511
0
  const int parameter_size = compiled_data->parameters->rnum;
2512
0
  int i;
2513
0
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2514
0
  for (i = 0; i < parameter_size; i++)
2515
0
  {
2516
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2517
0
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
2518
0
      return ccv_cnnp_model_parameters(model, -1, i);
2519
0
  }
2520
0
  return 0;
2521
0
}
2522
2523
static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref)
2524
48
{
2525
48
  const int to_param_sel = parameters->param_sel > 0 ? 
parameters->param_sel - 10
: parameters->param_sel;
2526
48
  assert(parameters->param_sel != 0);
2527
48
  ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2528
48
  ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices);
2529
48
  *param_ref = parameters->param_ref > 0 ? 
parameters->param_ref - 10
: parameters->param_ref;
2530
48
  return to_parameter_indices;
2531
48
}
2532
2533
static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref, const int only_init_0)
2534
14
{
2535
  // If the model is not compiled yet, compile it now.
2536
14
  if (!model->graph)
2537
3
  {
2538
3
    model->graph = ccv_nnc_symbolic_graph_new();
2539
3
    assert(from_model->compiled_data);
2540
3
    const int input_size = from_model->input_size;
2541
3
    ccv_nnc_tensor_param_t input_params[input_size];
2542
3
    int i;
2543
9
    for (i = 0; i < input_size; 
i++6
)
2544
6
      input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]);
2545
3
    _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss);
2546
3
    model->parallel_count = from_model->parallel_count;
2547
3
    model->memory_compression = from_model->memory_compression;
2548
3
    model->memory_reduction = from_model->memory_reduction;
2549
3
    model->gradient_checkpointing = from_model->gradient_checkpointing;
2550
3
    model->compiled_data->stream_type = from_model->compiled_data->stream_type;
2551
3
    model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer;
2552
3
    model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size;
2553
3
  }
2554
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2555
14
  assert(to_compiled_data);
2556
14
  const int to_tensors_init = !!to_compiled_data->tensors_init.v;
2557
14
  if (!to_tensors_init)
2558
10
  {
2559
10
    if (only_init_0)
2560
1
      ccv_cnnp_model_tensors_init_0(model, to_compiled_data);
2561
9
    else
2562
9
      _ccv_cnnp_model_tensors_init(model, to_compiled_data);
2563
10
  } else 
if (4
!only_init_04
&&
(uintptr_t)to_compiled_data->tensors_init.v & (uintptr_t)13
)
2564
    // Check if it is not fully allocated; if it is not, run init_1.
2565
0
      ccv_cnnp_model_tensors_init_1(model, to_compiled_data);
2566
14
  assert(to_compiled_data->tensors.parameters);
2567
14
  *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref);
2568
14
  *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref);
2569
14
  if (*from_param_ref < 0 && *param_ref >= 0)
2570
0
    { assert((*from_parameter_indices)->rnum == 1); }
2571
14
  else if (*from_param_ref >= 0)
2572
0
    { assert(*from_param_ref < (*from_parameter_indices)->rnum); }
2573
14
  if (*param_ref < 0 && *from_param_ref >= 0)
2574
0
    { assert((*parameter_indices)->rnum == 1); }
2575
14
  else if (*param_ref >= 0)
2576
0
    { assert(*param_ref < (*parameter_indices)->rnum); }
2577
14
}
2578
2579
void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2580
9
{
2581
9
  ccv_array_t* to_parameter_indices;
2582
9
  int to_param_ref;
2583
9
  ccv_array_t* from_parameter_indices;
2584
9
  int from_param_ref;
2585
9
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2586
  // Should be exactly the same tensor.
2587
9
  if (to_param_ref < 0 && from_param_ref < 0)
2588
9
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2589
  // To models.
2590
9
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2591
9
  assert(to_compiled_data);
2592
  // From models.
2593
9
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2594
9
  const int parallel_count = ccv_max(model->parallel_count, 1);
2595
9
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2596
9
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2597
9
  int i, j;
2598
9
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2599
9
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2600
18
  for (i = 0; i < rnum; 
i++9
)
2601
9
  {
2602
9
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2603
9
    assert(src_d >= 0);
2604
9
    assert(src_d < from_compiled_data->parameters->rnum);
2605
9
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2606
    // If the original is not init'ed, we cannot copy from it.
2607
9
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2608
0
      continue;
2609
9
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2610
9
    assert(dest_d >= 0);
2611
9
    assert(dest_d < to_compiled_data->parameters->rnum);
2612
9
    ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2613
9
    assert(src);
2614
9
    ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2615
9
    assert(dest);
2616
9
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0);
2617
27
    for (j = 1; j < parallel_count; 
j++18
)
2618
18
    {
2619
18
      ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2620
18
      if (copy_tensor)
2621
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2622
18
    }
2623
    // Mark this symbol as init'ed.
2624
9
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2625
9
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2626
9
  }
2627
9
  ccv_array_free(to_parameter_indices);
2628
9
  ccv_array_free(from_parameter_indices);
2629
9
}
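A hedged sketch of copying every parameter from one model into another with the function above. src_model and dst_model are hypothetical; passing -1 for both selector and index selects all parameters.

// Sketch: dst_model takes a full copy of src_model's parameter values.
ccv_cnnp_model_set_parameters(dst_model,
  ccv_cnnp_model_parameters(dst_model, -1, -1),
  src_model,
  ccv_cnnp_model_parameters(src_model, -1, -1));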
2630
2631
void ccv_cnnp_model_share_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_cnnp_model_parameters_renamer_f renamer, void* const context)
2632
2
{
2633
2
  ccv_array_t* to_parameter_indices;
2634
2
  int to_param_ref;
2635
2
  ccv_array_t* from_parameter_indices;
2636
2
  int from_param_ref;
2637
2
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 1);
2638
  // Should be exactly the same tensor.
2639
2
  if (renamer == 0 && 
to_param_ref < 01
&&
from_param_ref < 01
)
2640
1
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2641
  // To models.
2642
2
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2643
2
  assert(to_compiled_data);
2644
  // From models.
2645
2
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2646
2
  const int parallel_count = ccv_max(model->parallel_count, 1);
2647
2
  assert(parallel_count == ccv_max(from_model->parallel_count, 1)); // Must have the same parallel count to share parameters.
2648
2
  const int from_parameter_size = from_compiled_data->parameters->rnum;
2649
2
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2650
2
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2651
2
  int i, j;
2652
2
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2653
2
  char* updated_name = 0;
2654
2
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2655
2
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2656
8
  for (i = 0; i < rnum; 
i++6
)
2657
6
  {
2658
6
    int src_d = (from_param_ref >= 0 ? 
from_param_ref0
: i) < from_parameter_indices->rnum ?
*(int*)4
ccv_array_get4
(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i) :
from_parameter_size2
;
2659
    // Need to figure out how to use the renamer here.
2660
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2661
6
    assert(dest_d >= 0);
2662
6
    assert(dest_d < to_parameter_size);
2663
6
    if (renamer)
2664
3
    {
2665
3
      const char* const src_name = (src_d < from_parameter_size && 
src_d >= 01
) ?
*(char**)1
ccv_array_get1
(from_compiled_data->ids.parameters, src_d) :
02
;
2666
3
      const char* const dest_name = *(char**)ccv_array_get(to_compiled_data->ids.parameters, dest_d);
2667
3
      if (!updated_name)
2668
1
        updated_name = (char*)ccmalloc(1024);
2669
3
      const size_t src_name_len = src_name == 0 ? 
02
:
ccv_min1
(strnlen(src_name, 1023), 1023);
2670
3
      if (src_name_len > 0)
2671
1
        memcpy(updated_name, src_name, src_name_len);
2672
3
      updated_name[src_name_len] = 0;
2673
3
      if (renamer(context, dest_name, updated_name, 1024) != 0)
2674
0
        continue; // Skip this.
2675
3
      if (src_name != 0 && 
memcmp(updated_name, src_name, src_name_len) == 01
&&
strnlen(updated_name, 1023) == src_name_len0
)
2676
0
      {
2677
        // Nothing changed.
2678
3
      } else {
2679
3
        if (!id_map)
2680
1
        {
2681
1
          id_map = kh_init(ccv_cnnp_parameter_id);
2682
2
          for (j = 0; j < from_parameter_size; 
j++1
)
2683
1
          {
2684
1
            int ret;
2685
1
            const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, *(char**)ccv_array_get(from_compiled_data->ids.parameters, j), &ret);
2686
1
            assert(ret != 0);
2687
1
            kh_val(id_map, k) = j;
2688
1
          }
2689
1
        }
2690
3
        const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, updated_name);
2691
3
        if (k == kh_end(id_map)) // Cannot find the name, skip.
2692
2
          continue;
2693
1
        src_d = kh_val(id_map, k);
2694
1
        assert(src_d >= 0);
2695
1
        assert(src_d < from_parameter_size);
2696
1
      }
2697
3
    }
2698
6
    assert
(src_d >= 0)4
;
2699
4
    assert(src_d < from_parameter_size);
2700
4
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2701
    // If the original is not init'ed, we cannot share from it.
2702
4
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2703
0
      continue;
2704
8
    
for (j = 0; 4
j < parallel_count;
j++4
)
2705
4
    {
2706
4
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * from_parameter_size]);
2707
4
      assert(src);
2708
4
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2709
4
      if (dest && 
!((uintptr_t)dest & (uintptr_t)1)1
)
2710
1
        ccv_nnc_tensor_free(dest);
2711
4
      to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size] = (ccv_nnc_tensor_t*)((uintptr_t)src | (uintptr_t)1);
2712
4
    }
2713
    // Mark this symbol as init'ed.
2714
4
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2715
4
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2716
4
  }
2717
2
  ccv_array_free(to_parameter_indices);
2718
2
  ccv_array_free(from_parameter_indices);
2719
2
  if (id_map)
2720
1
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2721
2
  if (updated_name)
2722
1
    ccfree(updated_name);
2723
  // Mark it as incomplete so we will call init_1.
2724
2
  if (ccv_cnnp_model_tensors_any_to_alloc(model, to_compiled_data))
2725
0
    to_compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)to_compiled_data->tensors_init.v | (uintptr_t)1);
2726
2
  else // Remove the flag.
2727
2
    to_compiled_data->tensors_init.v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2728
2
}
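A hedged sketch of a renamer callback for ccv_cnnp_model_share_parameters. Its shape is inferred from the renamer(context, dest_name, updated_name, 1024) call above: it rewrites updated_name (inside the provided buffer) to the source parameter name to share from, returning 0 to proceed and non-zero to skip. The encoder/decoder prefixes are illustrative.

// Assumed callback shape; dest_name is the destination parameter's id, updated_name holds the
// current source candidate and is looked up in the source model's ids after the call.
static int _tie_to_encoder(void* const context, const char* const dest_name, char* const updated_name, const size_t provided_size)
{
  if (strncmp(dest_name, "decoder-", 8) == 0) // Assumes string.h / stdio.h.
  {
    snprintf(updated_name, provided_size, "encoder-%s", dest_name + 8);
    return 0; // Proceed with the rewritten name.
  }
  return 1; // Skip parameters we don't want to tie.
}
// Inside some function: share (not copy) storage from src_model into dst_model.
ccv_cnnp_model_share_parameters(dst_model,
  ccv_cnnp_model_parameters(dst_model, -1, -1),
  src_model,
  ccv_cnnp_model_parameters(src_model, -1, -1),
  _tie_to_encoder, 0);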
2729
2730
ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type)
2731
24
{
2732
24
  if (!compiled_data->stream_map)
2733
4
    compiled_data->stream_map = kh_init(stream_map);
2734
24
  int ret = 0;
2735
24
  khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret);
2736
24
  assert(ret >= 0);
2737
24
  ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k);
2738
  // If ret == 0, the key already exists and we can return directly; otherwise, create and return.
2739
24
  if (ret != 0)
2740
16
  {
2741
16
    stream = ccv_nnc_stream_context_new(type);
2742
16
    kh_val(compiled_data->stream_map, k) = stream;
2743
16
  }
2744
24
  return stream;
2745
24
}
2746
2747
void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2748
3
{
2749
3
  ccv_array_t* to_parameter_indices;
2750
3
  int to_param_ref;
2751
3
  ccv_array_t* from_parameter_indices;
2752
3
  int from_param_ref;
2753
3
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2754
  // Should be exactly the same tensor.
2755
3
  if (to_param_ref < 0 && from_param_ref < 0)
2756
3
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2757
  // To models.
2758
3
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2759
3
  assert(to_compiled_data);
2760
  // From models.
2761
3
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2762
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
2763
3
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2764
3
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2765
3
  assert(aux_in_size >= 0);
2766
3
  assert(aux_out_size >= 0);
2767
3
  int i, j;
2768
3
  ccv_nnc_tensor_t* inputs[aux_in_size + 2];
2769
3
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2770
3
  for (i = 0; i < aux_in_size; 
i++0
)
2771
0
    inputs[i + 2] = aux_ins[i];
2772
3
  for (i = 0; i < aux_out_size; 
i++0
)
2773
0
    outputs[i + 1] = aux_outs[i];
2774
3
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2775
3
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2776
6
  for (i = 0; i < rnum; 
i++3
)
2777
3
  {
2778
3
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2779
3
    assert(src_d >= 0);
2780
3
    assert(src_d < from_compiled_data->parameters->rnum);
2781
3
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2782
    // If the original is not init'ed, we cannot copy from it.
2783
3
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2784
0
      continue;
2785
3
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2786
3
    assert(dest_d >= 0);
2787
3
    assert(dest_d < to_compiled_data->parameters->rnum);
2788
3
    if (parallel_count > 1)
2789
2
    {
2790
2
      ccv_nnc_stream_context_t* streams[parallel_count];
2791
2
      ccv_nnc_stream_signal_t* signal;
2792
2
      if (stream_context)
2793
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2794
10
      for (j = 0; j < parallel_count; 
j++8
)
2795
8
      {
2796
8
        ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * to_parameter_size]);
2797
8
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2798
8
        if (!dest || !src)
2799
0
        {
2800
0
          streams[j] = 0;
2801
0
          continue;
2802
0
        }
2803
        // At the moment, we can only handle them on the same device.
2804
8
        assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type));
2805
8
        assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type));
2806
8
        const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2807
8
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type);
2808
8
        int type = stream_type;
2809
8
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2810
8
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2811
        // Wait signal to finish.
2812
8
        if (stream_context)
2813
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2814
8
        inputs[0] = outputs[0] = dest;
2815
8
        inputs[1] = src;
2816
8
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0);
2817
8
        if (stream_context)
2818
4
        {
2819
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2820
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2821
4
        }
2822
8
        streams[j] = stream_0;
2823
8
      }
2824
      // If this should be blocking, block on it.
2825
2
      if (!stream_context)
2826
5
        
for (j = 0; 1
j < parallel_count;
j++4
)
2827
4
          if (streams[j])
2828
4
            ccv_nnc_stream_context_wait(streams[j]);
2829
2
    } else {
2830
1
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2831
1
      assert(src);
2832
1
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2833
1
      assert(dest);
2834
1
      inputs[0] = outputs[0] = dest;
2835
1
      inputs[1] = src;
2836
1
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context);
2837
1
    }
2838
    // Mark this symbol as init'ed.
2839
3
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2840
3
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2841
3
  }
2842
3
  ccv_array_free(to_parameter_indices);
2843
3
  ccv_array_free(from_parameter_indices);
2844
3
}
2845
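A minimal usage sketch for ccv_cnnp_model_parameters_zip_map, assuming two already-compiled models with identical parameter layouts, and assuming the element-wise CMD_ADD_FORWARD(a, b) command macro and the ccv_nnc_no_hint constant from ccv_nnc.h. Because the function binds dest as both inputs[0] and outputs[0] and src as inputs[1], ADD here computes dest = 0.999 * dest + 0.001 * src, i.e. an EMA-style blend:

  static void ema_update(ccv_cnnp_model_t* const ema_model, ccv_cnnp_model_t* const train_model)
  {
    // Select every parameter of both models (negative selector / index means ALL_PARAMETERS).
    const ccv_cnnp_model_io_t to_params = ccv_cnnp_model_parameters(ema_model, -1, -1);
    const ccv_cnnp_model_io_t from_params = ccv_cnnp_model_parameters(train_model, -1, -1);
    // No aux inputs / outputs, no stream context (blocking call).
    ccv_cnnp_model_parameters_zip_map(ema_model, to_params, CMD_ADD_FORWARD(0.999, 0.001), ccv_nnc_no_hint, 0,
      0, 0, 0, 0, 0, train_model, from_params);
  }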
2846
void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2847
14
{
2848
14
  int to_param_ref;
2849
14
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2850
  // To models.
2851
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2852
14
  assert(to_compiled_data);
2853
  // Tensors have to be init'ed already.
2854
14
  assert(!!to_compiled_data->tensors_init.v);
2855
14
  assert(to_compiled_data->tensors.parameters);
2856
  // From models.
2857
14
  const int parallel_count = ccv_max(model->parallel_count, 1);
2858
14
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2859
14
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2860
14
  assert(aux_in_size >= 0);
2861
14
  assert(aux_out_size >= 0);
2862
14
  int i, j;
2863
14
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2864
14
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2865
14
  for (i = 0; i < aux_in_size; 
i++0
)
2866
0
    inputs[i + 1] = aux_ins[i];
2867
14
  for (i = 0; i < aux_out_size; 
i++0
)
2868
0
    outputs[i + 1] = aux_outs[i];
2869
28
  for (i = 0; i < rnum; 
i++14
)
2870
14
  {
2871
14
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2872
14
    assert(dest_d >= 0);
2873
14
    assert(dest_d < to_compiled_data->parameters->rnum);
2874
14
    if (parallel_count > 1)
2875
4
    {
2876
4
      ccv_nnc_stream_context_t* streams[parallel_count];
2877
4
      ccv_nnc_stream_signal_t* signal;
2878
4
      if (stream_context)
2879
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2880
20
      for (j = 0; j < parallel_count; 
j++16
)
2881
16
      {
2882
16
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2883
16
        if (!dest)
2884
0
        {
2885
0
          streams[j] = 0;
2886
0
          continue;
2887
0
        }
2888
16
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2889
16
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2890
16
        int type = stream_type;
2891
16
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2892
16
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2893
        // Wait signal to finish.
2894
16
        if (stream_context)
2895
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2896
16
        inputs[0] = outputs[0] = dest;
2897
16
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2898
16
        if (stream_context)
2899
4
        {
2900
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2901
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2902
4
        }
2903
16
        streams[j] = stream_0;
2904
16
      }
2905
      // If this should be blocking, block on it.
2906
4
      if (!stream_context)
2907
15
        
for (j = 0; 3
j < parallel_count;
j++12
)
2908
12
          if (streams[j])
2909
12
            ccv_nnc_stream_context_wait(streams[j]);
2910
10
    } else {
2911
10
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2912
10
      assert(dest);
2913
10
      inputs[0] = outputs[0] = dest;
2914
10
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2915
10
    }
2916
    // No need to mark this symbol as init'ed, it is already.
2917
14
  }
2918
14
  ccv_array_free(to_parameter_indices);
2919
14
}
2920
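A similar sketch for the single-model variant ccv_cnnp_model_parameters_map, assuming the CMD_SET_FORWARD(v) command macro and ccv_nnc_no_hint. Each selected parameter tensor is passed as both input and output, so SET overwrites it in place with a constant:

  static void fill_parameters(ccv_cnnp_model_t* const model, const float value)
  {
    ccv_cnnp_model_parameters_map(model, ccv_cnnp_model_parameters(model, -1, -1),
      CMD_SET_FORWARD(value), ccv_nnc_no_hint, 0,
      0, 0, /* aux_ins, aux_in_size */
      0, 0, /* aux_outs, aux_out_size */
      0 /* stream_context: blocking */);
  }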
2921
void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2922
6
{
2923
6
  int to_param_ref;
2924
6
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2925
  // To models.
2926
6
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2927
6
  assert(to_compiled_data);
2928
  // Tensors have to be init'ed already.
2929
6
  assert(!!to_compiled_data->tensors_init.v);
2930
6
  ccv_nnc_tensor_t** tensor_gradients;
2931
6
  if (to_compiled_data->backward.count > 1)
2932
3
    tensor_gradients = to_compiled_data->tensors.accum_gradients;
2933
3
  else
2934
3
    tensor_gradients = to_compiled_data->tensors.gradients;
2935
6
  assert(tensor_gradients);
2936
  // From models.
2937
6
  const int parallel_count = ccv_max(model->parallel_count, 1);
2938
6
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2939
6
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2940
6
  assert(aux_in_size >= 0);
2941
6
  assert(aux_out_size >= 0);
2942
6
  int i, j;
2943
6
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2944
6
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2945
10
  for (i = 0; i < aux_in_size; 
i++4
)
2946
4
    inputs[i + 1] = aux_ins[i];
2947
14
  for (i = 0; i < aux_out_size; 
i++8
)
2948
8
    outputs[i + 1] = aux_outs[i];
2949
12
  for (i = 0; i < rnum; 
i++6
)
2950
6
  {
2951
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2952
6
    assert(dest_d >= 0);
2953
6
    assert(dest_d < to_compiled_data->parameters->rnum);
2954
6
    if (parallel_count > 1)
2955
0
    {
2956
0
      ccv_nnc_stream_context_t* streams[parallel_count];
2957
0
      ccv_nnc_stream_signal_t* signal;
2958
0
      if (stream_context)
2959
0
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2960
0
      for (j = 0; j < parallel_count; j++)
2961
0
      {
2962
0
        ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size];
2963
0
        if (!dest)
2964
0
        {
2965
0
          streams[j] = 0;
2966
0
          continue;
2967
0
        }
2968
0
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2969
0
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2970
0
        int type = stream_type;
2971
0
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2972
0
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2973
        // Wait signal to finish.
2974
0
        if (stream_context)
2975
0
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2976
0
        inputs[0] = outputs[0] = dest;
2977
0
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2978
0
        if (stream_context)
2979
0
        {
2980
0
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2981
0
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2982
0
        }
2983
0
        streams[j] = stream_0;
2984
0
      }
2985
      // If this should be blocking, block on it.
2986
0
      if (!stream_context)
2987
0
        for (j = 0; j < parallel_count; j++)
2988
0
          if (streams[j])
2989
0
            ccv_nnc_stream_context_wait(streams[j]);
2990
6
    } else {
2991
6
      ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d];
2992
6
      if (!dest)
2993
0
        continue;
2994
6
      assert(dest);
2995
6
      inputs[0] = outputs[0] = dest;
2996
6
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2997
6
    }
2998
    // No need to mark this symbol as init'ed, it is already.
2999
6
  }
3000
6
  ccv_array_free(to_parameter_indices);
3001
6
}
3002
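A sketch of ccv_cnnp_model_parameter_gradients_map under the same assumptions (CMD_SET_FORWARD, ccv_nnc_no_hint); it clears whichever gradient buffers are currently in effect (accum_gradients when backward.count > 1, otherwise gradients) for every parameter:

  static void zero_parameter_gradients(ccv_cnnp_model_t* const model)
  {
    ccv_cnnp_model_parameter_gradients_map(model, ccv_cnnp_model_parameters(model, -1, -1),
      CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0,
      0, 0, 0, 0, 0);
  }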
3003
ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model)
3004
2.20k
{
3005
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3006
2.20k
  assert(compiled_data);
3007
2.20k
  return compiled_data->minimize.minimizer;
3008
2.20k
}
3009
3010
void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size)
3011
4.36k
{
3012
4.36k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3013
4.36k
  assert(compiled_data);
3014
4.36k
  const int parameter_size = compiled_data->parameters->rnum;
3015
4.36k
  if (parameter_size == 0)
3016
3
    return;
3017
4.35k
  if (reset)
3018
2.49k
    { assert(set_parameters == 0 && set_parameter_size == 0); }
3019
4.35k
  const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
3020
4.35k
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
3021
4.35k
  if (saved_aux_size > compiled_data->minimize.max_saved_aux_size)
3022
7
    compiled_data->minimize.max_saved_aux_size = saved_aux_size;
3023
4.35k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
3024
  // We update all parameters; at this point, we have one minimizer.
3025
4.35k
  if (set_parameters == 0 || 
set_parameter_size == 0301
)
3026
4.05k
    compiled_data->minimize.minimizer = minimizer;
3027
4.35k
  int i;
3028
4.35k
  if (set_parameters && 
set_parameter_size301
)
3029
301
  {
3030
    // We need to save which minimizer goes along with these parameters.
3031
301
    if (!compiled_data->minimize.parameters)
3032
5
      compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0);
3033
301
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t));
3034
301
    set_minimizer_for_parameter->minimizer = minimizer;
3035
301
    set_minimizer_for_parameter->parameter_size = set_parameter_size;
3036
301
    memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size);
3037
301
    ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter);
3038
301
  }
3039
  // If reset is true, clear the parameters array.
3040
4.35k
  if (reset && 
compiled_data->minimize.parameters2.49k
)
3041
291
  {
3042
582
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++291
)
3043
291
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
3044
291
    ccv_array_clear(compiled_data->minimize.parameters);
3045
291
  }
3046
4.35k
  if (!compiled_data->update_nodes)
3047
9
    return;
3048
4.34k
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
3049
4.34k
  assert(symbolic_graph);
3050
4.34k
  if (saved_aux_size > old_max_saved_aux_size)
3051
7
  {
3052
7
    assert(compiled_data->updated_parameters);
3053
    // Reallocate first, move them around later.
3054
7
    compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size);
3055
7
    compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
3056
7
    compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
3057
    // We need to do this from back to front because saved_aux_size > old_saved_aux_size, so the regions could overlap.
3058
7
    _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size);
3059
7
  }
3060
4.34k
  int flag = 0;
3061
4.34k
  const int parallel_count = ccv_max(model->parallel_count, 1);
3062
4.34k
  if (set_parameters && 
set_parameter_size296
)
3063
296
  {
3064
296
    ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
3065
592
    for (i = 0; i < set_parameter_size; 
i++296
)
3066
296
    {
3067
296
      const int param_sel = set_parameters[i]->param_sel > 0 ? 
set_parameters[i]->param_sel - 1291
:
set_parameters[i]->param_sel5
;
3068
296
      assert(set_parameters[i]->param_sel != 0);
3069
296
      const int old_rnum = parameter_indices->rnum;
3070
296
      ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices);
3071
296
      const int param_ref = set_parameters[i]->param_ref > 0 ? 
set_parameters[i]->param_ref - 10
: set_parameters[i]->param_ref;
3072
296
      assert(set_parameters[i]->param_ref != 0);
3073
296
      if (param_ref >= 0)
3074
0
      {
3075
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
3076
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
3077
0
        parameter_indices->rnum = old_rnum + 1;
3078
0
      }
3079
296
    }
3080
    // We may have duplicated indices, but that is OK; we will simply set those twice.
3081
5.24k
    
for (i = 0; 296
i < parameter_indices->rnum;
i++4.95k
)
3082
4.95k
    {
3083
4.95k
      const int d = *(int*)ccv_array_get(parameter_indices, i);
3084
4.95k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d))
3085
0
        flag = 1;
3086
4.95k
    }
3087
296
    ccv_array_free(parameter_indices);
3088
4.05k
  } else {
3089
19.1k
    for (i = 0; i < parameter_size; 
i++15.0k
)
3090
15.0k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i))
3091
65
        flag = 1;
3092
4.05k
    if (compiled_data->minimize.parameters)
3093
291
      if (_ccv_cnnp_apply_parameters_with_minimizer(model))
3094
0
        flag = 1;
3095
4.05k
  }
3096
4.34k
  if (flag)
3097
7
  {
3098
    // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply-gradients graph.
3099
7
    if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE)
3100
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
3101
7
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3102
7
  }
3103
4.34k
}
3104
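A sketch of per-parameter minimizers with ccv_cnnp_model_set_minimizer. The exact argument layout of CMD_SGD_FORWARD is assumed here, and backbone_params stands for a hypothetical selection obtained from ccv_cnnp_model_parameters on a sub-model. The first call resets every parameter to the base optimizer (reset requires passing no parameter list), the second overrides only the selected subset:

  static void set_layerwise_learning_rates(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t backbone_params)
  {
    // Base SGD for all parameters (reset = 1, so no parameter list may be given).
    ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.01, 1, 0.001, 0.9, 0.9), 1, 0, 0);
    // Smaller learning rate for the backbone subset only.
    ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.001, 1, 0.001, 0.9, 0.9), 0, &backbone_params, 1);
  }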
3105
void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params)
3106
0
{
3107
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3108
0
  assert(compiled_data);
3109
0
  compiled_data->compile_params = compile_params;
3110
0
}
3111
3112
void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size)
3113
45
{
3114
45
  if (model->graph && 
out_size > 044
)
3115
44
    ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]);
3116
45
  if (model->compiled_data && 
model->compiled_data->graph44
&&
out_size > 116
)
3117
0
    ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]);
3118
45
  if (model->compiled_data && 
model->compiled_data->backward.accum44
&&
out_size > 20
)
3119
0
    ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]);
3120
45
  if (model->compiled_data && 
model->compiled_data->apply_gradients.graph44
&&
out_size > 33
)
3121
0
    ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]);
3122
45
}
3123
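A sketch of dumping the graphs with ccv_cnnp_model_dot; CCV_NNC_LONG_DOT_GRAPH is assumed to be the verbose dot flag. outs[0] receives the symbolic graph and outs[1] the jitted concrete graph, if one exists:

  static void dump_graphs(const ccv_cnnp_model_t* const model)
  {
    FILE* outs[2];
    outs[0] = fopen("symbolic.dot", "w+");
    outs[1] = fopen("concrete.dot", "w+");
    ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, outs, 2);
    fclose(outs[0]);
    fclose(outs[1]);
  }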
3124
void ccv_cnnp_model_format(const ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_format_f format_fn, void* const context)
3125
0
{
3126
0
  if (model->graph)
3127
0
    ccv_nnc_symbolic_graph_format(model->graph, 0, 0, 0, 0, format_fn, context);
3128
0
}
3129
3130
static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
3131
2.29k
{
3132
2.29k
  int i;
3133
2.29k
  const int parameter_size = compiled_data->parameters->rnum;
3134
2.29k
  ccv_array_free(compiled_data->parameters);
3135
2.29k
  if (compiled_data->parameter_flags)
3136
10
    ccfree(compiled_data->parameter_flags);
3137
2.29k
  const int internal_size = compiled_data->internals->rnum;
3138
2.29k
  ccv_array_free(compiled_data->internals);
3139
2.29k
  assert(compiled_data->ids.parameters->rnum == parameter_size);
3140
2.29k
  assert(compiled_data->ids.internals->rnum == internal_size);
3141
5.25k
  
for (i = 0; 2.29k
i < parameter_size;
i++2.95k
)
3142
2.95k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i));
3143
2.29k
  ccv_array_free(compiled_data->ids.parameters);
3144
2.46k
  for (i = 0; i < internal_size; 
i++161
)
3145
161
    ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i));
3146
2.29k
  ccv_array_free(compiled_data->ids.internals);
3147
2.29k
  const int parallel_count = ccv_max(model->parallel_count, 1);
3148
2.29k
  if (compiled_data->tensors.parameters)
3149
90
  {
3150
790
    for (i = 0; i < parameter_size * parallel_count; 
i++700
)
3151
      // If it is not marked as borrowed (not belonging to us), we can free it.
3152
700
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
3153
696
        if (compiled_data->tensors.parameters[i])
3154
696
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
3155
245
    for (i = 0; i < internal_size * parallel_count; 
i++155
)
3156
155
      if (compiled_data->tensors.internals[i])
3157
155
        ccv_nnc_tensor_free(compiled_data->tensors.internals[i]);
3158
90
    ccfree(compiled_data->tensors.parameters);
3159
90
  }
3160
2.29k
  if (compiled_data->tensors.gradients)
3161
28
  {
3162
355
    for (i = 0; i < parameter_size * parallel_count; 
i++327
)
3163
327
    {
3164
327
      if (compiled_data->tensors.gradients[i])
3165
325
        ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]);
3166
327
      if (compiled_data->tensors.accum_gradients[i])
3167
15
        ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]);
3168
327
    }
3169
28
    ccfree(compiled_data->tensors.gradients);
3170
28
  }
3171
2.29k
  if (compiled_data->minimize.parameters)
3172
5
  {
3173
15
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++10
)
3174
10
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
3175
5
    ccv_array_free(compiled_data->minimize.parameters);
3176
5
  }
3177
2.29k
  if (compiled_data->rewindables)
3178
41
    ccv_array_free(compiled_data->rewindables);
3179
2.29k
  if (compiled_data->tensors_init.v)
3180
90
    ccfree(CCV_NNC_INIT_V(compiled_data->tensors_init.v));
3181
2.29k
  if (compiled_data->evaluate.tos)
3182
2.29k
    ccfree(compiled_data->evaluate.tos);
3183
2.29k
  compiled_data->evaluate.tos = 0;
3184
2.29k
  if (compiled_data->stream_map)
3185
4
  {
3186
4
    khiter_t k;
3187
36
    for (k = 
kh_begin4
(compiled_data->stream_map); k != kh_end(compiled_data->stream_map);
++k32
)
3188
32
    {
3189
32
      if (!kh_exist(compiled_data->stream_map, k))
3190
16
        continue;
3191
16
      ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k);
3192
16
      ccv_nnc_stream_context_free(stream);
3193
16
    }
3194
4
    kh_destroy(stream_map, compiled_data->stream_map);
3195
4
  }
3196
2.29k
  _ccv_cnnp_compiled_data_graph_free(compiled_data);
3197
2.29k
  _ccv_cnnp_compiled_data_gradient_free(compiled_data);
3198
2.29k
  _ccv_cnnp_compiled_data_backward_free(compiled_data);
3199
2.29k
  _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3200
2.29k
  if (compiled_data->gradient_checkpoints)
3201
2
  {
3202
4
    for (i = 0; i < compiled_data->gradient_checkpoints->rnum; 
i++2
)
3203
2
    {
3204
2
      ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(compiled_data->gradient_checkpoints, i);
3205
2
      assert(checkpoint->inputs);
3206
2
      ccfree(checkpoint->inputs);
3207
2
      ccv_array_free(checkpoint->tensor_symbols);
3208
2
    }
3209
2
    ccv_array_free(compiled_data->gradient_checkpoints);
3210
2
  }
3211
2.29k
  ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc);
3212
2.29k
  ccfree(compiled_data);
3213
2.29k
}
3214
3215
void ccv_cnnp_model_free(ccv_cnnp_model_t* const model)
3216
5.41k
{
3217
5.41k
  ccv_cnnp_model_deinit(model);
3218
5.41k
  if (model->isa->dealloc)
3219
1.21k
    model->isa->dealloc(model);
3220
5.41k
  if (model->io)
3221
773
  {
3222
773
    int i;
3223
1.91k
    for (i = 0; i < model->io->rnum; 
i++1.13k
)
3224
1.13k
    {
3225
1.13k
      ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i);
3226
1.13k
      if (model_io->outgoings)
3227
634
        ccv_array_free(model_io->outgoings);
3228
1.13k
      if (model_io->incomings)
3229
579
        ccv_array_free(model_io->incomings);
3230
1.13k
      if (model_io->dependencies)
3231
2
        ccv_array_free(model_io->dependencies);
3232
1.13k
      ccfree(model_io);
3233
1.13k
    }
3234
773
    ccv_array_free(model->io);
3235
773
  }
3236
5.41k
  if (model->parameter_indices)
3237
2.52k
    ccv_array_free(model->parameter_indices);
3238
5.41k
  if (model->inputs)
3239
2.29k
    ccfree(model->inputs);
3240
5.41k
  if (model->graph)
3241
2.29k
    ccv_nnc_symbolic_graph_free(model->graph);
3242
5.41k
  if (model->compiled_data)
3243
2.29k
    _ccv_cnnp_compiled_data_free(model, model->compiled_data);
3244
5.41k
  if (model->name)
3245
204
    ccfree(model->name);
3246
5.41k
  ccfree(model);
3247
5.41k
}
3248
3249
void ccv_cnnp_model_cancel(ccv_cnnp_model_t* const model)
3250
0
{
3251
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3252
0
  if (!compiled_data)
3253
0
    return;
3254
0
  if (compiled_data->graph)
3255
0
    ccv_nnc_graph_cancel(compiled_data->graph);
3256
0
  if (compiled_data->apply_gradients.graph)
3257
0
    ccv_nnc_graph_cancel(compiled_data->apply_gradients.graph);
3258
0
}
3259
3260
void ccv_cnnp_model_set_flags(ccv_cnnp_model_t* const model, const int flags)
3261
0
{
3262
0
  model->exec_flags = flags;
3263
0
}
3264
3265
int ccv_cnnp_model_flags(ccv_cnnp_model_t* const model)
3266
0
{
3267
0
  return model->exec_flags;
3268
0
}