Coverage Report

Created: 2025-05-28 16:14

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_model.c
Line|Count|Source
   1||#include "ccv_nnc.h"
   2||#include "ccv_nnc_easy.h"
   3||#include "ccv_nnc_internal.h"
   4||#include "ccv_internal.h"
   5||#include "_ccv_cnnp_model.h"
   6||#include "_ccv_nnc_graph.h"
   7||
   8||// MARK - Level-5 API
   9||
  10||ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size)
  11|557|{
  12|557|  if (!model->io)
  13|548|    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  14|557|  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size);
  15|557|  model_io->param_ref = 0;
  16|557|  model_io->param_sel = 0;
  17|557|  model_io->visit = 0;
  18|557|  model_io->model = model;
  19|557|  model_io->dependencies = 0;
  20|557|  model_io->dependents = 0;
  21|557|  model_io->outgoings = 0;
  22|557|  model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1);
  23|557|  ccv_array_push(model->io, &model_io);
  24|557|  if (input_size > 0)
  25|554|  {
  26|554|    model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), input_size, 0);
  27|554|    ccv_array_resize(model_io->incomings, input_size);
  28|554|    int i;
  29|554|    memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size);
  30|1.25k|    for (i = 0; i < input_size; i++)
  31|700|    {
  32|700|      if (!inputs[i]->outgoings)
  33|608|        inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  34|700|      ccv_array_push(inputs[i]->outgoings, &model_io);
  35|700|    }
  36|554|  } else {
  37|3|    model_io->incomings = 0;
  38|3|  }
  39|557|  return model_io;
  40|557|}
  41||
  42||void ccv_cnnp_model_add_dependencies(ccv_cnnp_model_io_t model_io, const ccv_cnnp_model_io_t* const dependencies, const int dependency_size)
  43|2|{
  44|2|  assert(dependency_size > 0);
  45|2|  if (!model_io->dependencies)
  46|2|    model_io->dependencies = ccv_array_new(sizeof(ccv_cnnp_model_io_t), dependency_size, 0);
  47|2|  int i, j;
  48|5|  for (i = 0; i < dependency_size; i++)
  49|3|  {
  50|3|    int flag = 0;
  51||    // Check if it is already exist or not.
  52|4|    for (j = 0; !flag && j < model_io->dependencies->rnum; j++)
  53|1|      if (*(ccv_cnnp_model_io_t*)ccv_array_get(model_io->dependencies, j) == dependencies[i])
  54|0|        flag = 1;
  55|3|    if (flag)
  56|0|      continue;
  57|3|    ccv_array_push(model_io->dependencies, dependencies + i);
  58|3|    ++dependencies[i]->dependents;
  59|3|  }
  60|2|}
  61||
  62||int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model)
  63|0|{
  64|0|  return model->output_size;
  65|0|}
  66||
  67||int ccv_cnnp_model_is_trainable(const ccv_cnnp_model_t* const model)
  68|16|{
  69||  // If the model is compiled, it is default to 1 unless it is not.
  70|16|  if (model->compiled_data)
  71|4|    return model->is_trainable >= 0 ? model->is_trainable : 1;
  72|12|  return model->is_trainable;
  73|16|}
  74||
  75||ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index)
  76|393|{
  77|393|  if (!model->io)
  78|38|    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  79|393|  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s));
  80|393|  model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS;
  81|393|  model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS;
  82|393|  model_io->visit = 0;
  83|393|  model_io->model = model;
  84|393|  model_io->outputs = 0;
  85|393|  model_io->dependencies = 0;
  86|393|  model_io->dependents = 0;
  87|393|  model_io->incomings = 0;
  88|393|  model_io->outgoings = 0;
  89|393|  ccv_array_push(model->io, &model_io);
  90|393|  return model_io;
  91|393|}
  92||
  93||void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context)
  94|3|{
  95|3|  model->notify_hook.func = func;
  96|3|  model->notify_hook.context = context;
  97|3|}
  98||
  99||void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload)
 100|14|{
 101|14|  if (model->notify_hook.func)
 102|3|    model->notify_hook.func(model, tag, payload, model->notify_hook.context);
 103|14|  if (model->isa->notify)
 104|1|    model->isa->notify(model, tag, payload);
 105|14|}
 106||
 107||static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size)
 108|2.24k|{
 109|2.24k|  int i, j;
 110|4.85k|  for (i = 0; i < graph_exec_symbol_size; i++)
 111|2.61k|  {
 112|2.61k|    ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i;
 113||    // Check whether this tensor symbol has any duplicate.
 114|23.2k|    for (j = i + 1; j < graph_exec_symbol_size;)
 115|20.6k|    {
 116|20.6k|      ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j;
 117||      // If there is a same tensor symbol, remove it.
 118|20.6k|      if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph)
 119|2.71k|      {
 120|2.71k|        if (j + 1 < graph_exec_symbol_size)
 121|439|          *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1];
 122|2.71k|        --graph_exec_symbol_size;
 123|2.71k|        continue;
 124|2.71k|      }
 125|17.9k|      ++j;
 126|17.9k|    }
 127|2.61k|  }
 128|2.24k|  return graph_exec_symbol_size;
 129|2.24k|}
 130||
 131||void ccv_cnnp_model_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol, const int is_trainable)
 132|3.16k|{
 133|3.16k|  ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context;
 134|3.16k|  ccv_cnnp_model_t* const model = add_to_array_context->sequence->model;
 135|3.16k|  int i;
 136|3.16k|  if (add_to_array_context->add_parameter_indices && !model->parameter_indices)
 137|2.52k|    model->parameter_indices = ccv_array_new(sizeof(int), 0, 0);
 138|37.1k|  for (i = 0; i < add_to_array_context->symbols->rnum; i++)
 139|33.9k|  {
 140|33.9k|    const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i);
 141|33.9k|    if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph)
 142|28|    {
 143||      // Only add to parameter_indices if it is trainable.
 144|28|      if (add_to_array_context->add_parameter_indices)
 145|15|        ccv_array_add_unique_int(model->parameter_indices, i);
 146||      // Found it, return, don't add it.
 147|28|      return;
 148|28|    }
 149|33.9k|  }
 150||  // Only add to parameter_indices if it is trainable.
 151|3.13k|  if (add_to_array_context->add_parameter_indices)
 152|2.95k|    ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum);
 153||  // This is a new one, no need to add_unique_int, it is unique.
 154|3.13k|  ccv_array_push(add_to_array_context->symbols, &symbol);
 155|3.13k|  if (add_to_array_context->trainables)
 156|2.96k|    ccv_array_push(add_to_array_context->trainables, &is_trainable);
 157|3.13k|  char id[2048];
 158|3.13k|  id[0] = add_to_array_context->prefix;
 159|3.13k|  id[1] = '-';
 160|3.13k|  int total_len = 2;
 161|6.50k|  for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++)
 162|3.36k|  {
 163|3.36k|    const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i);
 164|3.36k|    int len;
 165|3.36k|    if (name->name && name->name[0] != '\0')
 166|364|      len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence);
 167|3.00k|    else
 168|3.00k|      len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence);
 169|3.36k|    total_len += len;
 170|3.36k|    if (total_len >= 2047)
 171|0|      break;
 172|3.36k|  }
 173|3.13k|  if (total_len < 2047)
 174|3.13k|    total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it);
 175|3.13k|  assert(total_len < 2048);
 176|3.13k|  char *heap_id = (char*)ccmalloc(total_len + 1);
 177|3.13k|  memcpy(heap_id, id, total_len + 1);
 178|3.13k|  ccv_array_push(add_to_array_context->ids, &heap_id);
 179|3.13k|  ++add_to_array_context->sequence->it;
 180|3.13k|}
 181||
 182||static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size, ccv_array_t* const gradient_checkpoints)
 183|2.30k|{
 184|2.30k|  compiled_data->f = compiled_data->fits + output_size;
 185|2.30k|  compiled_data->xpu_alloc.mp_hdr = -1;
 186|2.30k|  compiled_data->xpu_alloc.freed = kh_init(dy_str);
 187|2.30k|  compiled_data->xpu_alloc.allocd = kh_init(dy_alloc);
 188|2.30k|  compiled_data->gradient_checkpoints = gradient_checkpoints;
 189|2.30k|}
 190||
 191||typedef struct {
 192||  void* old_graph_exec_symbol_new_hook_context;
 193||  ccv_nnc_graph_exec_symbol_new_hook_f old_graph_exec_symbol_new_hook;
 194||  ccv_nnc_symbolic_graph_t* graph;
 195||  ccv_cnnp_model_build_data_t* build_data;
 196||} ccv_cnnp_model_set_exec_flags_context_t;
 197||
 198||static void _ccv_cnnp_model_set_exec_flags(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
 199|2.92k|{
 200|2.92k|  ccv_cnnp_model_set_exec_flags_context_t* flags_context = (ccv_cnnp_model_set_exec_flags_context_t*)context;
 201|2.92k|  if (flags_context->build_data->exec_flags)
 202|0|    ccv_nnc_graph_exec_symbol_set_flags(flags_context->graph, symbol, flags_context->build_data->exec_flags);
 203|2.92k|  if (flags_context->old_graph_exec_symbol_new_hook)
 204|2.20k|    flags_context->old_graph_exec_symbol_new_hook(flags_context->old_graph_exec_symbol_new_hook_context, symbol, cmd, inputs, input_size, outputs, output_size, name);
 205|2.92k|}
 206||
 207||static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss)
 208|2.30k|{
 209|2.30k|  assert(model->graph);
 210|2.30k|  model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size);
 211|2.30k|  int i;
 212|4.68k|  for (i = 0; i < input_size; i++)
 213|2.37k|    model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0);
 214|2.30k|  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
 215|2.30k|  ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0);
 216|2.30k|  ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0);
 217|2.30k|  ccv_cnnp_model_sequence_t model_sequence = {
 218|2.30k|    .bank = kh_init(ccv_cnnp_model_name_bank)
 219|2.30k|  };
 220|2.30k|  ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = {
 221|2.30k|    .add_parameter_indices = 1,
 222|2.30k|    .prefix = 't',
 223|2.30k|    .sequence = &model_sequence,
 224|2.30k|    .symbols = parameters,
 225|2.30k|    .ids = parameter_ids,
 226|2.30k|    .trainables = parameter_trainables,
 227|2.30k|  };
 228|2.30k|  ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
 229|2.30k|  ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0);
 230|2.30k|  ccv_cnnp_model_add_to_array_context_t add_to_output_context = {
 231|2.30k|    .add_parameter_indices = 0,
 232|2.30k|    .prefix = 'r',
 233|2.30k|    .sequence = &model_sequence,
 234|2.30k|    .symbols = internals,
 235|2.30k|    .ids = internal_ids,
 236|2.30k|    .trainables = 0,
 237|2.30k|  };
 238|2.30k|  ccv_cnnp_model_build_data_t build_data = {
 239|2.30k|    .exec_flags = 0,
 240|2.30k|    .is_trainable = model->is_trainable >= 0 ? model->is_trainable : 1,
 241|2.30k|    .model_sequence = &model_sequence,
 242|2.30k|    .add_to_array = ccv_cnnp_model_add_to_array,
 243|2.30k|    .parameters = parameters,
 244|2.30k|    .context = {
 245|2.30k|      .add_to_parameter = &add_to_parameter_context,
 246|2.30k|      .add_to_output = &add_to_output_context,
 247|2.30k|    },
 248|2.30k|    .gradient_checkpoints = 0,
 249|2.30k|  };
 250|2.30k|  model->data = &build_data;
 251|2.30k|  ccv_cnnp_model_set_exec_flags_context_t flags_context = {
 252|2.30k|    .graph = model->graph,
 253|2.30k|    .build_data = &build_data,
 254|2.30k|    .old_graph_exec_symbol_new_hook = 0,
 255|2.30k|    .old_graph_exec_symbol_new_hook_context = 0
 256|2.30k|  };
 257|2.30k|  flags_context.old_graph_exec_symbol_new_hook_context = ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_set_exec_flags, &flags_context, &flags_context.old_graph_exec_symbol_new_hook);
 258|2.30k|  ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0);
 259||  // Reset back to previous hook.
 260|2.30k|  ccv_nnc_graph_exec_symbol_new_hook(model->graph, flags_context.old_graph_exec_symbol_new_hook, flags_context.old_graph_exec_symbol_new_hook_context, 0);
 261|4.62k|  for (i = 0; i < model->output_size; i++)
 262|2.31k|  {
 263|2.31k|    const ccv_nnc_tensor_symbol_t output = model->outputs[i];
 264|2.31k|    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, output);
 265|2.31k|    if (alias_to.d == CCV_NNC_NO_TENSOR_SYMBOL)
 266|1.31k|      continue;
 267||    // If output is an alias, insert data transform regardless for result correctness (we cannot bind an alias). You can check ccv_nnc_tensor_bind_symbol method
 268||    // to see that we can correctly bind a tensor which from it, has aliases, but we cannot bind an alias tensor correctly (this is expected, sort of, to be
 269||    // honest, because we cannot handle cases of alias is part of the original tensor but bind differently).
 270|1.00k|    const ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(model->graph, output);
 271|1.00k|    model->outputs[i] = ccv_nnc_tensor_symbol_new(model->graph, output_params, 0);
 272|1.00k|    ccv_nnc_graph_exec_symbol_t make_contiguous = ccv_nnc_graph_exec_symbol_new(model->graph, CMD_FORMAT_TRANSFORM_FORWARD(), &output, 1, model->outputs + i, 1, "contiguous");
 273|1.00k|    ccv_nnc_graph_exec_symbol_set_flags(model->graph, make_contiguous, CCV_NNC_GRAPH_EXEC_DISABLE_OPT);
 274|1.00k|  }
 275|2.30k|  model->data = 0;
 276|2.30k|  kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank);
 277|2.30k|  if (model_sequence.sequences)
 278|2.28k|    ccv_array_free(model_sequence.sequences);
 279||  // Check if there are parameters that are not trainables. If there are, we will allocate uint64 bitmap to record that.
 280|2.30k|  int not_trainables = 0;
 281||  // Assert no parameter is alias.
 282|5.26k|  for (i = 0; i < parameters->rnum; i++)
 283|2.95k|  {
 284|2.95k|    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i);
 285|2.95k|    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter);
 286|2.95k|    assert(alias_to.graph == 0); // Cannot find the one alias to.
 287|2.95k|    if (*(int*)ccv_array_get(parameter_trainables, i) == 0)
 288|14|      not_trainables = 1;
 289|2.95k|  }
 290|2.30k|  assert(parameters->rnum == parameter_trainables->rnum);
 291|2.30k|  uint64_t* parameter_flags = 0;
 292|2.30k|  if (not_trainables)
 293|10|  {
 294|10|    parameter_flags = (uint64_t*)cccalloc(((parameters->rnum + 63) >> 6), sizeof(uint64_t));
 295|44|    for (i = 0; i < parameter_trainables->rnum; i++)
 296|34|      if (*(int*)ccv_array_get(parameter_trainables, i))
 297|20|        parameter_flags[i >> 6] |= ((uint64_t)1 << (i & 63));
 298|10|  }
 299|2.30k|  ccv_array_free(parameter_trainables);
 300||  // Assert no internal is alias.
 301|2.46k|  for (i = 0; i < internals->rnum; i++)
 302|165|  {
 303|165|    const ccv_nnc_tensor_symbol_t internal = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i);
 304|165|    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(internal.graph, internal);
 305|165|    assert(alias_to.graph == 0); // Cannot find the one alias to.
 306|165|  }
 307|2.30k|  const int output_size = model->output_size;
 308|2.30k|  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 309|2.30k|  const int parameters_rnum = parameters->rnum;
 310|2.30k|  if (input_size > 0)
 311|2.30k|  {
 312|2.30k|    ccv_array_resize(parameters, parameters_rnum + input_size);
 313|2.30k|    memcpy(ccv_array_get(parameters, parameters_rnum), model->inputs, input_size * sizeof(ccv_nnc_tensor_symbol_t));
 314|2.30k|  }
 315|2.30k|  ccv_nnc_symbolic_graph_simplify(model->graph,
 316|2.30k|    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,
 317|2.30k|      CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,
 318|2.30k|      CCV_NNC_SIMPLIFY_OPS_FUSION,
 319|2.30k|      CCV_NNC_SIMPLIFY_GRAPH_PRUNING),
 320|2.30k|    ccv_array_get(parameters, 0), parameters_rnum + input_size,
 321|2.30k|    model->outputs, output_size,
 322|2.30k|    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
 323|2.30k|  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 324||  // Size it down.
 325|2.30k|  parameters->rnum = parameters_rnum;
 326|2.30k|  ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1));
 327|2.30k|  _ccv_cnnp_compiled_data_init(compiled_data, output_size, build_data.gradient_checkpoints);
 328|2.30k|  const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph);
 329|2.30k|  assert(evaluate_to_size > 0);
 330|2.30k|  compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
 331|2.30k|  memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
 332|2.30k|  compiled_data->loss = loss;
 333|2.30k|  if (loss.cmd == CCV_NNC_NOOP)
 334|2.29k|  {
 335||    // If no loss function provided, there is no fits.
 336|4.60k|    for (i = 0; i < output_size; i++)
 337|2.30k|    {
 338|2.30k|      compiled_data->fits[i] = NO_TENSOR_SYMBOL;
 339|2.30k|      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]);
 340|2.30k|      if (alias_to.d < 0)
 341|2.30k|        compiled_data->f[i] = model->outputs[i];
 342|0|      else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original.
 343|0|        int ofs[CCV_NNC_MAX_DIM_ALLOC];
 344|0|        int inc[CCV_NNC_MAX_DIM_ALLOC];
 345|0|        ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc);
 346|0|        int j;
 347|0|        for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
 348|0|          { assert(ofs[j] == 0); } // There is no ofs.
 349|0|        compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet.
 350|0|      }
 351|2.30k|    }
 352|2.29k|  } else {
 353|20|    for (i = 0; i < output_size; i++)
 354|10|    {
 355|10|      const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]);
 356|10|      const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0);
 357|10|      compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0);
 358|10|      ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0);
 359|10|    }
 360|10|  }
 361|2.30k|  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 362|2.30k|  ccv_nnc_symbolic_graph_simplify(model->graph,
 363|2.30k|    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function.
 364|2.30k|    0, 0, // No need to provide binds at this point.
 365|2.30k|    compiled_data->f, model->output_size,
 366|2.30k|    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
 367|2.30k|  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 368||  // If inputs are from GPU, stream type is GPU.
 369|2.30k|  compiled_data->parameters = parameters;
 370|2.30k|  compiled_data->parameter_flags = parameter_flags;
 371|2.30k|  compiled_data->internals = internals;
 372|2.30k|  compiled_data->ids.parameters = parameter_ids;
 373|2.30k|  compiled_data->ids.internals = internal_ids;
 374|2.30k|  ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(compiled_data, model->graph);
 375|2.30k|}
 376||
 377||static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
 378|8.82k|{
 379|8.82k|  ccv_array_t* const stack = (ccv_array_t*)context;
 380|8.82k|  ccv_array_push(stack, &symbol.d);
 381|8.82k|}
 382||
 383||static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
 384|38.5k|{
 385|38.5k|  const ccv_nnc_tensor_symbol_t src_symbol = {
 386|38.5k|    .d = src_index,
 387|38.5k|    .graph = src_graph
 388|38.5k|  };
 389|38.5k|  const ccv_nnc_tensor_symbol_t dest_symbol = {
 390|38.5k|    .d = dest_index,
 391|38.5k|    .graph = dest_graph
 392|38.5k|  };
 393|38.5k|  const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
 394|38.5k|  ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params);
 395|38.5k|  int ofs[CCV_NNC_MAX_DIM_ALLOC];
 396|38.5k|  int inc[CCV_NNC_MAX_DIM_ALLOC];
 397|38.5k|  if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc))
 398|2.00k|    ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc);
 399|38.5k|}
 400||
 401||static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
 402|2.41k|{
 403|2.41k|  const ccv_nnc_tensor_symbol_t src_symbol = {
 404|2.41k|    .d = src_index,
 405|2.41k|    .graph = src_graph
 406|2.41k|  };
 407|2.41k|  const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
 408|2.41k|  const ccv_nnc_tensor_symbol_t dest_symbol = {
 409|2.41k|    .d = dest_index,
 410|2.41k|    .graph = dest_graph
 411|2.41k|  };
 412|2.41k|  const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol);
 413|2.41k|  return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0;
 414|2.41k|}
 415||
 416||static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size);
 417||static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data);
 418||
 419||typedef struct {
 420||  int parallel_count;
 421||  ccv_nnc_symbolic_graph_t* graph;
 422||  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
 423||} ccv_nnc_graph_exec_update_t;
 424||
 425||static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint)
 426|58|{
 427|58|  ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context;
 428|58|  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena;
 429|58|  ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
 430|58|  ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd);
 431|58|  ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint);
 432|58|  const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph;
 433|58|  const int parallel_count = graph_exec_update->parallel_count;
 434|58|  int i;
 435|178|  for (i = 1; i < parallel_count; i++)
 436|120|  {
 437|120|    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i));
 438|120|    if (!CCV_NO_GRAPH_EXEC(copy))
 439|120|    {
 440|120|      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
 441|120|      ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint);
 442|120|    }
 443|120|  }
 444|58|}
 445||
 446||void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size)
 447|2.20k|{
 448|2.20k|  assert(model->graph);
 449|2.20k|  assert(model->compiled_data);
 450|2.20k|  assert(!init->graph);
 451|2.20k|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 452|2.20k|  init->graph = ccv_nnc_symbolic_graph_new();
 453|2.20k|  ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0);
 454|2.20k|  ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack, 0);
 455|2.20k|  _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss);
 456|2.20k|  init->parallel_count = model->parallel_count;
 457|2.20k|  init->memory_compression = model->memory_compression;
 458|2.20k|  init->memory_reduction = model->memory_reduction;
 459|2.20k|  init->gradient_checkpointing = model->gradient_checkpointing;
 460|2.20k|  init->compiled_data->stream_type = model->compiled_data->stream_type;
 461|2.20k|  init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer;
 462|2.20k|  init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size;
 463|2.20k|  if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
 464|2.20k|    _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0);
 465|2.20k|  ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0, 0);
 466|2.20k|  ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL);
 467|2.20k|  int i, j;
 468||  // Verify parameters, internals and saved_aux in both graph has the same dimensionality.
 469|4.61k|  for (i = 0; i < compiled_data->parameters->rnum; i++)
 470|2.41k|  {
 471|2.41k|    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
 472|2.41k|    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
 473|2.41k|  }
 474|2.20k|  for (i = 0; i < compiled_data->internals->rnum; i++)
 475|0|  {
 476|0|    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
 477|0|    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
 478|0|  }
 479||  // Update inputs.
 480|2.20k|  assert(model->input_size == init->input_size);
 481|4.40k|  for (i = 0; i < model->input_size; i++)
 482|2.20k|    if (model->inputs[i].d >= 0)
 483|2.20k|    {
 484|2.20k|      assert(init->inputs[i].d >= 0);
 485|2.20k|      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d);
 486|2.20k|    }
 487||  // Update outputs.
 488|2.20k|  assert(model->output_size == init->output_size);
 489|4.40k|  for (i = 0; i < model->output_size; i++)
 490|2.20k|  {
 491|2.20k|    if (model->outputs[i].d >= 0)
 492|2.20k|    {
 493|2.20k|      assert(init->outputs[i].d >= 0);
 494|2.20k|      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d);
 495|2.20k|    }
 496|2.20k|    if (model->outputs[i].d != model->compiled_data->f[i].d)
 497|0|    {
 498|0|      assert(init->outputs[i].d != init->compiled_data->f[i].d);
 499|0|      if (model->compiled_data->f[i].d >= 0)
 500|0|      {
 501|0|        assert(init->compiled_data->f[i].d >= 0);
 502|0|        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d);
 503|0|      }
 504|0|    }
 505|2.20k|  }
 506||  // Go through the graph to set tensor on matching symbols
 507|11.0k|  for (i = 0; i < stack->rnum; i++)
 508|8.82k|  {
 509|8.82k|    const int d = *(int*)ccv_array_get(stack, i);
 510||    // If exceed range, skip.
 511|8.82k|    if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) ||
 512|8.82k|      d >= ccv_nnc_graph_exec_symbol_count(model->graph))
 513|0|      continue;
 514|8.82k|    const ccv_nnc_graph_exec_symbol_t src_symbol = {
 515|8.82k|      .d = d,
 516|8.82k|      .graph = init->graph
 517|8.82k|    };
 518|8.82k|    const ccv_nnc_graph_exec_symbol_t dest_symbol = {
 519|8.82k|      .d = d,
 520|8.82k|      .graph = model->graph
 521|8.82k|    };
 522|8.82k|    const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol);
 523|8.82k|    const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol);
 524||    // If the name doesn't match, skip.
 525|8.82k|    if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP)
 526|0|      continue;
 527||    // Now get all the inputs and outputs, if matches, set them.
 528|8.82k|    const int* src_inputs;
 529|8.82k|    int src_input_size;
 530|8.82k|    const int* src_outputs;
 531|8.82k|    int src_output_size;
 532|8.82k|    ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size);
 533|8.82k|    const int* dest_inputs;
 534|8.82k|    int dest_input_size;
 535|8.82k|    const int* dest_outputs;
 536|8.82k|    int dest_output_size;
 537|8.82k|    ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size);
 538||    // We may have unmatched input / output size because this is the minimizer and it has
 539||    // different saved_aux (for example, when we shrunk with CMD_NOOP).
 540|8.82k|    if (src_input_size != dest_input_size)
 541|0|      continue;
 542|8.82k|    if (src_output_size != dest_output_size)
 543|0|      continue;
 544|8.82k|    ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd);
 545||    // There may be mismatches of the source tensor symbols and destination tensor symbols. The reason is because
 546||    // we may later passed-in the minimizer, therefore, we may allocate tensors for minimizer later in the original
 547||    // graph whereas in the newly created graph, it is streamlined (the minimizer exists from the beginning). That
 548||    // will make the order of tensor symbols creation different, therefore, exact which tensor is which wrong as
 549||    // well. However, set a new minimizer won't change the exec symbol ordering, because we never create new exec
 550||    // symbols after gradient init step. Changing a new minimizer just updated that exec symbols setting, it is not
 551||    // a new exec symbol.
 552|33.7k|    for (j = 0; j < src_input_size; j++)
 553|24.8k|      if (src_inputs[j] >= 0)
 554|20.4k|        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]);
 555|22.4k|    for (j = 0; j < src_output_size; j++)
 556|13.6k|      if (src_outputs[j] >= 0)
 557|13.6k|        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]);
 558|8.82k|  }
 559|2.20k|  ccv_array_free(stack);
 560||  // After this, we get all tensors in the model graph resolved through tensor_auto.
 561|2.20k|  ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL);
 562||  // Verify symbols we get matches.
 563|2.20k|  const int parameter_size = compiled_data->parameters->rnum;
 564|4.61k|  for (i = 0; i < parameter_size; i++)
 565|2.41k|    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); }
 566|2.20k|  const int internal_size = compiled_data->internals->rnum;
 567|2.20k|  for (i = 0; i < internal_size; i++)
 568|0|    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); }
 569||  // Go through compiled data.
 570|2.20k|  if (compiled_data->tensor_arena)
 571|2.20k|  {
 572|2.20k|    const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph);
 573|2.20k|    if (flag == 0 && compiled_data->graph_exec_arena)
 574|2.20k|    {
 575|2.20k|      ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph);
 576||      // Since we will reinit, if we previously set is_test, we need to set it again.
 577|2.20k|      if (compiled_data->is_test)
 578|1|      {
 579|1|        const int parallel_count = ccv_max(model->parallel_count, 1);
 580|1|        ccv_nnc_graph_exec_update_t update = {
 581|1|          .parallel_count = parallel_count,
 582|1|          .graph = model->graph,
 583|1|          .graph_exec_arena = compiled_data->graph_exec_arena,
 584|1|        };
 585|1|        ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
 586|1|      }
 587|2.20k|    } else
 588||      // Free-up tensor arena & graph exec arena.
 589|0|      _ccv_cnnp_compiled_data_graph_free(compiled_data);
 590|2.20k|  }
 591||  // There are other compiled graphs, for accum and apply gradients.
 592||  // However, the main conclusion is, these absorb operations shouldn't impact parameters.
 593||  // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we
 594||  // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot
 595||  // be changed otherwise parameters' shape will be meaningless. The same goes to internals.
 596||  // That is why we don't update these compiled graphs at all this point.
 597||  // Free the model, we've already "absorbed" it.
 598|2.20k|  ccv_cnnp_model_free(init);
 599|2.20k|}
 600||
 601||void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss)
 602|2.29k|{
 603|2.29k|  assert(input_size == model->input_size || model->input_size == 0);
 604|2.29k|  if (model->input_size == 0)
 605|10|    model->input_size = input_size;
 606|2.29k|  if (!model->graph) // The graph is not compiled yet.
 607|98|  {
 608|98|    model->graph = ccv_nnc_symbolic_graph_new();
 609|98|    _ccv_cnnp_model_compile(model, inputs, input_size, loss);
 610|98|    assert(model->compiled_data);
 611|98|    int i, flag = 0;
 612|248|    for (i = 0; !flag && i < input_size; i++)
 613|150|      flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY);
 614||    // If inputs are from GPU, stream type is GPU.
 615|98|    model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
 616|98|    model->compiled_data->minimize.minimizer = minimizer;
 617|98|    model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
 618|2.20k|  } else {
 619||    // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model.
 620||    // And then absorb the "new model" to the old one.
 621|2.20k|    ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model, model->is_trainable);
 622|2.20k|    ccv_cnnp_model_absorb(model, init, inputs, input_size);
 623||    // Reset minimizer.
 624|2.20k|    ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0);
 625|2.20k|  }
 626|2.29k|}
 627||
 628||ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model, const int is_trainable)
 629|2.20k|{
 630|2.20k|  ccv_cnnp_model_t* const new_model = _ccv_cnnp_model_copy(model, 0);
 631|2.20k|  new_model->is_trainable = is_trainable;
 632|2.20k|  return new_model;
 633|2.20k|}
 634||
 635||void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size)
 636|4.44k|{
 637|4.44k|  assert(model->graph);
 638|4.44k|  assert(output_size == model->output_size);
 639|4.44k|  ccv_nnc_symbolic_graph_t* const graph = model->graph;
 640|4.44k|  ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL);
 641|4.44k|  int i;
 642|8.89k|  for (i = 0; i < output_size; i++)
 643|4.45k|  {
 644|4.45k|    assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL);
 645|4.45k|    outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]);
 646|4.45k|  }
 647|4.44k|}
 648||
 649||void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size)
 650|3|{
 651|3|  if (workspace_size == model->workspace_size)
 652|0|    return;
 653|3|  model->workspace_size = workspace_size;
 654|3|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 655|3|  if (compiled_data && compiled_data->graph)
 656|0|    ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL);
 657|3|}
 658||
 659||size_t ccv_cnnp_model_workspace_size(ccv_cnnp_model_t* const model)
 660|0|{
 661|0|  return model->workspace_size;
 662|0|}
 663||
 664||void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel)
 665|15|{
 666|15|  if (parallel == 0)
 667|0|    model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 668|15|  else
 669|15|    model->parallel_count = parallel;
 670|15|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 671|15|  if (compiled_data)
 672|11|    { assert(!compiled_data->graph); }
 673|15|}
 674||
 675||void ccv_cnnp_model_set_max_concurrency(ccv_cnnp_model_t* const model, const int max_stream_count)
 676|0|{
 677|0|  model->max_stream_count = max_stream_count;
 678|0|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 679|0|  if (compiled_data)
 680|0|    { assert(!compiled_data->graph); }
 681|0|}
 682||
 683||void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression)
 684|0|{
 685|0|  model->memory_compression = memory_compression;
 686|0|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 687|0|  if (compiled_data)
 688|0|    { assert(!compiled_data->graph); }
 689|0|}
 690||
 691||void ccv_cnnp_model_set_memory_reduction(ccv_cnnp_model_t* const model, const int memory_reduction)
 692|0|{
 693|0|  model->memory_reduction = memory_reduction;
 694|0|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 695|0|  if (compiled_data)
 696|0|    { assert(!compiled_data->graph); }
 697|0|}
 698||
 699||void ccv_cnnp_model_set_gradient_checkpointing(ccv_cnnp_model_t* const model, const int gradient_checkpointing)
 700|2|{
 701|2|  model->gradient_checkpointing = gradient_checkpointing;
 702|2|}
 703||
 704||int ccv_cnnp_model_gradient_checkpointing(ccv_cnnp_model_t* const model)
 705|0|{
 706|0|  return model->gradient_checkpointing;
 707|0|}
 708||
 709||typedef struct {
 710||  int parallel_count;
 711||  ccv_nnc_symbolic_graph_t* graph;
 712||  ccv_cnnp_compiled_data_t* compiled_data;
 713||  ccv_nnc_tensor_arena_t* tensor_arena;
 714||} ccv_nnc_tensor_init_states_t;
 715||
 716||static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data)
 717|100|{
 718|100|  int i;
 719|100|  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
 720|180|  for (i = 0; i < compiled_data->parameters->rnum; i++)
 721|119|  {
 722|119|    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
 723|119|    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
 724|39|      return 1;
 725|119|  }
 726|61|  for (i = 0; i < compiled_data->internals->rnum; i++)
 727|6|  {
 728|6|    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
 729|6|    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
 730|6|      return 1;
 731|6|  }
 732|55|  return 0;
 733|61|}
 734||
 735||static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol)
 736|341|{
 737|341|  ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context;
 738|341|  ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena;
 739|341|  ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
 740|341|  if (!output_tensor)
 741|0|    return;
 742|341|  const int d = output_symbol.d;
 743|341|  assert(d < tensor_init_states->compiled_data->tensors_init.size);
 744|341|  uint32_t* const init_v = CCV_NNC_INIT_V(tensor_init_states->compiled_data->tensors_init.v);
 745|341|  if (init_v[d >> 5] & (1u << (d & 0x1f)))
 746|34|    return;
 747|307|  init_v[d >> 5] |= (1u << (d & 0x1f));
 748|307|  ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0);
 749|307|  const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph;
 750|307|  const int parallel_count = tensor_init_states->parallel_count;
 751|307|  int i;
 752|787|  for (i = 1; i < parallel_count; i++)
 753|480|  {
 754|480|    ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i));
 755|480|    if (copy)
 756|480|      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, &copy, 1, 0);
 757|480|  }
 758|307|}
 759||
 760||// This method can only handle cases we added new tensors and exec, never delete. This invariant is true because
 761||// we setup everything (including calling simplify method) in ccv_cnnp_model_compile method, before this rewind setup.
 762||static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model)
 763|2|{
 764|2|  assert(model->graph);
 765|2|  assert(model->compiled_data);
 766|2|  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 767|2|  assert(compiled_data->rewindables);
 768|2|  int i;
 769|51|  for (i = 0; i < compiled_data->rewindables->rnum; i++)
 770|49|  {
 771|49|    const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i);
 772|49|    if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC)
 773|16|      ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec);
 774|33|    else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR)
 775|33|      ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor);
 776|49|  }
 777|2|  ccv_array_clear(compiled_data->rewindables);
 778|2|  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 779|2|}
 780||
 781||static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name)
 782|6.13k|{
 783|6.13k|  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
 784|6.13k|    .type = CCV_CNNP_REWIND_TENSOR,
 785|6.13k|    .tensor = symbol
 786|6.13k|  };
 787|6.13k|  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
 788|6.13k|  ccv_array_push(rewind_symbols, &rewind_symbol);
 789|6.13k|}
 790||
 791||static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name)
 792|476|{
 793|476|  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
 794|476|    .type = CCV_CNNP_REWIND_TENSOR,
 795|476|    .tensor = symbol
 796|476|  };
 797|476|  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
 798|476|  ccv_array_push(rewind_symbols, &rewind_symbol);
 799|476|}
 800||
 801||static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
 802|2.34k|{
 803|2.34k|  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
 804|2.34k|    .type = CCV_CNNP_REWIND_GRAPH_EXEC,
 805|2.34k|    .graph_exec = symbol
 806|2.34k|  };
 807|2.34k|  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
 808|2.34k|  ccv_array_push(rewind_symbols, &rewind_symbol);
 809|2.34k|}
811
static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph)
812
35.0k
{
813
35.0k
  ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
814
35.0k
  if (!CCV_NO_GRAPH_EXEC(update_exec))
815
19.9k
    ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd);
816
35.0k
  int i;
817
49.9k
  for (i = 1; i < parallel_count; 
i++14.8k
)
818
14.8k
  {
819
14.8k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
820
14.8k
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol);
821
14.8k
    if (!CCV_NO_GRAPH_EXEC(copy))
822
14.6k
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
823
14.8k
  }
824
35.0k
}
825
826
static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd)
827
20.0k
{
828
20.0k
  assert(compiled_data);
829
20.0k
  assert(symbolic_graph);
830
20.0k
  ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd);
831
20.0k
  int i;
832
35.0k
  for (i = 1; i < parallel_count; 
i++14.9k
)
833
14.9k
  {
834
14.9k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
835
14.9k
    if (copy_symbol.graph)
836
14.8k
      ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd);
837
14.9k
  }
838
20.0k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena;
839
20.0k
  if (graph_exec_arena)
840
20.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
841
  // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph)
842
20.0k
  ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena;
843
20.0k
  if (gradient_graph_exec_arena)
844
15.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
845
20.0k
}
 846||
 847||static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice)
 848|20.0k|{
 849|20.0k|  int this_parameter_flag = 0;
 850|20.0k|  if (update_nodes[parameter_indice].d == CCV_NNC_NO_TENSOR_SYMBOL)
 851|0|    return this_parameter_flag;
 852|20.0k|  const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]);
 853|20.0k|  int j, k;
 854||  // For no-op, we can preserve previous saved_aux_size.
 855|20.0k|  if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP)
 856|67|  {
 857||    // If the old minimizer is a noop, then the old_saved_aux_size should be whatever its previous
 858||    // saved_aux_size is, otherwise we will reinit the saved_aux repeatedly if you switch between
 859||    // noop and a minimizer. We don't want that because we do that in high-level frameworks to
 860||    // make sure some model parameters don't update if we don't want them to.
 861|67|    int old_saved_aux_size;
 862|67|    if (old_minimizer.cmd == CCV_NNC_NOOP)
 863|67|    {
 864|67|      int input_size;
 865|67|      ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0);
 866|67|      if (input_size < 2) // This is not legit.
 867|0|        old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
 868|67|      else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters.
 869|67|        old_saved_aux_size = input_size - 2;
 870|67|    } else
 871|0|      old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
 872|67|    if (old_saved_aux_size != saved_aux_size)
 873|65|    {
 874|65|      this_parameter_flag = 1;
 875|65|      if (saved_aux_size > old_saved_aux_size)
 876|65|      {
 877||        // Allocate new tensor symbols.
 878|65|        const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]);
 879|189|        for (j = old_saved_aux_size; j < saved_aux_size; j++)
 880|124|        {
 881|124|          saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0);
 882|124|          saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0);
 883|124|          const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
 884|460|          for (k = 1; k < parallel_count; k++)
 885|336|          {
 886|336|            ccv_nnc_tensor_param_t dev_info = info;
 887|336|            if (k != device_id)
 888|336|              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k);
 889|0|            else
 890|0|              CCV_TENSOR_SET_DEVICE_ID(dev_info.type, 0);
 891|336|            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
 892|336|            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
 893|336|            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy);
 894|336|            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy);
 895|336|          }
 896|124|        }
 897|65|      } else {
 898|0|        for (j = saved_aux_size; j < old_saved_aux_size; j++)
 899|0|        {
 900|0|          for (k = 1; k < parallel_count; k++)
 901|0|          {
 902|0|            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
 903|0|            if (src_copy.d >= 0)
 904|0|            {
 905|0|              ccv_nnc_tensor_symbol_free(graph, src_copy);
 906|0|              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL);
 907|0|            }
 908|0|            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
 909|0|            if (dest_copy.d >= 0)
 910|0|            {
 911|0|              ccv_nnc_tensor_symbol_free(graph, dest_copy);
 912|0|              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL);
 913|0|            }
 914|0|          }
 915|0|          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source);
 916|0|          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination);
 917|0|          saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
 918|0|        }
 919|0|      }
 920|65|    }
 921|67|  }
923
20.0k
  if (this_parameter_flag)
924
65
  {
925
65
    ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2];
926
65
    ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1];
927
65
    const int* inputs = 0;
928
65
    int input_size = 0;
929
65
    ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0);
930
65
    assert(input_size >= 1);
931
65
    update_inputs[0].d = inputs[0];
932
65
    update_inputs[0].graph = graph;
933
65
    update_inputs[1].d = inputs[1];
934
65
    update_inputs[1].graph = graph;
935
65
    update_outputs[0] = updated_parameters[parameter_indice];
936
189
    for (j = 0; j < saved_aux_size; 
j++124
)
937
124
    {
938
124
      update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source;
939
124
      update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination;
940
124
    }
941
65
    ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
942
233
    for (k = 1; k < parallel_count; 
k++168
)
943
168
    {
944
168
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k);
945
168
      assert(copy.d >= 0);
946
168
      ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0);
947
168
      assert(input_size >= 1);
948
168
      update_inputs[0].d = inputs[0];
949
168
      update_inputs[0].graph = graph;
950
168
      update_inputs[1].d = inputs[1];
951
168
      update_inputs[1].graph = graph;
952
168
      update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k);
953
504
      for (j = 0; j < saved_aux_size; 
j++336
)
954
336
      {
955
336
        update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
956
336
        update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
957
336
      }
958
168
      ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
959
168
    }
960
65
  }
961
20.0k
  return this_parameter_flag;
962
20.0k
}
963
964
typedef struct {
965
  int parameter_size;
966
  ccv_nnc_cmd_t minimizer;
967
  ccv_cnnp_model_io_t parameters[1];
968
} ccv_cnnp_set_minimizer_for_parameter_t;
969
970
static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model)
971
296
{
972
296
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
973
296
  assert(compiled_data);
974
296
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
975
  // We update all parameters, at this point, we have one minimizer.
976
296
  const int parameter_size = compiled_data->parameters->rnum;
977
296
  ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes;
978
296
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
979
296
  assert(symbolic_graph);
980
296
  const int parallel_count = ccv_max(model->parallel_count, 1);
981
296
  ccv_array_t* const parameters = compiled_data->minimize.parameters;
982
296
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
983
296
  int i, j, flag = 0;
984
301
  for (i = 0; i < parameters->rnum; 
i++5
)
985
5
  {
986
5
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i);
987
10
    for (j = 0; j < set_minimizer_for_parameter->parameter_size; 
j++5
)
988
5
    {
989
5
      const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? 
set_minimizer_for_parameter->parameters[j]->param_sel - 13
:
set_minimizer_for_parameter->parameters[j]->param_sel2
;
990
5
      assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0);
991
5
      const int old_rnum = parameter_indices->rnum;
992
5
      ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices);
993
5
      const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? 
set_minimizer_for_parameter->parameters[j]->param_ref - 10
: set_minimizer_for_parameter->parameters[j]->param_ref;
994
5
      assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0);
995
5
      if (param_ref >= 0)
996
0
      {
997
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
998
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
999
0
        parameter_indices->rnum = old_rnum + 1;
1000
0
      }
1001
5
    }
1002
5
    const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer);
1003
    // We may have duplicated indices, but that is OK; we will just set them twice.
1004
58
    for (j = 0; j < parameter_indices->rnum; 
j++53
)
1005
53
    {
1006
53
      const int d = *(int*)ccv_array_get(parameter_indices, j);
1007
53
      assert(d <= parameter_size);
1008
53
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d))
1009
0
        flag = 1;
1010
53
    }
1011
5
    ccv_array_clear(parameter_indices);
1012
5
  }
1013
296
  ccv_array_free(parameter_indices);
1014
296
  return flag;
1015
296
}
1016
1017
static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size)
1018
2.25k
{
1019
2.25k
  if (new_saved_aux_size == old_saved_aux_size)
1020
2.24k
    return;
1021
2.25k
  assert
(new_saved_aux_size > old_saved_aux_size)7
;
1022
7
  int i, j;
1023
72
  for (i = parameter_size - 1; i >= 0; 
i--65
)
1024
65
  {
1025
189
    for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; 
j--124
)
1026
124
      saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
1027
65
    for (j = old_saved_aux_size - 1; j >= 0; 
j--0
)
1028
0
      saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j];
1029
65
  }
1030
7
}
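
For reference, a minimal standalone sketch of the same backward, in-place re-striding that _ccv_cnnp_scatter_saved_aux performs when max_saved_aux_size grows. The array, strides, and the EMPTY sentinel below are made-up stand-ins for illustration (EMPTY plays the role of NO_TENSOR_SYMBOL); this is not library code.

#include <assert.h>

#define EMPTY (-1) /* stand-in for NO_TENSOR_SYMBOL */

/* Re-stride v in place from old_stride entries per parameter to new_stride,
 * walking backwards so no entry is overwritten before it is moved. */
static void scatter(int* const v, const int parameter_size, const int old_stride, const int new_stride)
{
  if (new_stride == old_stride)
    return;
  assert(new_stride > old_stride);
  int i, j;
  for (i = parameter_size - 1; i >= 0; i--)
  {
    for (j = new_stride - 1; j >= old_stride; j--)
      v[i * new_stride + j] = EMPTY;
    for (j = old_stride - 1; j >= 0; j--)
      v[i * new_stride + j] = v[i * old_stride + j];
  }
}

int main(void)
{
  /* 3 parameters packed with stride 1; the buffer is already sized for stride 2. */
  int v[6] = { 10, 20, 30, 0, 0, 0 };
  scatter(v, 3, 1, 2);
  const int expected[6] = { 10, EMPTY, 20, EMPTY, 30, EMPTY };
  int k;
  for (k = 0; k < 6; k++)
    assert(v[k] == expected[k]);
  return 0;
}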
1031
1032
static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model)
1033
45
{
1034
45
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1035
45
  assert(compiled_data);
1036
45
  if (!compiled_data->rewindables)
1037
45
    compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0);
1038
45
  ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables, 0);
1039
45
  ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables, 0);
1040
45
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables, 0);
1041
45
}
1042
1043
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size)
1044
2.24k
{
1045
2.24k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1046
2.24k
  assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1047
2.24k
  assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
1048
2.24k
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1049
2.24k
  assert(evaluate_to_size > 0);
1050
2.24k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1051
2.24k
  compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1052
2.24k
  compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count);
1053
2.24k
  int i, j;
1054
2.24k
  const int output_size = model->output_size;
1055
2.24k
  assert(!fits || fit_size == output_size * parallel_count);
1056
2.24k
  if (fits)
1057
12
    
for (i = 0; 6
i < output_size;
i++6
)
1058
6
      ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info);
1059
2.24k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
1060
2.24k
  const int parameter_size = compiled_data->parameters->rnum;
1061
2.24k
  compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size);
1062
2.24k
  compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
1063
2.24k
  compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
1064
2.24k
  int parameter_size_maybe_more = parameter_size;
1065
2.24k
  compiled_data->disable_outgrad = disable_outgrad;
1066
2.24k
  int outgrad_size;
1067
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1068
9
    outgrad_size = 0;
1069
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1070
2.23k
    outgrad_size = model->input_size;
1071
3
  else {
1072
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
1073
3
    outgrad_size = 0;
1074
10
    for (i = 0; i < model->input_size; 
i++7
)
1075
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1076
3
        ++outgrad_size;
1077
3
  }
1078
2.24k
  compiled_data->outgrad_size = outgrad_size;
1079
2.24k
  parameter_size_maybe_more += outgrad_size;
1080
2.24k
  compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count);
1081
2.24k
  compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? 
compiled_data->gradients + parameter_size2.23k
:
09
;
1082
2.24k
  compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more);
1083
2.24k
  compiled_data->backward.to_size = parameter_size_maybe_more;
1084
2.24k
  ccv_nnc_tensor_symbol_t* parameters = (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0);
1085
2.24k
  if (compiled_data->parameter_flags)
1086
4
  {
1087
4
    parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size);
1088
25
    for (i = 0; i < parameter_size; 
i++21
)
1089
21
      if (compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))
1090
14
        parameters[i] = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1091
7
      else
1092
7
        parameters[i] = NO_TENSOR_SYMBOL;
1093
4
  }
1094
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.23k
)
1095
9
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1096
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
1097
2.23k
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1098
3
  else { // Compute minimize with gradients including selected inputs.
1099
3
    assert(model->input_size > 0);
1100
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
1101
3
    assert(outgrad_size > 0);
1102
3
    ccv_nnc_tensor_symbol_t outgrads[outgrad_size];
1103
3
    j = 0;
1104
10
    for (i = 0; i < model->input_size; 
i++7
)
1105
7
      if (!(disable_outgrad & ((uint64_t)1 << i)))
1106
3
        outgrads[j++] = model->inputs[i];
1107
3
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
1108
3
  }
1109
2.24k
  if (compiled_data->parameter_flags)
1110
4
    ccfree(parameters);
1111
2.24k
  _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size);
1112
2.24k
  if (compiled_data->minimize.parameters)
1113
5
    _ccv_cnnp_apply_parameters_with_minimizer(model);
1114
  // Go through gradient checkpoints to generate tensor inputs for the backward pass just before executing it.
1115
2.24k
  ccv_cnnp_model_apply_gradient_checkpoints(compiled_data, model->graph);
1116
4.48k
  for (i = 0; i < output_size; 
i++2.24k
)
1117
2.24k
  {
1118
2.24k
    const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1119
    // Init this to 1 so we can backprop.
1120
2.24k
    ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES);
1121
2.24k
  }
1122
2.24k
  compiled_data->backward.to_size = 0;
1123
7.16k
  for (i = 0; i < parameter_size_maybe_more; 
i++4.91k
)
1124
4.91k
    if (compiled_data->gradients[i].d != CCV_NNC_NO_TENSOR_SYMBOL)
1125
4.91k
      compiled_data->backward.tos[compiled_data->backward.to_size++] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]);
1126
2.24k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS);
1127
2.24k
  ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size);
1128
4.49k
  for (i = 0; i < parameter_size_maybe_more - parameter_size; 
i++2.25k
)
1129
2.25k
  {
1130
2.25k
    if (compiled_data->outgrads[i].d < 0) // When we go through the inputs, we might find zero-length inputs, and for these we cannot have any outgrads.
1131
0
      continue;
1132
2.25k
    const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]);
1133
2.25k
    const int* tos;
1134
2.25k
    int to_size;
1135
2.25k
    ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size);
1136
2.25k
    if (to_size == 0) // If this is the end (no minimizers afterwards), we need to attach this as a destination; otherwise it is covered in update_nodes.
1137
10
    {
1138
10
      const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph);
1139
10
      const int destination_count = ccv_nnc_symbolic_graph_destination_size(model->graph);
1140
10
      int flag = 0;
1141
10
      const int outgrad_destination_start = ccv_max(0, destination_count - i);
1142
12
      for (j = i - 1; !flag && 
j >= 010
;
j--2
)
1143
2
        if (j + outgrad_destination_start < destination_count)
1144
2
          flag = (destinations[j + outgrad_destination_start].d == outgrad.d);
1145
10
      if (!flag) // Only if we cannot find it, we add it.
1146
8
        ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad);
1147
10
    }
1148
2.25k
  }
1149
2.24k
  if (parallel_count > 1)
1150
8
  {
1151
8
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1152
8
      0, 0,
1153
8
      compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */,
1154
8
      compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */,
1155
8
      0, 0, 0,
1156
8
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1157
8
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1158
8
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1159
16
    for (i = 0; i < evaluate_to_size; 
i++8
)
1160
32
      
for (j = 1; 8
j < parallel_count;
j++24
)
1161
24
      {
1162
24
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1163
24
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1164
24
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1165
24
      }
1166
8
    const int backward_to_size = compiled_data->backward.to_size;
1167
146
    for (i = 0; i < backward_to_size; 
i++138
)
1168
552
      
for (j = 1; 138
j < parallel_count;
j++414
)
1169
414
      {
1170
414
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j);
1171
414
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1172
414
          compiled_data->backward.tos[compiled_data->backward.to_size++] = copy;
1173
414
      }
1174
8
  }
1175
  // Only use memory compression if we are in gradient parameter mode.
1176
2.24k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS2.23k
)
1177
2.24k
  {
1178
2.24k
    if (model->memory_compression)
1179
0
      ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1180
2.24k
    if (model->memory_reduction)
1181
0
      ccv_nnc_symbolic_graph_memory_reduction(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1182
2.24k
  }
1183
2.24k
  compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size);
1184
2.24k
  compiled_data->gradient_mode = gradient_mode;
1185
2.24k
}
1186
1187
void ccv_cnnp_model_tensors_init_0(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1188
95
{
1189
95
  assert(!compiled_data->tensors.parameters);
1190
95
  const int parameter_size = compiled_data->parameters->rnum;
1191
95
  const int parallel_count = ccv_max(model->parallel_count, 1);
1192
95
  const int internal_size = compiled_data->internals->rnum;
1193
95
  compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph);
1194
95
  compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t));
1195
95
  compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)cccalloc((parameter_size + internal_size) * parallel_count, sizeof(ccv_nnc_tensor_t*));
1196
95
  compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count;
1197
95
}
1198
1199
int ccv_cnnp_model_tensors_any_to_alloc(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1200
3
{
1201
3
  int i, j;
1202
3
  const int parameter_size = compiled_data->parameters->rnum;
1203
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
1204
3
  const int internal_size = compiled_data->internals->rnum;
1205
19
  for (i = 0; i < parameter_size; 
i++16
)
1206
16
  {
1207
    // Parameters have to be allocated all together.
1208
16
    if (compiled_data->tensors.parameters[i])
1209
16
    {
1210
16
      for (j = 1; j < parallel_count; 
j++0
)
1211
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1212
16
      continue;
1213
16
    }
1214
0
    return 1;
1215
16
  }
1216
3
  for (i = 0; i < internal_size; 
i++0
)
1217
0
  {
1218
0
    if (!compiled_data->tensors.internals[i])
1219
0
      return 1;
1220
0
    for (j = 1; j < parallel_count; j++)
1221
0
      if (!compiled_data->tensors.internals[i + j * internal_size])
1222
0
        return 1;
1223
0
  }
1224
3
  return 0;
1225
3
}
1226
1227
void ccv_cnnp_model_tensors_init_1(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1228
92
{
1229
92
  int i, j;
1230
92
  const int parameter_size = compiled_data->parameters->rnum;
1231
92
  const int parallel_count = ccv_max(model->parallel_count, 1);
1232
92
  const int internal_size = compiled_data->internals->rnum;
1233
380
  for (i = 0; i < parameter_size; 
i++288
)
1234
288
  {
1235
    // Parameters have to be allocated all together.
1236
288
    if (compiled_data->tensors.parameters[i])
1237
0
    {
1238
0
      for (j = 1; j < parallel_count; j++)
1239
0
        { assert(compiled_data->tensors.parameters[i + j * parameter_size]); }
1240
0
      continue;
1241
0
    }
1242
288
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1243
288
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1244
288
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1245
104
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1246
288
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1247
288
    compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
1248
690
    for (j = 1; j < parallel_count; 
j++402
)
1249
402
    {
1250
402
      if (j != device_id)
1251
402
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1252
0
      else
1253
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1254
402
      compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1255
402
    }
1256
288
  }
1257
92
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1258
154
  for (i = 0; i < internal_size; 
i++62
)
1259
62
  {
1260
62
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i);
1261
62
    const int d = retained.d;
1262
62
    if (init_v[d >> 5] & (1u << (d & 0x1f)))
1263
0
      continue;
1264
62
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained);
1265
62
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1266
7
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1267
62
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1268
62
    if (!compiled_data->tensors.internals[i])
1269
62
      compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0);
1270
158
    for (j = 1; j < parallel_count; 
j++96
)
1271
96
    {
1272
96
      if (j != device_id)
1273
96
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1274
0
      else
1275
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1276
96
      if (!compiled_data->tensors.internals[i + j * internal_size])
1277
96
        compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0);
1278
96
    }
1279
62
  }
1280
92
  compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); // Remove 1 if any.
1281
92
}
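
As a side note, the tensors_init.v bookkeeping above is a plain bitmap: symbol index d lands in 32-bit word d >> 5, at bit d & 0x1f. A tiny self-contained illustration follows; the index 70 and the array size are arbitrary example values, not values taken from the library.

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t v[4] = { 0 };              /* room for 128 symbol flags */
  const int d = 70;                   /* hypothetical tensor symbol index */
  v[d >> 5] |= (1u << (d & 0x1f));    /* mark symbol 70 as initialized */
  assert(v[2] == (1u << 6));          /* 70 == 2 * 32 + 6 */
  assert(v[d >> 5] & (1u << (d & 0x1f)));
  return 0;
}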
1282
1283
static void _ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1284
92
{
1285
92
  ccv_cnnp_model_tensors_init_0(model, compiled_data);
1286
92
  ccv_cnnp_model_tensors_init_1(model, compiled_data);
1287
92
}
1288
1289
static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1290
6
{
1291
6
  assert(parallel_count > 0);
1292
6
  int i, j;
1293
12
  for (i = 0; i < tensor_size; 
i++6
)
1294
6
  {
1295
6
    if (!tensors[i])
1296
0
      continue;
1297
6
    const int d = tensor_symbols[i].d;
1298
6
    if (!(tensors_init[d >> 5] & (1u << (d & 0x1f))))
1299
0
      continue;
1300
24
    
for (j = 1; 6
j < parallel_count;
j++18
)
1301
18
      if (tensors[i + j * tensor_size])
1302
18
      {
1303
18
        ccv_nnc_tensor_t* const input = CCV_NNC_TENSOR(tensors[i]);
1304
18
        ccv_nnc_tensor_t* const output = CCV_NNC_TENSOR(tensors[i + j * tensor_size]);
1305
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &input, 1, &output, 1, 0);
1306
18
      }
1307
6
  }
1308
6
}
1309
1310
static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count)
1311
100
{
1312
100
  assert(parallel_count > 0);
1313
100
  int i, j;
1314
163
  for (i = 0; i < tensor_size; 
i++63
)
1315
63
  {
1316
63
    const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1317
159
    for (j = 1; j < parallel_count; 
j++96
)
1318
96
    {
1319
96
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1320
96
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1321
96
      if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
1322
0
      { // We shouldn't have allocated this; free it up.
1323
0
        ccv_nnc_tensor_free(tensors[i + j * tensor_size]);
1324
0
        tensors[i + j * tensor_size] = 0;
1325
0
      }
1326
96
    }
1327
63
  }
1328
100
}
1329
1330
static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds)
1331
536
{
1332
536
  assert(parallel_count > 0);
1333
536
  int i, j;
1334
1.92k
  for (i = 0; i < tensor_size; 
i++1.38k
)
1335
1.38k
  {
1336
1.38k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1337
1.38k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1338
7
      continue;
1339
1.38k
    if (graph)
1340
1.38k
    {
1341
1.38k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1342
1.38k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1343
0
        tensor_symbol = alias_to;
1344
1.38k
    }
1345
1.38k
    ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(tensors[i]);
1346
1.38k
    if (tensor && 
tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL1.38k
)
1347
1.38k
    {
1348
1.38k
      const ccv_nnc_tensor_bind_t retained_bind = {
1349
1.38k
        .symbol = tensor_symbol,
1350
1.38k
        .tensor = tensor
1351
1.38k
      };
1352
1.38k
      ccv_array_push(tensor_binds, &retained_bind);
1353
1.38k
    }
1354
2.92k
    for (j = 1; j < parallel_count; 
j++1.54k
)
1355
1.54k
    {
1356
1.54k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1357
1.54k
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1358
1.54k
      if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1359
1.54k
      {
1360
1.54k
        const ccv_nnc_tensor_bind_t bind = {
1361
1.54k
          .symbol = copy,
1362
1.54k
          .tensor = tensors[i + j * tensor_size]
1363
1.54k
        };
1364
1.54k
        ccv_array_push(tensor_binds, &bind);
1365
1.54k
      }
1366
1.54k
    }
1367
1.38k
  }
1368
536
}
1369
1370
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data)
1371
2.40k
{
1372
2.40k
  if (compiled_data->graph)
1373
100
    ccv_nnc_graph_free(compiled_data->graph);
1374
2.40k
  compiled_data->graph = 0;
1375
2.40k
  compiled_data->is_test = 0;
1376
2.40k
  if (compiled_data->tensor_arena)
1377
100
    ccv_nnc_tensor_arena_free(compiled_data->tensor_arena);
1378
2.40k
  compiled_data->tensor_arena = 0;
1379
2.40k
  if (compiled_data->graph_exec_arena)
1380
100
    ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena);
1381
2.40k
  compiled_data->graph_exec_arena = 0;
1382
2.40k
  if (compiled_data->backward.from_ops)
1383
33
    ccfree(compiled_data->backward.from_ops);
1384
2.40k
  compiled_data->backward.from_ops = 0;
1385
2.40k
  if (compiled_data->evaluate.schedule)
1386
38
    ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule);
1387
2.40k
  compiled_data->evaluate.schedule = 0;
1388
2.40k
  if (compiled_data->backward.schedule)
1389
28
    ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule);
1390
2.40k
  compiled_data->backward.schedule = 0;
1391
2.40k
}
1392
1393
static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data)
1394
2.30k
{
1395
2.30k
  if (compiled_data->gradients)
1396
2.24k
    ccfree(compiled_data->gradients);
1397
2.30k
  compiled_data->gradients = 0;
1398
2.30k
  if (compiled_data->updated_parameters)
1399
2.24k
    ccfree(compiled_data->updated_parameters);
1400
2.30k
  compiled_data->updated_parameters = 0;
1401
2.30k
  compiled_data->update_nodes = 0;
1402
2.30k
  compiled_data->saved_aux = 0;
1403
2.30k
}
1404
1405
static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data)
1406
2.34k
{
1407
2.34k
  if (compiled_data->backward.gradients)
1408
5
    ccfree(compiled_data->backward.gradients);
1409
2.34k
  compiled_data->backward.gradients = 0;
1410
2.34k
  if (compiled_data->backward.accum)
1411
5
    ccv_nnc_graph_free(compiled_data->backward.accum);
1412
2.34k
  compiled_data->backward.accum = 0;
1413
2.34k
  if (compiled_data->backward.tensor_arena)
1414
5
    ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena);
1415
2.34k
  compiled_data->backward.tensor_arena = 0;
1416
2.34k
  if (compiled_data->backward.graph_exec_arena)
1417
5
    ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena);
1418
2.34k
  compiled_data->backward.graph_exec_arena = 0;
1419
2.34k
}
1420
1421
static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data)
1422
2.31k
{
1423
2.31k
  if (compiled_data->apply_gradients.graph)
1424
24
    ccv_nnc_graph_free(compiled_data->apply_gradients.graph);
1425
2.31k
  compiled_data->apply_gradients.graph = 0;
1426
2.31k
  if (compiled_data->apply_gradients.tensor_arena)
1427
24
    ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena);
1428
2.31k
  compiled_data->apply_gradients.tensor_arena = 0;
1429
2.31k
  if (compiled_data->apply_gradients.graph_exec_arena)
1430
24
    ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena);
1431
2.31k
  compiled_data->apply_gradients.graph_exec_arena = 0;
1432
2.31k
}
1433
1434
// Compile the graph to run ccv_cnnp_model_fit
1435
static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1436
8
{
1437
8
  int i, j;
1438
8
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1439
8
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE);
1440
8
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE;
1441
8
  const int parallel_count = ccv_max(model->parallel_count, 1);
1442
8
  assert(output_size == model->output_size * parallel_count);
1443
8
  assert(!fits || output_size == fit_size);
1444
8
  assert(output_size > 0);
1445
8
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1446
8
  {
1447
8
    _ccv_cnnp_model_set_rewindables(model);
1448
8
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1449
8
  } else 
if (0
compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES0
) {
1450
0
    _ccv_cnnp_model_rewind_graph(model);
1451
0
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1452
0
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1453
0
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1454
0
  }
1455
8
  const int tensors_init = !!compiled_data->tensors_init.v;
1456
8
  if (!tensors_init)
1457
4
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1458
4
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1459
  // Check whether it is fully allocated; if not, run init_1.
1460
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1461
8
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1462
8
  assert((input_size % parallel_count) == 0);
1463
8
  assert((output_size % parallel_count) == 0);
1464
8
  assert((fit_size % parallel_count) == 0);
1465
8
  const int input_size_per_p = input_size / parallel_count;
1466
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1467
8
  const int output_size_per_p = output_size / parallel_count;
1468
8
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1469
8
  const int fit_size_per_p = fit_size / parallel_count;
1470
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds);
1471
8
  const int parameter_size = compiled_data->parameters->rnum;
1472
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1473
8
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1474
8
  const int internal_size = compiled_data->internals->rnum;
1475
8
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1476
8
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1477
8
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1478
8
  ccv_array_free(tensor_binds);
1479
8
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1480
8
  if (tensors_init && 
parallel_count > 14
)
1481
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1482
  // If tensors are not init'ed, we need to init states first.
1483
8
  if (_ccv_cnnp_any_to_init(compiled_data))
1484
7
  {
1485
7
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1486
7
      .parallel_count = parallel_count,
1487
7
      .graph = model->graph,
1488
7
      .compiled_data = compiled_data,
1489
7
      .tensor_arena = compiled_data->tensor_arena
1490
7
    };
1491
7
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1492
7
  }
1493
8
  compiled_data->is_test = 0;
1494
8
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer);
1495
  // No need to set because it defaults to training mode.
1496
  // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1497
105
  for (i = 0; i < saved_aux_size * parameter_size; 
i++97
)
1498
97
  {
1499
97
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
1500
5
      continue;
1501
92
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source);
1502
92
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1503
296
    for (j = 1; j < parallel_count; 
j++204
)
1504
204
    {
1505
204
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1506
204
      if (copy)
1507
204
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1508
204
    }
1509
92
  }
1510
8
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1511
8
  compiled_data->evaluate.to_op_size = 0;
1512
22
  for (i = 0; i < evaluate_to_size; 
i++14
)
1513
14
  {
1514
14
    ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1515
14
    if (to.graph)
1516
14
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to;
1517
14
  }
1518
8
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1519
8
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1520
8
}
1521
1522
ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model)
1523
0
{
1524
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1525
0
  if (!compiled_data || !compiled_data->graph)
1526
0
    return 0;
1527
0
  return ccv_nnc_graph_default_stream(compiled_data->graph);
1528
0
}
1529
1530
uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model)
1531
0
{
1532
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1533
0
  if (!compiled_data || !compiled_data->tensor_arena)
1534
0
    return 0;
1535
0
  return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena);
1536
0
}
1537
1538
static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1539
38.9k
{
1540
38.9k
  int i, j;
1541
114k
  for (i = 0; i < tensor_size; 
i++75.6k
)
1542
75.6k
  {
1543
75.6k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1544
75.6k
    if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL)
1545
0
      continue;
1546
75.6k
    if (graph)
1547
72.7k
    {
1548
72.7k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1549
72.7k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1550
0
        tensor_symbol = alias_to;
1551
72.7k
    }
1552
75.6k
    ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]);
1553
77.4k
    for (j = 1; j < parallel_count; 
j++1.77k
)
1554
1.77k
    {
1555
1.77k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1556
1.77k
      if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1557
1.77k
        ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]);
1558
1.77k
    }
1559
75.6k
  }
1560
38.9k
}
1561
1562
void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1563
2.54k
{
1564
2.54k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1565
2.54k
  assert(compiled_data);
1566
2.54k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1567
2.54k
  assert(output_size == model->output_size * parallel_count);
1568
2.54k
  assert(input_size == model->input_size * parallel_count);
1569
2.54k
  assert(!fits || fit_size == output_size);
1570
2.54k
  assert(model->graph);
1571
2.54k
  if (!compiled_data->graph || 
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE2.53k
)
1572
8
  {
1573
8
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1574
8
    _ccv_cnnp_compiled_data_backward_free(compiled_data);
1575
8
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
1576
    // Compile the symbolic graph down only when needed.
1577
8
    _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size);
1578
2.53k
  } else {
1579
2.53k
    assert((input_size % parallel_count) == 0);
1580
2.53k
    assert((output_size % parallel_count) == 0);
1581
2.53k
    assert((fit_size % parallel_count) == 0);
1582
2.53k
    const int input_size_per_p = input_size / parallel_count;
1583
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1584
2.53k
    const int output_size_per_p = output_size / parallel_count;
1585
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1586
2.53k
    const int fit_size_per_p = fit_size / parallel_count;
1587
2.53k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count);
1588
2.53k
  }
1589
2.54k
  if (compiled_data->is_test)
1590
0
  {
1591
0
    compiled_data->is_test = 0;
1592
0
    ccv_nnc_graph_exec_update_t update = {
1593
0
      .parallel_count = parallel_count,
1594
0
      .graph = model->graph,
1595
0
      .graph_exec_arena = compiled_data->graph_exec_arena,
1596
0
    };
1597
0
    ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1598
0
  }
1599
2.54k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1600
2.54k
}
1601
1602
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = false (MULTISTAGE_MODE_NO_GRAD).
1603
static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1604
59
{
1605
59
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1606
59
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD;
1607
59
  const int parallel_count = ccv_max(model->parallel_count, 1);
1608
59
  assert(output_size == model->output_size * parallel_count);
1609
59
  assert(output_size > 0);
1610
  // If the gradient is not initialized, continue to set up the parallel process. We don't init the gradient here; rather,
1611
  // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel.
1612
59
  if (parallel_count > 1 && 
compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE6
)
1613
6
  {
1614
6
    const int evaluate_to_size = compiled_data->evaluate.to_size;
1615
6
    compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1616
6
    _ccv_cnnp_model_set_rewindables(model);
1617
6
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1618
6
      0, 0,
1619
6
      0, 0, 0,
1620
6
      0, 0, 0,
1621
6
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1622
6
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1623
6
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1624
6
    int i, j;
1625
12
    for (i = 0; i < evaluate_to_size; 
i++6
)
1626
24
      
for (j = 1; 6
j < parallel_count;
j++18
)
1627
18
      {
1628
18
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1629
18
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1630
18
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1631
18
      }
1632
6
  }
1633
59
  const int tensors_init = !!compiled_data->tensors_init.v;
1634
59
  if (!tensors_init)
1635
35
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1636
24
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1637
  // Check whether it is fully allocated; if not, run init_1.
1638
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1639
59
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1640
59
  assert((input_size % parallel_count) == 0);
1641
59
  assert((output_size % parallel_count) == 0);
1642
59
  const int input_size_per_p = input_size / parallel_count;
1643
59
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1644
59
  const int output_size_per_p = output_size / parallel_count;
1645
59
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1646
59
  const int parameter_size = compiled_data->parameters->rnum;
1647
59
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1648
59
  const int internal_size = compiled_data->internals->rnum;
1649
59
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1650
59
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1651
  // If we generated gradients for the graph, only compile part of the graph because the rest is irrelevant for evaluation.
1652
59
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1653
59
  ccv_array_free(tensor_binds);
1654
59
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1655
  // If tensors are not init'ed, we need to init states first.
1656
59
  if (tensors_init && 
parallel_count > 124
)
1657
6
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1658
59
  if (_ccv_cnnp_any_to_init(compiled_data))
1659
17
  {
1660
17
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1661
17
      .parallel_count = parallel_count,
1662
17
      .graph = model->graph,
1663
17
      .compiled_data = compiled_data,
1664
17
      .tensor_arena = compiled_data->tensor_arena
1665
17
    };
1666
17
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1667
17
  }
1668
59
  compiled_data->is_test = 1;
1669
59
  ccv_nnc_graph_exec_update_t update = {
1670
59
    .parallel_count = parallel_count,
1671
59
    .graph = model->graph,
1672
59
    .graph_exec_arena = compiled_data->graph_exec_arena,
1673
59
  };
1674
59
  ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
1675
59
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1676
59
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1677
59
}
1678
1679
static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1680
32
{
1681
32
  assert(!compiled_data->tensors.gradients);
1682
32
  const int parameter_size = compiled_data->parameters->rnum;
1683
32
  const int parallel_count = ccv_max(model->parallel_count, 1);
1684
32
  compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count);
1685
32
  compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count;
1686
32
  int i, j;
1687
182
  for (i = 0; i < parameter_size; 
i++150
)
1688
150
  {
1689
150
    if (compiled_data->parameter_flags && 
!(compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))6
)
1690
2
    {
1691
2
      compiled_data->tensors.gradients[i] = 0;
1692
2
      compiled_data->tensors.accum_gradients[i] = 0;
1693
2
      for (j = 1; j < parallel_count; 
j++0
)
1694
0
      {
1695
0
        compiled_data->tensors.gradients[i + j * parameter_size] = 0;
1696
0
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1697
0
      }
1698
2
      continue;
1699
2
    }
1700
148
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1701
148
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1702
148
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
1703
38
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1704
148
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
1705
148
    compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0);
1706
148
    compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until when we need it.
1707
328
    for (j = 1; j < parallel_count; 
j++180
)
1708
180
    {
1709
180
      if (j != device_id)
1710
180
        CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1711
0
      else
1712
0
        CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1713
180
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1714
180
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1715
180
    }
1716
148
  }
1717
32
}
1718
1719
static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
1720
8.03k
{
1721
8.03k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
1722
15
    return 1;
1723
8.02k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
1724
8.01k
    return 0;
1725
7
  int i;
1726
7
  for (i = 0; i < input_size; 
i++0
)
1727
7
    if (!(disable_outgrad & ((uint64_t)1 << i)))
1728
7
      return 0;
1729
0
  return 1;
1730
7
}
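
To make the mask convention concrete: aside from the two special values, disable_outgrad is read as a per-input bitmask where a set bit i means input i gets no outgoing gradient. Below is a small sketch of the same counting loop used in _ccv_cnnp_model_gradient_init; the input_size and mask values are made up for the example.

#include <assert.h>
#include <stdint.h>

int main(void)
{
  const int input_size = 3;
  /* Disable outgrads for inputs 0 and 2; input 1 keeps its outgoing gradient. */
  const uint64_t disable_outgrad = ((uint64_t)1 << 0) | ((uint64_t)1 << 2);
  int outgrad_size = 0, i;
  for (i = 0; i < input_size; i++)
    if (!(disable_outgrad & ((uint64_t)1 << i)))
      ++outgrad_size;
  assert(outgrad_size == 1); /* only input 1 still gets an outgrad */
  return 0;
}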
1731
1732
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1733
// Particularly, this method compiles the evaluation and backprop graph (the main graph).
1734
static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1735
33
{
1736
33
  int i, j;
1737
33
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1738
33
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES1
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS32
;
1739
33
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode);
1740
33
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE;
1741
33
  const int parallel_count = ccv_max(model->parallel_count, 1);
1742
33
  assert(output_size == model->output_size * parallel_count);
1743
33
  assert(output_size > 0);
1744
  // There shouldn't be a loss function if we evaluate with multistage jit.
1745
33
  assert(compiled_data->loss.cmd == CCV_NNC_NOOP);
1746
33
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1747
31
  {
1748
31
    _ccv_cnnp_model_set_rewindables(model);
1749
31
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1750
31
  } else 
if (2
compiled_data->gradient_mode != target_gradient_mode2
) {
1751
2
    _ccv_cnnp_model_rewind_graph(model);
1752
2
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1753
2
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1754
2
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1755
2
  }
1756
33
  const int tensors_init = !!compiled_data->tensors_init.v;
1757
33
  if (!tensors_init)
1758
25
    _ccv_cnnp_model_tensors_init(model, compiled_data);
1759
8
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
1760
  // Check whether it is fully allocated; if not, run init_1.
1761
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
1762
33
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1763
33
  assert((input_size % parallel_count) == 0);
1764
33
  assert((output_size % parallel_count) == 0);
1765
33
  const int input_size_per_p = input_size / parallel_count;
1766
33
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1767
33
  const int output_size_per_p = output_size / parallel_count;
1768
33
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1769
33
  const int parameter_size = compiled_data->parameters->rnum;
1770
33
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1771
33
  const int internal_size = compiled_data->internals->rnum;
1772
33
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1773
33
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1774
33
  if (!compiled_data->tensors.gradients)
1775
32
    _ccv_cnnp_model_gradient_tensors_init(model, compiled_data);
1776
33
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1777
33
  if (compiled_data->backward.to_size > 0)
1778
33
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1779
0
  else
1780
0
    ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1781
33
  ccv_array_free(tensor_binds);
1782
33
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
1783
33
  if (tensors_init && 
parallel_count > 18
)
1784
0
    _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1785
  // If tensors are not init'ed, we need to init states first.
1786
33
  if (_ccv_cnnp_any_to_init(compiled_data))
1787
21
  {
1788
21
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1789
21
      .parallel_count = parallel_count,
1790
21
      .graph = model->graph,
1791
21
      .compiled_data = compiled_data,
1792
21
      .tensor_arena = compiled_data->tensor_arena
1793
21
    };
1794
21
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1795
21
  }
1796
33
  compiled_data->is_test = is_test;
1797
33
  ccv_nnc_graph_exec_update_t update = {
1798
33
    .parallel_count = parallel_count,
1799
33
    .graph = model->graph,
1800
33
    .graph_exec_arena = compiled_data->graph_exec_arena,
1801
33
  };
1802
33
  ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1803
33
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1804
33
  compiled_data->evaluate.to_op_size = 0;
1805
33
  ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0);
1806
84
  for (i = 0; i < evaluate_to_size; 
i++51
)
1807
51
  {
1808
51
    ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1809
51
    if (to_op.graph)
1810
51
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op;
1811
51
    const int* tos;
1812
51
    int to_size;
1813
51
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size);
1814
102
    for (j = 0; j < to_size; 
j++51
)
1815
51
    {
1816
51
      ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1817
51
        .d = tos[j],
1818
51
        .graph = model->graph
1819
51
      });
1820
51
      if (to_op.graph)
1821
51
        ccv_array_add_unique_int(backward_from, to_op.d);
1822
51
    }
1823
51
  }
1824
33
  assert(backward_from->rnum > 0);
1825
33
  compiled_data->backward.from_op_size = backward_from->rnum;
1826
33
  compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1827
84
  for (i = 0; i < backward_from->rnum; 
i++51
)
1828
51
    compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1829
51
      .d = *(int*)ccv_array_get(backward_from, i),
1830
51
      .graph = compiled_data->graph,
1831
51
    };
1832
  // If there are any set nodes (to set some tensors to 0) inserted through the backward pass, they won't be executed if we just do sources -> evaluate.to_ops, backward.from_ops -> destinations. We need this logic to find these nodes and explicitly add them to backward.from_ops.
1833
33
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(compiled_data->graph->exec_info, 0);
1834
33
  const int exec_info_size = compiled_data->graph->exec_info->rnum;
1835
33
  uint32_t* const visited = cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t));
1836
33
  const ccv_nnc_graph_exec_t* const sources = (ccv_nnc_graph_exec_t*)ccv_array_get(compiled_data->graph->sources, 0);
1837
33
  const int source_size = compiled_data->graph->sources->rnum;
1838
66
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new33
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size, 0);
1839
620
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1840
620
    visited[(idx >> 5)] |= (1u << (idx & 31));
1841
620
  } ccv_nnc_graph_visit_endfor
1842
66
  ccv_nnc_graph_visit_free(visit);
1843
66
  const ccv_nnc_graph_exec_t* const destinations = (ccv_nnc_graph_exec_t*)
ccv_array_get33
(compiled_data->graph->destinations, 0);
1844
66
  const int destination_size = compiled_data->graph->destinations->rnum;
1845
66
  visit = 
ccv_nnc_graph_visit_new33
(compiled_data->graph, exec_info, exec_info_size, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, destinations, destination_size, 0);
1846
676
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1847
676
    visited[(idx >> 5)] |= (1u << (idx & 31));
1848
676
  } ccv_nnc_graph_visit_endfor
1849
66
  ccv_nnc_graph_visit_free(visit);
1850
66
  visit = 
ccv_nnc_graph_visit_new33
(compiled_data->graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
1851
  // Find any missing nodes to be added as sources. Right now, these are only set nodes.
1852
1.34k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1853
1.34k
    if (!(visited[(idx >> 5)] & (1u << (idx & 31))))
1854
51
    {
1855
51
      assert(exec_info[idx].cmd.cmd == CCV_NNC_SET_FORWARD);
1856
51
      if (exec_info[idx].cmd.info.blas.a[0] == 0) // Special-case the set function that empties out the tensor, not the one that sets grad to 1.
1857
0
        ccv_array_add_unique_int(backward_from, idx);
1858
51
    }
1859
1.34k
  } ccv_nnc_graph_visit_endfor
1860
33
  ccv_nnc_graph_visit_free(visit);
1861
33
  ccfree(visited);
1862
33
  if (backward_from->rnum != compiled_data->backward.from_op_size) // If it doesn't match, need to redo this.
1863
0
  {
1864
0
    compiled_data->backward.from_op_size = backward_from->rnum;
1865
0
    compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccrealloc(compiled_data->backward.from_ops, sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1866
0
    for (i = 0; i < backward_from->rnum; i++)
1867
0
      compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1868
0
        .d = *(int*)ccv_array_get(backward_from, i),
1869
0
        .graph = compiled_data->graph,
1870
0
      };
1871
0
  }
1872
33
  ccv_array_free(backward_from);
1873
33
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count);
1874
33
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1875
33
}
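
The logic above tracks which exec nodes the forward (sources -> evaluate.to_ops) and backward (backward.from_ops -> destinations) traversals reach by setting one bit per node in a packed uint32_t array, then walks the whole graph once more to find set nodes neither traversal covered. A minimal, standalone sketch of that bitset idiom (the helper names are illustrative, not part of the library):

#include <stdint.h>
#include <stdlib.h>

/* One bit per node, 32 nodes per word; mirrors cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t)). */
static uint32_t* visited_new(const int node_count)
{
  return (uint32_t*)calloc((node_count + 31) >> 5, sizeof(uint32_t));
}

/* Mark node `idx` as visited. */
static void visited_mark(uint32_t* const visited, const int idx)
{
  visited[idx >> 5] |= (1u << (idx & 31));
}

/* Check whether node `idx` was visited. */
static int visited_check(const uint32_t* const visited, const int idx)
{
  return !!(visited[idx >> 5] & (1u << (idx & 31)));
}
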
1876
1877
void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1878
8.00k
{
1879
8.00k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1880
8.00k
  assert(compiled_data);
1881
8.00k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1882
8.00k
  assert(output_size == model->output_size * parallel_count);
1883
8.00k
  assert(input_size == model->input_size * parallel_count);
1884
8.00k
  assert(model->graph);
1885
8.00k
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? 
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES14
:
CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS7.99k
;
1886
8.00k
  const int mode_mismatch = (params.requires_grad && 
(7.85k
compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE7.85k
||
compiled_data->gradient_mode != target_gradient_mode7.82k
||
compiled_data->disable_outgrad != params.disable_outgrad7.82k
));
1887
8.00k
  if (!compiled_data->graph || 
mode_mismatch7.91k
)
1888
92
  {
1889
92
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1890
92
    if (mode_mismatch) // If the mode mismatches, we need to redo the backward pass as well (no need to redo apply_gradients, it doesn't depend on target_gradient_mode or disable_outgrad).
1891
33
      _ccv_cnnp_compiled_data_backward_free(compiled_data);
1892
92
    if (params.requires_grad)
1893
33
      _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size);
1894
59
    else
1895
59
      _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size);
1896
7.91k
  } else {
1897
7.91k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena);
1898
7.91k
    assert((input_size % parallel_count) == 0);
1899
7.91k
    const int input_size_per_p = input_size / parallel_count;
1900
7.91k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1901
7.91k
    assert((output_size % parallel_count) == 0);
1902
7.91k
    const int output_size_per_p = output_size / parallel_count;
1903
7.91k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1904
7.91k
  }
1905
8.00k
  if (compiled_data->is_test != params.is_test)
1906
64
  {
1907
64
    compiled_data->is_test = params.is_test;
1908
64
    ccv_nnc_graph_exec_update_t update = {
1909
64
      .parallel_count = parallel_count,
1910
64
      .graph = model->graph,
1911
64
      .graph_exec_arena = compiled_data->graph_exec_arena,
1912
64
    };
1913
64
    ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1914
64
  }
1915
8.00k
}
1916
1917
void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1918
8.00k
{
1919
8.00k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1920
8.00k
  assert(compiled_data);
1921
8.00k
  ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size);
1922
8.00k
  if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
1923
73
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1924
7.93k
  else {
1925
7.93k
    if (!compiled_data->evaluate.schedule)
1926
38
      compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size);
1927
7.93k
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context);
1928
7.93k
  }
1929
8.00k
}
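
For reference, a minimal sketch of calling the evaluate entry point above for plain inference: with requires_grad = 0 the call takes the no-grad graph path (MULTISTAGE_MODE_NO_GRAD) and no backward bookkeeping is done. The model is assumed to be constructed and compiled elsewhere, the tensors are assumed to match the compiled shapes, and the header name is an assumption about the public include:

#include "ccv_nnc.h" /* assumed public header; adjust to the install layout */

/* One forward pass in test mode, no gradients, no stream (blocking). */
static void run_inference(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const input, ccv_nnc_tensor_t* const output)
{
  const ccv_cnnp_evaluate_param_t params = {
    .requires_grad = 0, /* take the no-grad graph */
    .is_test = 1,       /* evaluation-mode behavior for layers that care */
    .disable_outgrad = 0, /* ignored when requires_grad == 0 */
  };
  ccv_cnnp_model_evaluate(model, params, TENSOR_LIST(input), TENSOR_LIST(output), 0, 0);
}
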
1930
1931
// Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1932
// Particularly, this method compiles the accumulator graph.
1933
static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model)
1934
5
{
1935
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1936
5
  assert(compiled_data);
1937
5
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1938
5
  ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new();
1939
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1940
5
  const int parameter_size = compiled_data->parameters->rnum;
1941
5
  int i, j;
1942
5
  compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3);
1943
5
  compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count;
1944
5
  compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count;
1945
20
  for (i = 0; i < parameter_size; 
i++15
)
1946
30
    
for (j = 0; 15
j < parallel_count;
j++15
)
1947
15
      if (compiled_data->tensors.gradients[i + j * parameter_size])
1948
15
      {
1949
15
        const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info;
1950
        // Now the old gradient becomes the accumulated gradient; set up a new gradient tensor so we can collect into it.
1951
15
        compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size];
1952
15
        compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1953
15
        ccv_nnc_tensor_symbol_t inputs[2];
1954
15
        inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1955
15
        inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1956
15
        ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1957
15
        ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
1958
15
      } else {
1959
0
        compiled_data->backward.accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1960
0
        compiled_data->backward.gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1961
0
        compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL;
1962
0
      }
1963
5
  ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1964
5
  if (ccv_nnc_symbolic_graph_source_size(accum) == 0)
1965
0
  {
1966
0
    ccv_nnc_symbolic_graph_free(accum);
1967
    // Create empty graph.
1968
0
    compiled_data->backward.accum = ccv_nnc_graph_new();
1969
0
    ccv_nnc_graph_topsort(compiled_data->backward.accum, 0, 0);
1970
0
    return;
1971
0
  }
1972
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1973
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1974
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds);
1975
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1976
5
  ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena);
1977
5
  ccv_nnc_symbolic_graph_free(accum);
1978
5
  ccv_array_free(tensor_binds);
1979
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type, model->max_stream_count);
1980
5
}
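
The accumulator graph built above reduces each accumulation step to an element-wise sum, updated_accum = accum + grad, expressed with CMD_EWSUM_FORWARD over the three tensor symbols per parameter. The same command can also be run eagerly outside of a graph; a small hedged sketch using only calls that appear in this file:

/* Eagerly compute updated_accum = accum + grad, the same reduction the accumulator graph encodes. */
static void accumulate_once(ccv_nnc_tensor_t* const accum, ccv_nnc_tensor_t* const grad, ccv_nnc_tensor_t* const updated_accum)
{
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(accum, grad), TENSOR_LIST(updated_accum), 0);
}
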
1981
1982
void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1983
7.91k
{
1984
7.91k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1985
7.91k
  assert(compiled_data);
1986
7.91k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1987
7.91k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1988
7.91k
  assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count);
1989
7.91k
  if (outgrad_size > 0)
1990
2.51k
    { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); }
1991
7.91k
  assert(model->graph);
1992
7.91k
  assert(compiled_data->graph);
1993
7.91k
  const int parameter_size = compiled_data->parameters->rnum;
1994
  // If we need to accumulate the gradients now, do jit on accumulator.
1995
7.91k
  if (compiled_data->backward.count > 0)
1996
1.71k
  {
1997
1.71k
    if (!compiled_data->backward.accum)
1998
5
      _ccv_cnnp_model_multistage_jit_1(model);
1999
1.71k
    else if (compiled_data->backward.count == 1) {
2000
      // On this round, we need to swap the accumulated gradients with the gradients (so we can do accumulation properly).
2001
496
      int i;
2002
1.48k
      for (i = 0; i < parameter_size * parallel_count; 
i++986
)
2003
986
      {
2004
986
        ccv_nnc_tensor_t* tensor;
2005
986
        CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor);
2006
986
      }
2007
496
      if (compiled_data->backward.tensor_arena)
2008
496
      {
2009
496
        ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena);
2010
        // Rebind in case we messed up the binding (we swapped accum_gradients and gradients).
2011
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1);
2012
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
2013
496
        _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
2014
496
      }
2015
496
    }
2016
1.71k
  }
2017
7.91k
  const int ingrad_size_per_p = model->output_size;
2018
7.91k
  const int outgrad_size_per_p = compiled_data->outgrad_size;
2019
7.91k
  int i, j;
2020
15.8k
  for (i = 0; i < ingrad_size_per_p; 
i++7.91k
)
2021
7.91k
  {
2022
7.91k
    const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
2023
7.91k
    if (!ingrad_size || 
!ingrads3.79k
||
ingrads[i] == 03.79k
)
2024
4.22k
    {
2025
      // Set it to 1 if it is not specified.
2026
4.22k
      ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad);
2027
4.22k
      if (ingrad_tensor)
2028
4.22k
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2029
4.34k
      for (j = 1; j < parallel_count; 
j++120
)
2030
120
      {
2031
120
        ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j));
2032
120
        if (ingrad_tensor)
2033
120
          ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
2034
120
      }
2035
4.22k
    } else {
2036
      // Make sure the length matches, in case it is an alias.
2037
3.69k
      assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad)));
2038
3.69k
      ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]);
2039
3.69k
      for (j = 1; j < parallel_count; 
j++6
)
2040
6
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]);
2041
3.69k
    }
2042
7.91k
  }
2043
7.91k
  if (outgrad_size > 0)
2044
2.51k
  {
2045
2.51k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad");
2046
5.14k
    
for (i = 0; 2.51k
i < outgrad_size_per_p;
i++2.62k
)
2047
2.62k
      if (outgrads[i])
2048
2.43k
      {
2049
2.43k
        const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i];
2050
2.43k
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]);
2051
2.43k
        for (j = 1; j < parallel_count; 
j++6
)
2052
6
          ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]);
2053
2.43k
      }
2054
5.40k
  } else {
2055
5.40k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES ||
2056
5.40k
      compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS);
2057
5.40k
  }
2058
  // We need to rebind here because in ccv_cnnp_model_evaluate we clear bindings, which resets all bindings for the gradients.
2059
  // For parameters and internals this is fine because clearing bindings restores the original bindings, which are these
2060
  // parameters and internals. The same cannot be said for gradients, due to the accum_gradients swapping.
2061
7.91k
  _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2062
7.91k
  if (!compiled_data->backward.schedule)
2063
28
    compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0);
2064
  // Run the backward pass.
2065
7.91k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context);
2066
  // If we need to run an accumulation round, do that now.
2067
7.91k
  if (compiled_data->backward.count > 0)
2068
1.71k
    ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context);
2069
  // Update the count, this determines whether we need to accumulate or not.
2070
7.91k
  ++compiled_data->backward.count;
2071
7.91k
}
2072
2073
// Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE).
2074
// Particularly, this method compiles the parameter update graph.
2075
static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model)
2076
24
{
2077
24
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2078
24
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2079
24
  const int parallel_count = ccv_max(model->parallel_count, 1);
2080
24
  const int parameter_size = compiled_data->parameters->rnum;
2081
24
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
2082
24
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2083
24
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
2084
  // Bind accumulated gradients.
2085
24
  if (compiled_data->backward.count > 1)
2086
4
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds);
2087
20
  else
2088
20
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
2089
24
  ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0);
2090
24
  int i, j;
2091
256
  for (i = 0; i < compiled_data->backward.to_size; 
i++232
)
2092
232
  {
2093
232
    const int* tos;
2094
232
    int to_size;
2095
232
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size);
2096
738
    for (j = 0; j < to_size; 
j++506
)
2097
506
    {
2098
      // Check if this already shows up in the backward graph; if that is the case, it won't be in the apply
2099
      // gradients graph.
2100
506
      const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
2101
506
        .d = tos[j],
2102
506
        .graph = model->graph,
2103
506
      });
2104
506
      if (!exec.graph)
2105
316
        ccv_array_add_unique_int(apply_gradients_from, tos[j]);
2106
506
    }
2107
232
  }
2108
24
  const int from_size = apply_gradients_from->rnum;
2109
24
  if (from_size == 0)
2110
0
  {
2111
0
    ccv_array_free(apply_gradients_from);
2112
0
    ccv_array_free(tensor_binds);
2113
0
    return;
2114
0
  }
2115
24
  ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size);
2116
160
  for (i = 0; i < from_size; 
i++136
)
2117
136
    froms[i] = (ccv_nnc_graph_exec_symbol_t){
2118
136
      .d = *(int*)ccv_array_get(apply_gradients_from, i),
2119
136
      .graph = model->graph
2120
136
    };
2121
24
  ccv_array_free(apply_gradients_from);
2122
  // It can only end with updates on the parameters.
2123
24
  ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0);
2124
160
  for (i = 0;  i < parameter_size; 
i++136
)
2125
136
  {
2126
136
    if (compiled_data->update_nodes[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
2127
0
      continue;
2128
136
    ccv_array_push(tos, &compiled_data->update_nodes[i]);
2129
316
    for (j = 1; j < parallel_count; 
j++180
)
2130
180
    {
2131
180
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j);
2132
180
      ccv_array_push(tos, &copy);
2133
180
    }
2134
136
  }
2135
24
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena);
2136
24
  ccv_array_free(tos);
2137
24
  ccv_array_free(tensor_binds);
2138
24
  ccfree(froms);
2139
24
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2140
219
  for (i = 0; i < max_saved_aux_size * parameter_size; 
i++195
)
2141
195
  {
2142
    // Skip on no tensor.
2143
195
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
2144
0
      continue;
2145
195
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source);
2146
195
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
2147
543
    for (j = 1; j < parallel_count; 
j++348
)
2148
348
    {
2149
348
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
2150
348
      if (copy)
2151
348
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
2152
348
    }
2153
195
  }
2154
24
  ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type, model->max_stream_count);
2155
24
}
2156
2157
void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context)
2158
7.84k
{
2159
7.84k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2160
7.84k
  assert(compiled_data);
2161
7.84k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
2162
7.84k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2163
7.84k
  assert(model->graph);
2164
7.84k
  assert(compiled_data->graph);
2165
  // Skip if there is no backward pass.
2166
7.84k
  if (compiled_data->backward.count <= 0)
2167
1.65k
    return;
2168
  // Skip if there are no parameters.
2169
6.19k
  if (compiled_data->parameters->rnum == 0)
2170
3
  {
2171
3
    compiled_data->backward.count = 0;
2172
3
    return;
2173
3
  }
2174
6.19k
  if (!compiled_data->apply_gradients.graph)
2175
24
    _ccv_cnnp_model_multistage_jit_2(model);
2176
6.16k
  else {
2177
6.16k
    const int parameter_size = compiled_data->parameters->rnum;
2178
6.16k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena);
2179
    // Change to bind accum_gradients if we do gradient accumulation (run backward more than once).
2180
6.16k
    if (compiled_data->backward.count > 1)
2181
497
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count);
2182
5.67k
    else
2183
5.67k
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
2184
6.16k
  }
2185
6.19k
  if (compiled_data->apply_gradients.graph)
2186
6.19k
    ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context);
2187
  // Reset backward count to 0.
2188
6.19k
  compiled_data->backward.count = 0;
2189
6.19k
}
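
Taken together, ccv_cnnp_model_evaluate, ccv_cnnp_model_backward and ccv_cnnp_model_apply_gradients form one training step, and calling backward more than once before apply_gradients takes the accumulation path traced above (backward.count > 0 runs the accumulator graph, and apply_gradients then binds accum_gradients instead of gradients). A hedged sketch of that calling pattern, assuming an already-compiled model with a minimizer set and loss tensors as the model outputs; the names and the blocking (null stream) calls are illustrative:

/* One optimizer step accumulated over several micro-batches. */
static void train_step(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const losses, const int loss_size, const int micro_batches)
{
  int i;
  for (i = 0; i < micro_batches; i++)
  {
    /* (the next micro-batch of data would be copied into the input tensors here) */
    const ccv_cnnp_evaluate_param_t params = {
      .requires_grad = 1, /* build / reuse the gradient-carrying graph */
      .is_test = 0,
    };
    ccv_cnnp_model_evaluate(model, params, inputs, input_size, losses, loss_size, 0, 0);
    /* No ingrads: the gradient of each loss output is set to 1. No outgrads requested.
     * From the second iteration on, gradients accumulate into accum_gradients. */
    ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
  }
  /* Applies the minimizer using the (possibly accumulated) gradients and resets backward.count. */
  ccv_cnnp_model_apply_gradients(model, 0);
}
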
2190
2191
void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor)
2192
35
{
2193
35
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2194
35
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 18
:
parameter->param_sel27
;
2195
35
  assert(parameter->param_sel != 0);
2196
35
  const int tensors_init = !!compiled_data->tensors_init.v;
2197
35
  if (!tensors_init)
2198
19
    _ccv_cnnp_model_tensors_init(model, compiled_data);
2199
16
  else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1)
2200
  // Check whether it is fully allocated; if it is not, run init_1.
2201
0
    ccv_cnnp_model_tensors_init_1(model, compiled_data);
2202
35
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2203
35
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2204
35
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 134
:
parameter->param_ref1
;
2205
35
  if (param_ref < 0)
2206
1
    { assert(parameter_indices->rnum == 1); }
2207
34
  else
2208
34
    { assert(param_ref < parameter_indices->rnum); }
2209
35
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2210
35
  ccv_array_free(parameter_indices);
2211
35
  const int parameter_size = compiled_data->parameters->rnum;
2212
35
  assert(d >= 0);
2213
35
  assert(d < parameter_size);
2214
35
  const int parallel_count = ccv_max(model->parallel_count, 1);
2215
35
  ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2216
35
  assert(dest);
2217
35
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0);
2218
35
  int i;
2219
35
  for (i = 1; i < parallel_count; 
i++0
)
2220
0
  {
2221
0
    ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d + i * parameter_size]);
2222
0
    if (copy_tensor)
2223
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2224
0
  }
2225
  // Mark this symbol as init'ed.
2226
35
  const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d;
2227
35
  uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2228
35
  init_v[s >> 5] |= (1u << (s & 0x1f));
2229
35
}
2230
2231
void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor)
2232
6
{
2233
6
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2234
6
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 13
:
parameter->param_sel3
;
2235
6
  assert(parameter->param_sel != 0);
2236
6
  assert(compiled_data->tensors.parameters);
2237
6
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2238
6
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2239
6
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 13
:
parameter->param_ref3
;
2240
6
  if (param_ref < 0)
2241
3
    { assert(parameter_indices->rnum == 1); }
2242
3
  else
2243
3
    { assert(param_ref < parameter_indices->rnum); }
2244
6
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2245
6
  ccv_array_free(parameter_indices);
2246
6
  const int parameter_size = compiled_data->parameters->rnum;
2247
6
  assert(d >= 0);
2248
6
  assert(d < parameter_size);
2249
  // We don't need to consider parallel_count, every parameter on each device is identical.
2250
6
  ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2251
6
  assert(src);
2252
6
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0);
2253
6
}
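
A short sketch of a round trip through the two calls above: selector -1 with index 0 addresses the first parameter of the model as a whole, and ccv_cnnp_model_parameter_tensor_params (defined just below) sizes a staging tensor to match it. This assumes the parameters have already been materialized (e.g. after an evaluate call or an explicit load) and that allocating the staging tensor with the parameter's own tensor params, device included, is acceptable:

/* Copy the first parameter out to a staging tensor, then write it back. */
static void roundtrip_first_parameter(ccv_cnnp_model_t* const model)
{
  const ccv_cnnp_model_io_t first = ccv_cnnp_model_parameters(model, -1, 0);
  const ccv_nnc_tensor_param_t info = ccv_cnnp_model_parameter_tensor_params(model, first);
  ccv_nnc_tensor_t* const staging = ccv_nnc_tensor_new(0, info, 0);
  ccv_cnnp_model_parameter_copy(model, first, staging); /* model -> staging */
  ccv_cnnp_model_set_parameter(model, first, staging);  /* staging -> model (and any data-parallel copies) */
  ccv_nnc_tensor_free(staging);
}
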
2254
2255
ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2256
1
{
2257
1
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2258
1
  const int param_sel = parameter->param_sel > 0 ? 
parameter->param_sel - 10
: parameter->param_sel;
2259
1
  assert(parameter->param_sel != 0);
2260
1
  assert(compiled_data->tensors.parameters);
2261
1
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2262
1
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2263
1
  const int param_ref = parameter->param_ref > 0 ? 
parameter->param_ref - 10
: parameter->param_ref;
2264
1
  if (param_ref < 0)
2265
1
    { assert(parameter_indices->rnum == 1); }
2266
0
  else
2267
0
    { assert(param_ref < parameter_indices->rnum); }
2268
1
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2269
1
  ccv_array_free(parameter_indices);
2270
1
  const int parameter_size = compiled_data->parameters->rnum;
2271
1
  assert(d >= 0);
2272
1
  assert(d < parameter_size);
2273
  // We don't need to consider parallel_count, every parameter on each device is identical.
2274
1
  ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]);
2275
1
  assert(tensor);
2276
1
  return tensor->info;
2277
1
}
2278
2279
const char* ccv_cnnp_model_parameter_name(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
2280
2
{
2281
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2282
2
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : 
parameter->param_sel0
;
2283
2
  assert(parameter->param_sel != 0);
2284
2
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2285
2
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
2286
2
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : 
parameter->param_ref0
;
2287
2
  if (param_ref < 0)
2288
0
    { assert(parameter_indices->rnum == 1); }
2289
2
  else
2290
2
    { assert(param_ref < parameter_indices->rnum); }
2291
2
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
2292
2
  ccv_array_free(parameter_indices);
2293
2
  const int parameter_size = compiled_data->parameters->rnum;
2294
2
  assert(d >= 0);
2295
2
  assert(d < parameter_size);
2296
2
  return *(char**)ccv_array_get(compiled_data->ids.parameters, d);
2297
2
}
2298
2299
int ccv_cnnp_model_parameter_count(ccv_cnnp_model_t* const model)
2300
0
{
2301
0
  assert(model->compiled_data);
2302
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2303
0
  return compiled_data->parameters->rnum;
2304
0
}
2305
2306
uint64_t ccv_cnnp_model_parameters_size(ccv_cnnp_model_t* const model)
2307
0
{
2308
0
  assert(model->compiled_data);
2309
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2310
0
  const int parameter_size = compiled_data->parameters->rnum;
2311
0
  int i;
2312
0
  const ccv_nnc_symbolic_graph_t* const graph = model->graph;
2313
0
  uint64_t size = 0;
2314
0
  const int tensors_init = !!compiled_data->tensors_init.v;
2315
0
  uint32_t* const init_v = tensors_init ? CCV_NNC_INIT_V(compiled_data->tensors_init.v) : 0;
2316
0
  for (i = 0; i < parameter_size; i++)
2317
0
  {
2318
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2319
0
    if (tensors_init && compiled_data->tensors.parameters && (init_v[d >> 5] | (1u << (d & 0x1f))) && compiled_data->tensors.parameters[i])
2320
0
    {
2321
0
      ccv_nnc_tensor_param_t params = compiled_data->tensors.parameters[i]->info;
2322
0
      size += ccv_nnc_tensor_data_size(params);
2323
0
      continue;
2324
0
    }
2325
0
    ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(graph, (ccv_nnc_tensor_symbol_t){
2326
0
      .graph = graph,
2327
0
      .d = d
2328
0
    });
2329
0
    size += ccv_nnc_tensor_data_size(params);
2330
0
  }
2331
0
  return size;
2332
0
}
2333
2334
int ccv_cnnp_model_parameters_move(ccv_cnnp_model_t* const model, char** const names, ccv_nnc_tensor_t** const tensors, const int count, int type)
2335
3
{
2336
3
  assert(model->compiled_data);
2337
3
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2338
3
  if (count != compiled_data->parameters->rnum)
2339
0
    return 0;
2340
3
  if (CCV_TENSOR_GET_DEVICE(type) == CCV_COMPUTE_DEVICE_ANY)
2341
0
    CCV_TENSOR_SET_DEVICE_ID(type, 0);
2342
3
  int i;
2343
  // We don't need to consider parallel_count, every parameter on each device is identical.
2344
6
  for (i = 0; i < count; 
i++3
)
2345
3
  {
2346
3
    ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i];
2347
3
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned, we don't do anything.
2348
0
    {
2349
0
      tensors[i] = 0;
2350
0
      continue;
2351
0
    }
2352
3
    tensor = CCV_NNC_TENSOR(tensor);
2353
3
    if (tensor->info.type == type)
2354
3
      tensors[i] = tensor;
2355
0
    else {
2356
0
      ccv_nnc_tensor_param_t info = tensor->info;
2357
0
      info.type = type;
2358
0
      tensors[i] = ccv_nnc_tensor_new(0, info, 0); // Create this tensor, don't initiate copy yet.
2359
0
    }
2360
3
  }
2361
6
  for (i = 0; i < count; 
i++3
)
2362
3
  {
2363
3
    ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i];
2364
3
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned, we don't do anything.
2365
0
      continue;
2366
3
    tensor = CCV_NNC_TENSOR(tensor);
2367
    // Now initiate the transfer. We should do this on a stream.
2368
3
    if (tensor->info.type != type)
2369
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(tensors[i]), 0);
2370
3
  }
2371
  // Copy names and remove parameters.
2372
6
  for (i = 0; i < count; 
i++3
)
2373
3
  {
2374
3
    ccv_nnc_tensor_t* const tensor = compiled_data->tensors.parameters[i];
2375
3
    if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned, we don't do anything.
2376
0
    {
2377
0
      names[i] = 0;
2378
0
      continue;
2379
0
    }
2380
3
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2381
3
    const size_t name_len = ccv_min(strnlen(name, 1023), 1023);
2382
3
    names[i] = ccmalloc(name_len + 1);
2383
3
    names[i][name_len] = 0;
2384
3
    memcpy(names[i], name, name_len);
2385
3
    if (tensor->info.type == type)
2386
3
      compiled_data->tensors.parameters[i] = 0; // Only move when it is moved.
2387
3
  }
2388
3
  return 1;
2389
3
}
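
A hedged sketch of driving the move above: the caller supplies arrays sized with ccv_cnnp_model_parameter_count, and on a successful move owns the returned names and any tensors that were actually handed over (shared, not-owned parameters come back as 0). CCV_TENSOR_CPU_MEMORY is assumed to be the usual ccv_nnc tensor type flag for CPU-resident tensors, and freeing the names with free() assumes the default ccmalloc/ccfree mapping to malloc/free:

#include <stdlib.h>

static void move_parameters_out(ccv_cnnp_model_t* const model)
{
  const int count = ccv_cnnp_model_parameter_count(model);
  char** const names = (char**)calloc(count, sizeof(char*));
  ccv_nnc_tensor_t** const tensors = (ccv_nnc_tensor_t**)calloc(count, sizeof(ccv_nnc_tensor_t*));
  if (ccv_cnnp_model_parameters_move(model, names, tensors, count, CCV_TENSOR_CPU_MEMORY))
  {
    int i;
    for (i = 0; i < count; i++)
    {
      /* ... consume tensors[i] keyed by names[i] ... */
      if (names[i])
        free(names[i]); /* assumption: ccmalloc is plain malloc in this build */
      if (tensors[i])
        ccv_nnc_tensor_free(tensors[i]);
    }
  }
  free(names);
  free(tensors);
}
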
2390
2391
KHASH_MAP_INIT_STR(ccv_cnnp_parameter_id, int)
2392
2393
void ccv_cnnp_model_set_parameters_from_key_values(ccv_cnnp_model_t* const model, char* const* const names, ccv_nnc_tensor_t** const tensors, const int count, const int invalidates)
2394
2
{
2395
2
  assert(model->compiled_data);
2396
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2397
2
  int i;
2398
2
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2399
2
  if (count != compiled_data->parameters->rnum)
2400
0
  {
2401
0
    id_map = kh_init(ccv_cnnp_parameter_id);
2402
    // Build the map between name and the index.
2403
0
    for (i = 0; i < count; i++)
2404
0
    {
2405
0
      int ret;
2406
0
      const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[i], &ret);
2407
0
      assert(ret != 0);
2408
0
      kh_val(id_map, k) = i;
2409
0
    }
2410
0
  }
2411
2
  const int parameter_size = compiled_data->parameters->rnum;
2412
2
  int* copy_back = 0;
2413
2
  const int tensors_init = !!compiled_data->tensors_init.v;
2414
2
  if (!tensors_init)
2415
1
    ccv_cnnp_model_tensors_init_0(model, compiled_data);
2416
2
  const int parallel_count = ccv_max(model->parallel_count, 1);
2417
2
  uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2418
4
  for (i = 0; i < parameter_size; 
i++2
)
2419
2
  {
2420
2
    int j = i;
2421
2
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2422
2
    if (i >= 0 || 
strncmp(name, names[i], 1023) != 00
)
2423
2
    {
2424
      // Build the map.
2425
2
      if (id_map == 0)
2426
2
      {
2427
2
        id_map = kh_init(ccv_cnnp_parameter_id);
2428
4
        for (j = 0; j < count; 
j++2
)
2429
2
        {
2430
2
          int ret;
2431
2
          const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[j], &ret);
2432
2
          assert(ret != 0);
2433
2
          kh_val(id_map, k) = j;
2434
2
        }
2435
2
      }
2436
2
      const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, name);
2437
2
      if (k == kh_end(id_map)) // Cannot find the name, skip.
2438
0
        continue;
2439
2
      j = kh_val(id_map, k);
2440
2
    }
2441
2
    if (compiled_data->tensors.parameters[i]) // Cannot be a shared parameter to read.
2442
0
      { assert(!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)); }
2443
2
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
2444
2
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
2445
2
    if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY)
2446
1
      CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2447
2
    const int d = parameter.d;
2448
2
    if (info.type == tensors[j]->info.type && invalidates) // Can move.
2449
1
    {
2450
      // Deallocate it if needed.
2451
1
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
2452
1
        if (compiled_data->tensors.parameters[i])
2453
0
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
2454
1
      compiled_data->tensors.parameters[i] = tensors[j];
2455
1
      tensors[j] = 0;
2456
1
    } else {
2457
1
      if (!compiled_data->tensors.parameters[i])
2458
1
      { // Not allocated; allocate it first.
2459
        // Create a new one; make sure we create it with the right parameters.
2460
1
        const int type = info.type;
2461
1
        info = tensors[j]->info;
2462
1
        info.type = type; // Revert back the type.
2463
1
        compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
2464
1
      }
2465
1
      if (!copy_back)
2466
1
        copy_back = (int*)cccalloc(parameter_size, sizeof(int));
2467
1
      copy_back[i] = j + 1;
2468
1
    }
2469
2
    init_v[d >> 5] |= (1u << (d & 0x1f));
2470
    // Create this tensor for other data parallel allocations.
2471
2
    info = compiled_data->tensors.parameters[i]->info; // In case we loaded a different info.
2472
2
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
2473
2
    for (j = 1; j < parallel_count; 
j++0
)
2474
0
      if (!compiled_data->tensors.parameters[i + j * parameter_size])
2475
0
      {
2476
0
        if (j != device_id)
2477
0
          CCV_TENSOR_SET_DEVICE_ID(info.type, j);
2478
0
        else
2479
0
          CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
2480
0
        compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
2481
0
      }
2482
      // No need to copy over, this is done in ccv_cnnp_model.c's copy_tensors method.
2483
2
  }
2484
2
  if (id_map)
2485
2
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2486
  // Now do the transfer.
2487
2
  if (copy_back)
2488
1
  {
2489
2
    for (i = 0; i < parameter_size; 
i++1
)
2490
1
    {
2491
1
      ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[i]);
2492
1
      if (copy_back[i] == 0)
2493
0
        continue;
2494
1
      const int j = copy_back[i] - 1;
2495
1
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensors[j]), TENSOR_LIST(tensor), 0);
2496
1
    }
2497
1
    ccfree(copy_back);
2498
1
  }
2499
2
}
2500
2501
ccv_cnnp_model_io_t ccv_cnnp_model_parameter_first(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f first, void* const context)
2502
0
{
2503
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2504
0
  assert(compiled_data);
2505
0
  const int parameter_size = compiled_data->parameters->rnum;
2506
0
  int i;
2507
0
  for (i = 0; i < parameter_size; i++)
2508
0
  {
2509
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2510
0
    if (first(model, name, context))
2511
0
      return ccv_cnnp_model_parameters(model, -1, i);
2512
0
  }
2513
0
  return 0;
2514
0
}
2515
2516
ccv_array_t* ccv_cnnp_model_parameters_filter(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f filter, void* const context)
2517
0
{
2518
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2519
0
  assert(compiled_data);
2520
0
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 0, 0);
2521
0
  const int parameter_size = compiled_data->parameters->rnum;
2522
0
  int i;
2523
0
  for (i = 0; i < parameter_size; i++)
2524
0
  {
2525
0
    const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i);
2526
0
    if (filter(model, name, context))
2527
0
    {
2528
0
      ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i);
2529
0
      ccv_array_push(parameters, &parameter);
2530
0
    }
2531
0
  }
2532
0
  return parameters;
2533
2534
0
}
2535
2536
CCV_WARN_UNUSED(ccv_cnnp_model_io_t) ccv_cnnp_model_parameter_first_uninit(ccv_cnnp_model_t* const model)
2537
0
{
2538
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2539
0
  assert(compiled_data);
2540
0
  const int tensors_init = !!compiled_data->tensors_init.v;
2541
0
  if (!tensors_init) // If nothing initialized, we return parameter 0.
2542
0
    return ccv_cnnp_model_parameters(model, -1, 0);
2543
0
  const int parameter_size = compiled_data->parameters->rnum;
2544
0
  int i;
2545
0
  const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v);
2546
0
  for (i = 0; i < parameter_size; i++)
2547
0
  {
2548
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
2549
0
    if (!(init_v[d >> 5] & (1u << (d & 0x1f))))
2550
0
      return ccv_cnnp_model_parameters(model, -1, i);
2551
0
  }
2552
0
  return 0;
2553
0
}
2554
2555
static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref)
2556
49
{
2557
49
  const int to_param_sel = parameters->param_sel > 0 ? 
parameters->param_sel - 10
: parameters->param_sel;
2558
49
  assert(parameters->param_sel != 0);
2559
49
  ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2560
49
  ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices);
2561
49
  *param_ref = parameters->param_ref > 0 ? 
parameters->param_ref - 10
: parameters->param_ref;
2562
49
  return to_parameter_indices;
2563
49
}
2564
2565
static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref, const int only_init_0)
2566
14
{
2567
  // If the model is not compiled yet, compile it now.
2568
14
  if (!model->graph)
2569
3
  {
2570
3
    model->graph = ccv_nnc_symbolic_graph_new();
2571
3
    assert(from_model->compiled_data);
2572
3
    const int input_size = from_model->input_size;
2573
3
    ccv_nnc_tensor_param_t input_params[input_size];
2574
3
    int i;
2575
9
    for (i = 0; i < input_size; 
i++6
)
2576
6
      input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]);
2577
3
    _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss);
2578
3
    model->parallel_count = from_model->parallel_count;
2579
3
    model->memory_compression = from_model->memory_compression;
2580
3
    model->memory_reduction = from_model->memory_reduction;
2581
3
    model->gradient_checkpointing = from_model->gradient_checkpointing;
2582
3
    model->compiled_data->stream_type = from_model->compiled_data->stream_type;
2583
3
    model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer;
2584
3
    model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size;
2585
3
  }
2586
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2587
14
  assert(to_compiled_data);
2588
14
  const int to_tensors_init = !!to_compiled_data->tensors_init.v;
2589
14
  if (!to_tensors_init)
2590
10
  {
2591
10
    if (only_init_0)
2592
1
      ccv_cnnp_model_tensors_init_0(model, to_compiled_data);
2593
9
    else
2594
9
      _ccv_cnnp_model_tensors_init(model, to_compiled_data);
2595
10
  } else 
if (4
!only_init_04
&&
(uintptr_t)to_compiled_data->tensors_init.v & (uintptr_t)13
)
2596
    // Check whether it is fully allocated; if it is not, run init_1.
2597
0
      ccv_cnnp_model_tensors_init_1(model, to_compiled_data);
2598
14
  assert(to_compiled_data->tensors.parameters);
2599
14
  *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref);
2600
14
  *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref);
2601
14
  if (*from_param_ref < 0 && *param_ref >= 0)
2602
0
    { assert((*from_parameter_indices)->rnum == 1); }
2603
14
  else if (*from_param_ref >= 0)
2604
0
    { assert(*from_param_ref < (*from_parameter_indices)->rnum); }
2605
14
  if (*param_ref < 0 && *from_param_ref >= 0)
2606
0
    { assert((*parameter_indices)->rnum == 1); }
2607
14
  else if (*param_ref >= 0)
2608
0
    { assert(*param_ref < (*parameter_indices)->rnum); }
2609
14
}
2610
2611
void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2612
9
{
2613
9
  ccv_array_t* to_parameter_indices;
2614
9
  int to_param_ref;
2615
9
  ccv_array_t* from_parameter_indices;
2616
9
  int from_param_ref;
2617
9
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2618
  // Should be exactly the same tensor.
2619
9
  if (to_param_ref < 0 && from_param_ref < 0)
2620
9
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2621
  // To models.
2622
9
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2623
9
  assert(to_compiled_data);
2624
  // From models.
2625
9
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2626
9
  const int parallel_count = ccv_max(model->parallel_count, 1);
2627
9
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2628
9
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2629
9
  int i, j;
2630
9
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2631
9
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2632
18
  for (i = 0; i < rnum; 
i++9
)
2633
9
  {
2634
9
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2635
9
    assert(src_d >= 0);
2636
9
    assert(src_d < from_compiled_data->parameters->rnum);
2637
9
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2638
    // If the original is not init'ed, we cannot copy from it.
2639
9
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2640
0
      continue;
2641
9
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2642
9
    assert(dest_d >= 0);
2643
9
    assert(dest_d < to_compiled_data->parameters->rnum);
2644
9
    ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2645
9
    assert(src);
2646
9
    ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2647
9
    assert(dest);
2648
9
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0);
2649
27
    for (j = 1; j < parallel_count; 
j++18
)
2650
18
    {
2651
18
      ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2652
18
      if (copy_tensor)
2653
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2654
18
    }
2655
    // Mark this symbol as init'ed.
2656
9
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2657
9
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2658
9
  }
2659
9
  ccv_array_free(to_parameter_indices);
2660
9
  ccv_array_free(from_parameter_indices);
2661
9
}
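
A usage sketch for the copy above: passing selector -1 and index -1 on both sides selects all parameters, so this clones every initialized parameter (and its data-parallel copies) from one model into another with the same parameter layout; the destination is compiled and its tensors materialized on demand, as the helper above shows. The function names here are illustrative:

/* Copy every parameter of `src` into `dst` (same architecture assumed). */
static void clone_parameters(ccv_cnnp_model_t* const dst, ccv_cnnp_model_t* const src)
{
  ccv_cnnp_model_set_parameters(dst,
    ccv_cnnp_model_parameters(dst, -1, -1),
    src,
    ccv_cnnp_model_parameters(src, -1, -1));
}
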
2662
2663
void ccv_cnnp_model_share_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_cnnp_model_parameters_renamer_f renamer, void* const context)
2664
2
{
2665
2
  ccv_array_t* to_parameter_indices;
2666
2
  int to_param_ref;
2667
2
  ccv_array_t* from_parameter_indices;
2668
2
  int from_param_ref;
2669
2
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 1);
2670
  // Should be exactly the same tensor.
2671
2
  if (renamer == 0 && 
to_param_ref < 01
&&
from_param_ref < 01
)
2672
1
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2673
  // To models.
2674
2
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2675
2
  assert(to_compiled_data);
2676
  // From models.
2677
2
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2678
2
  const int parallel_count = ccv_max(model->parallel_count, 1);
2679
2
  assert(parallel_count == ccv_max(from_model->parallel_count, 1)); // Must have the same parallel count to share parameters.
2680
2
  const int from_parameter_size = from_compiled_data->parameters->rnum;
2681
2
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2682
2
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2683
2
  int i, j;
2684
2
  khash_t(ccv_cnnp_parameter_id)* id_map = 0;
2685
2
  char* updated_name = 0;
2686
2
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2687
2
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2688
8
  for (i = 0; i < rnum; 
i++6
)
2689
6
  {
2690
6
    int src_d = (from_param_ref >= 0 ? 
from_param_ref0
: i) < from_parameter_indices->rnum ?
*(int*)4
ccv_array_get4
(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i) :
from_parameter_size2
;
2691
    // Need to figure out how to use the renamer here.
2692
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2693
6
    assert(dest_d >= 0);
2694
6
    assert(dest_d < to_parameter_size);
2695
6
    if (renamer)
2696
3
    {
2697
3
      const char* const src_name = (src_d < from_parameter_size && 
src_d >= 01
) ?
*(char**)1
ccv_array_get1
(from_compiled_data->ids.parameters, src_d) :
02
;
2698
3
      const char* const dest_name = *(char**)ccv_array_get(to_compiled_data->ids.parameters, dest_d);
2699
3
      if (!updated_name)
2700
1
        updated_name = (char*)ccmalloc(1024);
2701
3
      const size_t src_name_len = src_name == 0 ? 
02
:
ccv_min1
(strnlen(src_name, 1023), 1023);
2702
3
      if (src_name_len > 0)
2703
1
        memcpy(updated_name, src_name, src_name_len);
2704
3
      updated_name[src_name_len] = 0;
2705
3
      if (renamer(context, dest_name, updated_name, 1024) != 0)
2706
0
        continue; // Skip this.
2707
3
      if (src_name != 0 && 
memcmp(updated_name, src_name, src_name_len) == 01
&&
strnlen(updated_name, 1023) == src_name_len0
)
2708
0
      {
2709
        // Nothing changed.
2710
3
      } else {
2711
3
        if (!id_map)
2712
1
        {
2713
1
          id_map = kh_init(ccv_cnnp_parameter_id);
2714
2
          for (j = 0; j < from_parameter_size; 
j++1
)
2715
1
          {
2716
1
            int ret;
2717
1
            const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, *(char**)ccv_array_get(from_compiled_data->ids.parameters, j), &ret);
2718
1
            assert(ret != 0);
2719
1
            kh_val(id_map, k) = j;
2720
1
          }
2721
1
        }
2722
3
        const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, updated_name);
2723
3
        if (k == kh_end(id_map)) // Cannot find the name, skip.
2724
2
          continue;
2725
1
        src_d = kh_val(id_map, k);
2726
1
        assert(src_d >= 0);
2727
1
        assert(src_d < from_parameter_size);
2728
1
      }
2729
3
    }
2730
6
    assert
(src_d >= 0)4
;
2731
4
    assert(src_d < from_parameter_size);
2732
4
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2733
    // If the original is not init'ed, we cannot share from it.
2734
4
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2735
0
      continue;
2736
8
    
for (j = 0; 4
j < parallel_count;
j++4
)
2737
4
    {
2738
4
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * from_parameter_size]);
2739
4
      assert(src);
2740
4
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2741
4
      if (dest && 
!((uintptr_t)dest & (uintptr_t)1)1
)
2742
1
        ccv_nnc_tensor_free(dest);
2743
4
      to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size] = (ccv_nnc_tensor_t*)((uintptr_t)src | (uintptr_t)1);
2744
4
    }
2745
    // Mark this symbol as init'ed.
2746
4
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2747
4
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2748
4
  }
2749
2
  ccv_array_free(to_parameter_indices);
2750
2
  ccv_array_free(from_parameter_indices);
2751
2
  if (id_map)
2752
1
    kh_destroy(ccv_cnnp_parameter_id, id_map);
2753
2
  if (updated_name)
2754
1
    ccfree(updated_name);
2755
  // Mark it as incomplete so we will call init_1.
2756
2
  if (ccv_cnnp_model_tensors_any_to_alloc(model, to_compiled_data))
2757
0
    to_compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)to_compiled_data->tensors_init.v | (uintptr_t)1);
2758
2
  else // Remove the flag.
2759
2
    to_compiled_data->tensors_init.v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2760
2
}
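
Sharing, unlike the copy above, aliases the underlying tensors (the low pointer bit marks them as not owned) instead of duplicating them, and the renamer callback maps destination parameter names onto source names: it receives the destination name plus a buffer pre-filled with the tentative source name, may rewrite the buffer in place, and returns non-zero to skip that parameter. A hedged sketch, assuming the callback's parameter order matches the invocation above (context, destination name, in/out name buffer, buffer size):

#include <stdio.h>
#include <string.h>

/* Map "<prefix><rest>" destination names onto the source model's "<rest>" names. */
static int strip_prefix_renamer(void* const context, const char* const dest_name, char* const updated_name, const size_t provided_size)
{
  const char* const prefix = (const char*)context;
  const size_t prefix_len = strlen(prefix);
  if (strncmp(dest_name, prefix, prefix_len) != 0)
    return 1; /* non-zero return: skip sharing this parameter */
  snprintf(updated_name, provided_size, "%s", dest_name + prefix_len);
  return 0; /* share under the rewritten name */
}

/* Share all of `src`'s parameters into `dst`, matching by the stripped names. */
static void share_all(ccv_cnnp_model_t* const dst, ccv_cnnp_model_t* const src, const char* const prefix)
{
  ccv_cnnp_model_share_parameters(dst,
    ccv_cnnp_model_parameters(dst, -1, -1),
    src,
    ccv_cnnp_model_parameters(src, -1, -1),
    strip_prefix_renamer, (void*)prefix);
}
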
2761
2762
ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type)
2763
24
{
2764
24
  if (!compiled_data->stream_map)
2765
4
    compiled_data->stream_map = kh_init(stream_map);
2766
24
  int ret = 0;
2767
24
  khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret);
2768
24
  assert(ret >= 0);
2769
24
  ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k);
2770
  // If ret == 0, the key already exists and we can return directly; otherwise, create and return.
2771
24
  if (ret != 0)
2772
16
  {
2773
16
    stream = ccv_nnc_stream_context_new(type);
2774
16
    kh_val(compiled_data->stream_map, k) = stream;
2775
16
  }
2776
24
  return stream;
2777
24
}
2778
2779
void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2780
3
{
2781
3
  ccv_array_t* to_parameter_indices;
2782
3
  int to_param_ref;
2783
3
  ccv_array_t* from_parameter_indices;
2784
3
  int from_param_ref;
2785
3
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0);
2786
  // Should be exactly the same tensor.
2787
3
  if (to_param_ref < 0 && from_param_ref < 0)
2788
3
    { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); }
2789
  // To models.
2790
3
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2791
3
  assert(to_compiled_data);
2792
  // From models.
2793
3
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2794
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
2795
3
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2796
3
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 
10
;
2797
3
  assert(aux_in_size >= 0);
2798
3
  assert(aux_out_size >= 0);
2799
3
  int i, j;
2800
3
  ccv_nnc_tensor_t* inputs[aux_in_size + 2];
2801
3
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2802
3
  for (i = 0; i < aux_in_size; 
i++0
)
2803
0
    inputs[i + 2] = aux_ins[i];
2804
3
  for (i = 0; i < aux_out_size; 
i++0
)
2805
0
    outputs[i + 1] = aux_outs[i];
2806
3
  const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v);
2807
3
  uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v);
2808
6
  for (i = 0; i < rnum; 
i++3
)
2809
3
  {
2810
3
    const int src_d = *(int*)ccv_array_get(from_parameter_indices, from_param_ref >= 0 ? from_param_ref : i);
2811
3
    assert(src_d >= 0);
2812
3
    assert(src_d < from_compiled_data->parameters->rnum);
2813
3
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2814
    // If the original is not init'ed, we cannot copy from it.
2815
3
    if (!(from_init_v[s >> 5] & (1u << (s & 0x1f))))
2816
0
      continue;
2817
3
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2818
3
    assert(dest_d >= 0);
2819
3
    assert(dest_d < to_compiled_data->parameters->rnum);
2820
3
    if (parallel_count > 1)
2821
2
    {
2822
2
      ccv_nnc_stream_context_t* streams[parallel_count];
2823
2
      ccv_nnc_stream_signal_t* signal;
2824
2
      if (stream_context)
2825
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2826
10
      for (j = 0; j < parallel_count; 
j++8
)
2827
8
      {
2828
8
        ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * to_parameter_size]);
2829
8
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2830
8
        if (!dest || !src)
2831
0
        {
2832
0
          streams[j] = 0;
2833
0
          continue;
2834
0
        }
2835
        // At the moment, we can only handle them on the same device.
2836
8
        assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type));
2837
8
        assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type));
2838
8
        const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2839
8
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type);
2840
8
        int type = stream_type;
2841
8
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2842
8
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2843
        // Wait on the signal so work queued on stream_context finishes first.
2844
8
        if (stream_context)
2845
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2846
8
        inputs[0] = outputs[0] = dest;
2847
8
        inputs[1] = src;
2848
8
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0);
2849
8
        if (stream_context)
2850
4
        {
2851
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2852
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2853
4
        }
2854
8
        streams[j] = stream_0;
2855
8
      }
2856
      // If this should be blocking, block it.
2857
2
      if (!stream_context)
2858
5
        
for (j = 0; 1
j < parallel_count;
j++4
)
2859
4
          if (streams[j])
2860
4
            ccv_nnc_stream_context_wait(streams[j]);
2861
2
    } else {
2862
1
      ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]);
2863
1
      assert(src);
2864
1
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2865
1
      assert(dest);
2866
1
      inputs[0] = outputs[0] = dest;
2867
1
      inputs[1] = src;
2868
1
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context);
2869
1
    }
2870
    // Mark this symbol as init'ed.
2871
3
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2872
3
    to_init_v[d >> 5] |= (1u << (d & 0x1f));
2873
3
  }
2874
3
  ccv_array_free(to_parameter_indices);
2875
3
  ccv_array_free(from_parameter_indices);
2876
3
}
2877
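ccv_cnnp_model_parameters_zip_map above executes cmd once per selected parameter with the destination tensor as both inputs[0] and outputs[0] and the corresponding source tensor as inputs[1], so it lends itself to blending weights between two compiled models. A hedged sketch of an exponential-moving-average style update; it assumes CMD_ADD_FORWARD(p, q) computes p * inputs[0] + q * inputs[1] and that a selector/index of -1 passed to ccv_cnnp_model_parameters selects all parameters:

  // Hypothetical helper: avg_model accumulates an EMA of model's parameters.
  static void ema_update(ccv_cnnp_model_t* const avg_model, ccv_cnnp_model_t* const model)
  {
    ccv_cnnp_model_parameters_zip_map(avg_model, ccv_cnnp_model_parameters(avg_model, -1, -1),
      CMD_ADD_FORWARD(0.999, 0.001), ccv_nnc_no_hint, 0,
      0, 0, 0, 0, // no aux inputs / outputs
      0, // no stream context, run blocking
      model, ccv_cnnp_model_parameters(model, -1, -1));
  }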
2878
void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2879
15
{
2880
15
  int to_param_ref;
2881
15
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2882
  // To models.
2883
15
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2884
15
  assert(to_compiled_data);
2885
  // Tensors have to be init'ed already.
2886
15
  assert(!!to_compiled_data->tensors_init.v);
2887
15
  assert(to_compiled_data->tensors.parameters);
2888
  // This model's parallel count and parameter count.
2889
15
  const int parallel_count = ccv_max(model->parallel_count, 1);
2890
15
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2891
15
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2892
15
  assert(aux_in_size >= 0);
2893
15
  assert(aux_out_size >= 0);
2894
15
  int i, j;
2895
15
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2896
15
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2897
15
  for (i = 0; i < aux_in_size; 
i++0
)
2898
0
    inputs[i + 1] = aux_ins[i];
2899
15
  for (i = 0; i < aux_out_size; 
i++0
)
2900
0
    outputs[i + 1] = aux_outs[i];
2901
30
  for (i = 0; i < rnum; 
i++15
)
2902
15
  {
2903
15
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2904
15
    assert(dest_d >= 0);
2905
15
    assert(dest_d < to_compiled_data->parameters->rnum);
2906
15
    if (parallel_count > 1)
2907
4
    {
2908
4
      ccv_nnc_stream_context_t* streams[parallel_count];
2909
4
      ccv_nnc_stream_signal_t* signal;
2910
4
      if (stream_context)
2911
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2912
20
      for (j = 0; j < parallel_count; 
j++16
)
2913
16
      {
2914
16
        ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]);
2915
16
        if (!dest)
2916
0
        {
2917
0
          streams[j] = 0;
2918
0
          continue;
2919
0
        }
2920
16
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
2921
16
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2922
16
        int type = stream_type;
2923
16
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2924
16
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2925
        // Wait on the signal so work queued on stream_context finishes first.
2926
16
        if (stream_context)
2927
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2928
16
        inputs[0] = outputs[0] = dest;
2929
16
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2930
16
        if (stream_context)
2931
4
        {
2932
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2933
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2934
4
        }
2935
16
        streams[j] = stream_0;
2936
16
      }
2937
      // If this should be blocking, block it.
2938
4
      if (!stream_context)
2939
15
        
for (j = 0; 3
j < parallel_count;
j++12
)
2940
12
          if (streams[j])
2941
12
            ccv_nnc_stream_context_wait(streams[j]);
2942
11
    } else {
2943
11
      ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]);
2944
11
      assert(dest);
2945
11
      inputs[0] = outputs[0] = dest;
2946
11
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2947
11
    }
2948
    // No need to mark this symbol as init'ed; it already is.
2949
15
  }
2950
15
  ccv_array_free(to_parameter_indices);
2951
15
}
2952
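ccv_cnnp_model_parameters_map above applies cmd in place to each selected parameter (the tensor appears as both inputs[0] and outputs[0]) across every data-parallel copy. A minimal sketch that re-zeroes all parameters, assuming CMD_SET_FORWARD(0) fills a tensor with the given value and the same -1 "all parameters" selector as before:

  // Hypothetical helper: overwrite every parameter of an already-compiled model with 0.
  static void zero_parameters(ccv_cnnp_model_t* const model)
  {
    ccv_cnnp_model_parameters_map(model, ccv_cnnp_model_parameters(model, -1, -1),
      CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0,
      0, 0, 0, 0, // no aux tensors
      0); // blocking, no stream context
  }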
2953
void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2954
6
{
2955
6
  int to_param_ref;
2956
6
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2957
  // To models.
2958
6
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2959
6
  assert(to_compiled_data);
2960
  // Tensors have to be init'ed already.
2961
6
  assert(!!to_compiled_data->tensors_init.v);
2962
6
  ccv_nnc_tensor_t** tensor_gradients;
2963
6
  if (to_compiled_data->backward.count > 1)
2964
3
    tensor_gradients = to_compiled_data->tensors.accum_gradients;
2965
3
  else
2966
3
    tensor_gradients = to_compiled_data->tensors.gradients;
2967
6
  assert(tensor_gradients);
2968
  // This model's parallel count and parameter count.
2969
6
  const int parallel_count = ccv_max(model->parallel_count, 1);
2970
6
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2971
6
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 
10
;
2972
6
  assert(aux_in_size >= 0);
2973
6
  assert(aux_out_size >= 0);
2974
6
  int i, j;
2975
6
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2976
6
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2977
10
  for (i = 0; i < aux_in_size; 
i++4
)
2978
4
    inputs[i + 1] = aux_ins[i];
2979
14
  for (i = 0; i < aux_out_size; 
i++8
)
2980
8
    outputs[i + 1] = aux_outs[i];
2981
12
  for (i = 0; i < rnum; 
i++6
)
2982
6
  {
2983
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2984
6
    assert(dest_d >= 0);
2985
6
    assert(dest_d < to_compiled_data->parameters->rnum);
2986
6
    if (parallel_count > 1)
2987
0
    {
2988
0
      ccv_nnc_stream_context_t* streams[parallel_count];
2989
0
      ccv_nnc_stream_signal_t* signal;
2990
0
      if (stream_context)
2991
0
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2992
0
      for (j = 0; j < parallel_count; j++)
2993
0
      {
2994
0
        ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size];
2995
0
        if (!dest)
2996
0
        {
2997
0
          streams[j] = 0;
2998
0
          continue;
2999
0
        }
3000
0
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
3001
0
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
3002
0
        int type = stream_type;
3003
0
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
3004
0
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
3005
        // Wait on the signal so work queued on stream_context finishes first.
3006
0
        if (stream_context)
3007
0
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
3008
0
        inputs[0] = outputs[0] = dest;
3009
0
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
3010
0
        if (stream_context)
3011
0
        {
3012
0
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
3013
0
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
3014
0
        }
3015
0
        streams[j] = stream_0;
3016
0
      }
3017
      // If this should be blocking, block it.
3018
0
      if (!stream_context)
3019
0
        for (j = 0; j < parallel_count; j++)
3020
0
          if (streams[j])
3021
0
            ccv_nnc_stream_context_wait(streams[j]);
3022
6
    } else {
3023
6
      ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d];
3024
6
      if (!dest)
3025
0
        continue;
3026
6
      assert(dest);
3027
6
      inputs[0] = outputs[0] = dest;
3028
6
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
3029
6
    }
3030
    // No need to mark this symbol as init'ed; it already is.
3031
6
  }
3032
6
  ccv_array_free(to_parameter_indices);
3033
6
}
3034
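ccv_cnnp_model_parameter_gradients_map above performs the same in-place mapping over gradient tensors, switching to accum_gradients when more than one backward pass has been accumulated. A sketch that clears the stored gradients before the next accumulation round, under the same CMD_SET_FORWARD assumption as the previous example:

  // Hypothetical helper: reset gradients (or accumulated gradients) to 0.
  static void zero_gradients(ccv_cnnp_model_t* const model)
  {
    ccv_cnnp_model_parameter_gradients_map(model, ccv_cnnp_model_parameters(model, -1, -1),
      CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);
  }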
3035
ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model)
3036
2.20k
{
3037
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3038
2.20k
  assert(compiled_data);
3039
2.20k
  return compiled_data->minimize.minimizer;
3040
2.20k
}
3041
3042
void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size)
3043
4.36k
{
3044
4.36k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3045
4.36k
  assert(compiled_data);
3046
4.36k
  const int parameter_size = compiled_data->parameters->rnum;
3047
4.36k
  if (parameter_size == 0)
3048
3
    return;
3049
4.35k
  if (reset)
3050
2.49k
    { assert(set_parameters == 0 && set_parameter_size == 0); }
3051
4.35k
  const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
3052
4.35k
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
3053
4.35k
  if (saved_aux_size > compiled_data->minimize.max_saved_aux_size)
3054
7
    compiled_data->minimize.max_saved_aux_size = saved_aux_size;
3055
4.35k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
3056
  // We update all parameters; at this point, we have one minimizer.
3057
4.35k
  if (set_parameters == 0 || 
set_parameter_size == 0301
)
3058
4.05k
    compiled_data->minimize.minimizer = minimizer;
3059
4.35k
  int i;
3060
4.35k
  if (set_parameters && 
set_parameter_size301
)
3061
301
  {
3062
    // We need to save which minimizer goes with these parameters.
3063
301
    if (!compiled_data->minimize.parameters)
3064
5
      compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0);
3065
301
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t));
3066
301
    set_minimizer_for_parameter->minimizer = minimizer;
3067
301
    set_minimizer_for_parameter->parameter_size = set_parameter_size;
3068
301
    memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size);
3069
301
    ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter);
3070
301
  }
3071
  // If reset is true, clear the parameters array.
3072
4.35k
  if (reset && 
compiled_data->minimize.parameters2.49k
)
3073
291
  {
3074
582
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++291
)
3075
291
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
3076
291
    ccv_array_clear(compiled_data->minimize.parameters);
3077
291
  }
3078
4.35k
  if (!compiled_data->update_nodes)
3079
9
    return;
3080
4.34k
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
3081
4.34k
  assert(symbolic_graph);
3082
4.34k
  if (saved_aux_size > old_max_saved_aux_size)
3083
7
  {
3084
7
    assert(compiled_data->updated_parameters);
3085
    // Reallocate first, move them around later.
3086
7
    compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size);
3087
7
    compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
3088
7
    compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
3089
    // We need to do this from back to front: because saved_aux_size > old_max_saved_aux_size, the old and new regions could overlap.
3090
7
    _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size);
3091
7
  }
3092
4.34k
  int flag = 0;
3093
4.34k
  const int parallel_count = ccv_max(model->parallel_count, 1);
3094
4.34k
  if (set_parameters && 
set_parameter_size296
)
3095
296
  {
3096
296
    ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
3097
592
    for (i = 0; i < set_parameter_size; 
i++296
)
3098
296
    {
3099
296
      const int param_sel = set_parameters[i]->param_sel > 0 ? 
set_parameters[i]->param_sel - 1291
:
set_parameters[i]->param_sel5
;
3100
296
      assert(set_parameters[i]->param_sel != 0);
3101
296
      const int old_rnum = parameter_indices->rnum;
3102
296
      ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices);
3103
296
      const int param_ref = set_parameters[i]->param_ref > 0 ? 
set_parameters[i]->param_ref - 10
: set_parameters[i]->param_ref;
3104
296
      assert(set_parameters[i]->param_ref != 0);
3105
296
      if (param_ref >= 0)
3106
0
      {
3107
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
3108
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
3109
0
        parameter_indices->rnum = old_rnum + 1;
3110
0
      }
3111
296
    }
3112
    // We may have duplicated indices, but that is OK; we will just set them twice.
3113
5.24k
    
for (i = 0; 296
i < parameter_indices->rnum;
i++4.95k
)
3114
4.95k
    {
3115
4.95k
      const int d = *(int*)ccv_array_get(parameter_indices, i);
3116
4.95k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d))
3117
0
        flag = 1;
3118
4.95k
    }
3119
296
    ccv_array_free(parameter_indices);
3120
4.05k
  } else {
3121
19.1k
    for (i = 0; i < parameter_size; 
i++15.0k
)
3122
15.0k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i))
3123
65
        flag = 1;
3124
4.05k
    if (compiled_data->minimize.parameters)
3125
291
      if (_ccv_cnnp_apply_parameters_with_minimizer(model))
3126
0
        flag = 1;
3127
4.05k
  }
3128
4.34k
  if (flag)
3129
7
  {
3130
    // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply-gradients graph.
3131
7
    if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE)
3132
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
3133
7
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3134
7
  }
3135
4.34k
}
3136
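ccv_cnnp_model_set_minimizer above either replaces the default minimizer for every parameter (set_parameters == 0) or records a per-parameter override; when the new minimizer needs more saved auxiliaries than before, the update nodes are rewired and the apply-gradients graph is freed so it can be rebuilt. A hedged sketch of the two call shapes; it assumes a CMD_SGD_FORWARD(nesterov, rate, scale, decay, momentum, dampening) macro and a hypothetical backbone sub-model:

  // Hypothetical helper: model is already compiled; backbone is a sub-model whose parameters get a smaller rate.
  static void configure_minimizers(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const backbone)
  {
    // Reset the default minimizer for every parameter.
    ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.01, 1, 0.0005, 0.9, 0.9), 1, 0, 0);
    // Then record a per-parameter override for the backbone, without resetting.
    const ccv_cnnp_model_io_t backbone_params = ccv_cnnp_model_parameters(backbone, -1, -1);
    ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.001, 1, 0.0005, 0.9, 0.9), 0, &backbone_params, 1);
  }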
3137
void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params)
3138
0
{
3139
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3140
0
  assert(compiled_data);
3141
0
  compiled_data->compile_params = compile_params;
3142
0
}
3143
3144
void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size)
3145
48
{
3146
48
  if (model->graph && 
out_size > 047
)
3147
47
    ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]);
3148
48
  if (model->compiled_data && 
model->compiled_data->graph47
&&
out_size > 116
)
3149
0
    ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]);
3150
48
  if (model->compiled_data && 
model->compiled_data->backward.accum47
&&
out_size > 20
)
3151
0
    ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]);
3152
48
  if (model->compiled_data && 
model->compiled_data->apply_gradients.graph47
&&
out_size > 33
)
3153
0
    ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]);
3154
48
}
3155
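ccv_cnnp_model_dot above writes up to four Graphviz dumps (the symbolic graph, the compiled run graph, the gradient-accumulation graph, and the apply-gradients graph) into the provided FILE* slots, silently skipping whichever graphs do not exist yet. A minimal usage sketch, assuming the CCV_NNC_LONG_DOT_GRAPH flag from the nnc headers:

  // Hypothetical helper: dump the symbolic graph of a compiled model to a Graphviz file.
  static void dump_model_dot(const ccv_cnnp_model_t* const model)
  {
    FILE* out = fopen("model.dot", "w+");
    if (!out)
      return;
    ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, &out, 1); // out_size 1: only the symbolic graph
    fclose(out);
  }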
3156
void ccv_cnnp_model_format(const ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_format_f format_fn, void* const context)
3157
0
{
3158
0
  if (model->graph)
3159
0
    ccv_nnc_symbolic_graph_format(model->graph, 0, 0, 0, 0, format_fn, context);
3160
0
}
3161
3162
static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
3163
2.30k
{
3164
2.30k
  int i;
3165
2.30k
  const int parameter_size = compiled_data->parameters->rnum;
3166
2.30k
  ccv_array_free(compiled_data->parameters);
3167
2.30k
  if (compiled_data->parameter_flags)
3168
10
    ccfree(compiled_data->parameter_flags);
3169
2.30k
  const int internal_size = compiled_data->internals->rnum;
3170
2.30k
  ccv_array_free(compiled_data->internals);
3171
2.30k
  assert(compiled_data->ids.parameters->rnum == parameter_size);
3172
2.30k
  assert(compiled_data->ids.internals->rnum == internal_size);
3173
5.26k
  
for (i = 0; 2.30k
i < parameter_size;
i++2.95k
)
3174
2.95k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i));
3175
2.30k
  ccv_array_free(compiled_data->ids.parameters);
3176
2.46k
  for (i = 0; i < internal_size; 
i++165
)
3177
165
    ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i));
3178
2.30k
  ccv_array_free(compiled_data->ids.internals);
3179
2.30k
  const int parallel_count = ccv_max(model->parallel_count, 1);
3180
2.30k
  if (compiled_data->tensors.parameters)
3181
95
  {
3182
799
    for (i = 0; i < parameter_size * parallel_count; 
i++704
)
3183
      // If it is not marked as borrowed from another model (low bit set), we can free it.
3184
704
      if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1))
3185
700
        if (compiled_data->tensors.parameters[i])
3186
698
          ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
3187
254
    for (i = 0; i < internal_size * parallel_count; 
i++159
)
3188
159
      if (compiled_data->tensors.internals[i])
3189
158
        ccv_nnc_tensor_free(compiled_data->tensors.internals[i]);
3190
95
    ccfree(compiled_data->tensors.parameters);
3191
95
  }
3192
2.30k
  if (compiled_data->tensors.gradients)
3193
32
  {
3194
362
    for (i = 0; i < parameter_size * parallel_count; 
i++330
)
3195
330
    {
3196
330
      if (compiled_data->tensors.gradients[i])
3197
328
        ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]);
3198
330
      if (compiled_data->tensors.accum_gradients[i])
3199
15
        ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]);
3200
330
    }
3201
32
    ccfree(compiled_data->tensors.gradients);
3202
32
  }
3203
2.30k
  if (compiled_data->minimize.parameters)
3204
5
  {
3205
15
    for (i = 0; i < compiled_data->minimize.parameters->rnum; 
i++10
)
3206
10
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
3207
5
    ccv_array_free(compiled_data->minimize.parameters);
3208
5
  }
3209
2.30k
  if (compiled_data->rewindables)
3210
45
    ccv_array_free(compiled_data->rewindables);
3211
2.30k
  if (compiled_data->tensors_init.v)
3212
95
    ccfree(CCV_NNC_INIT_V(compiled_data->tensors_init.v));
3213
2.30k
  if (compiled_data->evaluate.tos)
3214
2.30k
    ccfree(compiled_data->evaluate.tos);
3215
2.30k
  compiled_data->evaluate.tos = 0;
3216
2.30k
  if (compiled_data->stream_map)
3217
4
  {
3218
4
    khiter_t k;
3219
36
    for (k = 
kh_begin4
(compiled_data->stream_map); k != kh_end(compiled_data->stream_map);
++k32
)
3220
32
    {
3221
32
      if (!kh_exist(compiled_data->stream_map, k))
3222
16
        continue;
3223
16
      ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k);
3224
16
      ccv_nnc_stream_context_free(stream);
3225
16
    }
3226
4
    kh_destroy(stream_map, compiled_data->stream_map);
3227
4
  }
3228
2.30k
  _ccv_cnnp_compiled_data_graph_free(compiled_data);
3229
2.30k
  _ccv_cnnp_compiled_data_gradient_free(compiled_data);
3230
2.30k
  _ccv_cnnp_compiled_data_backward_free(compiled_data);
3231
2.30k
  _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
3232
2.30k
  if (compiled_data->gradient_checkpoints)
3233
2
  {
3234
4
    for (i = 0; i < compiled_data->gradient_checkpoints->rnum; 
i++2
)
3235
2
    {
3236
2
      ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(compiled_data->gradient_checkpoints, i);
3237
2
      assert(checkpoint->inputs);
3238
2
      ccfree(checkpoint->inputs);
3239
2
      ccv_array_free(checkpoint->tensor_symbols);
3240
2
    }
3241
2
    ccv_array_free(compiled_data->gradient_checkpoints);
3242
2
  }
3243
2.30k
  ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc);
3244
2.30k
  ccfree(compiled_data);
3245
2.30k
}
3246
3247
void ccv_cnnp_model_free(ccv_cnnp_model_t* const model)
3248
5.44k
{
3249
5.44k
  ccv_cnnp_model_deinit(model);
3250
5.44k
  if (model->isa->dealloc)
3251
1.22k
    model->isa->dealloc(model);
3252
5.44k
  if (model->io)
3253
794
  {
3254
794
    int i;
3255
1.95k
    for (i = 0; i < model->io->rnum; 
i++1.15k
)
3256
1.15k
    {
3257
1.15k
      ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i);
3258
1.15k
      if (model_io->outgoings)
3259
650
        ccv_array_free(model_io->outgoings);
3260
1.15k
      if (model_io->incomings)
3261
591
        ccv_array_free(model_io->incomings);
3262
1.15k
      if (model_io->dependencies)
3263
2
        ccv_array_free(model_io->dependencies);
3264
1.15k
      ccfree(model_io);
3265
1.15k
    }
3266
794
    ccv_array_free(model->io);
3267
794
  }
3268
5.44k
  if (model->parameter_indices)
3269
2.52k
    ccv_array_free(model->parameter_indices);
3270
5.44k
  if (model->inputs)
3271
2.30k
    ccfree(model->inputs);
3272
5.44k
  if (model->graph)
3273
2.30k
    ccv_nnc_symbolic_graph_free(model->graph);
3274
5.44k
  if (model->compiled_data)
3275
2.30k
    _ccv_cnnp_compiled_data_free(model, model->compiled_data);
3276
5.44k
  if (model->name)
3277
216
    ccfree(model->name);
3278
5.44k
  ccfree(model);
3279
5.44k
}
3280
3281
void ccv_cnnp_model_cancel(ccv_cnnp_model_t* const model)
3282
0
{
3283
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
3284
0
  if (!compiled_data)
3285
0
    return;
3286
0
  if (compiled_data->graph)
3287
0
    ccv_nnc_graph_cancel(compiled_data->graph);
3288
0
  if (compiled_data->apply_gradients.graph)
3289
0
    ccv_nnc_graph_cancel(compiled_data->apply_gradients.graph);
3290
0
}
3291
3292
void ccv_cnnp_model_set_flags(ccv_cnnp_model_t* const model, const int flags)
3293
0
{
3294
0
  model->exec_flags = flags;
3295
0
}
3296
3297
int ccv_cnnp_model_flags(ccv_cnnp_model_t* const model)
3298
0
{
3299
0
  return model->exec_flags;
3300
0
}