Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_dynamic_graph_evaluate.c

Line|  Count|Source
   1|       |#include "ccv_nnc.h"
   2|       |#include "ccv_nnc_easy.h"
   3|       |#include "ccv_nnc_internal.h"
   4|       |#include "ccv_nnc_easy.h"
   5|       |#include "ccv_internal.h"
   6|       |#include "_ccv_nnc_dynamic_graph.h"
   7|       |#include "_ccv_cnnp_model.h"
   8|       |
   9|       |// MARK - Level-5.5 API
  10|       |
  11|       |static int _ccv_cnnp_model_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
  12|  4.81k|{
  13|  4.81k|  ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)cmd.data;
  14|  4.81k|  ccv_cnnp_model_t* const model = (ccv_cnnp_model_t*)stateful_exec->data;
  15|       |  // I cannot just use stream context, it cannot synchronize correctly based on existing coroutine implementation.
  16|  4.81k|  int i;
  17|  4.81k|  int wait_for_any_neighbor = 0;
  18|  4.81k|  const int parallel_count = ccv_max(model->parallel_count, 1);
  19|  4.81k|  if (stream_context) // Find all neighbor context and wait on them all.
  20|    804|    for (i = 0; i < parallel_count; i++)
  21|    408|    {
  22|    408|      ccv_nnc_stream_context_t* const neighbor_context = ccv_nnc_stream_context_find_neighbor(stream_context, i);
  23|    408|      if (neighbor_context && neighbor_context != stream_context)
  24|      6|      {
  25|      6|        ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(neighbor_context);
  26|      6|        ccv_nnc_stream_context_wait_signal(stream_context, signal);
  27|      6|        wait_for_any_neighbor = 1;
  28|      6|      }
  29|    408|    }
  30|  4.81k|  co_scheduler_t* old_scheduler;
  31|  4.81k|  co_routine_t* old_main;
  32|  4.81k|  if (stream_context)
  33|    396|  {
  34|    396|    old_main = stream_context->main;
  35|    396|    old_scheduler = stream_context->scheduler;
  36|       |    // We cannot piggyback on old scheduler.
  37|    396|    stream_context->scheduler = 0;
  38|       |    // We will have a new main coroutine when schedule as the root.
  39|       |    // Otherwise it will be scheduled after the existing routines all scheduled
  40|       |    // out, and that won't be right.
  41|    396|    stream_context->main = 0;
  42|    396|  }
  43|  4.81k|  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD)
  44|  2.41k|  {
  45|  2.41k|    ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
  46|  2.41k|      .requires_grad = stateful_exec->requires_grad,
  47|  2.41k|      .disable_outgrad = stateful_exec->disable_outgrad,
  48|  2.41k|      .is_test = stateful_exec->is_test,
  49|  2.41k|    }, inputs, input_size, outputs, output_size, 0, stream_context);
  50|  2.41k|  } else {
  51|  2.40k|    const int ingrad_size = model->output_size * parallel_count;
  52|  2.40k|    assert(ingrad_size <= input_size);
  53|  2.40k|    if (stateful_exec->disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
  54|  2.39k|      ccv_cnnp_model_backward(model, inputs, ingrad_size, outputs, output_size, 0, stream_context);
  55|      4|    else if (stateful_exec->disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
  56|      0|      ccv_cnnp_model_backward(model, inputs, ingrad_size, 0, 0, 0, stream_context);
  57|      4|    else {
  58|      4|      assert(output_size == model->input_size * parallel_count);
  59|      4|      int per_outgrad_size = 0;
  60|      4|      int i, j, k;
  61|     14|      for (i = 0; i < model->input_size; i++)
  62|     10|        if (!(stateful_exec->disable_outgrad & ((uint64_t)1 << i)))
  63|      4|          ++per_outgrad_size;
  64|      4|      assert(per_outgrad_size > 0);
  65|      4|      const int outgrad_size = per_outgrad_size * parallel_count;
  66|      4|      ccv_nnc_tensor_t* outgrads[outgrad_size];
  67|     14|      for (i = 0; i < parallel_count; i++)
  68|     32|        for (k = 0, j = 0; j < model->input_size; j++)
  69|     22|          if (!(stateful_exec->disable_outgrad & ((uint64_t)1 << j)))
  70|     10|            outgrads[(k++) + i * per_outgrad_size] = outputs[j + i * model->input_size];
  71|      4|      ccv_cnnp_model_backward(model, inputs, ingrad_size, outgrads, outgrad_size, 0, stream_context);
  72|      4|    }
  73|  2.40k|    stateful_exec->did_backward_but_not_apply_gradients = 1;
  74|  2.40k|  }
  75|  4.81k|  if (stream_context)
  76|    396|  {
  77|       |    // Should have new scheduler created.
  78|    396|    assert(stream_context->scheduler);
  79|       |    // The new scheduler shouldn't be active (everything is scheduled).
  80|    396|    assert(!co_scheduler_is_active(stream_context->scheduler));
  81|    396|    co_scheduler_free(stream_context->scheduler);
  82|       |    // Switch back to the old scheduler.
  83|    396|    stream_context->scheduler = old_scheduler;
  84|       |    // The main coroutine should be cleared.
  85|    396|    assert(!stream_context->main);
  86|    396|    stream_context->main = old_main;
  87|    396|  }
  88|  4.81k|  if (wait_for_any_neighbor) // Find all neighbor context and wait on them all.
  89|      2|  {
  90|      2|    assert(stream_context);
  91|      2|    ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
  92|     10|    for (i = 0; i < parallel_count; i++)
  93|      8|    {
  94|      8|      ccv_nnc_stream_context_t* const neighbor_context = ccv_nnc_stream_context_find_neighbor(stream_context, i);
  95|      8|      if (neighbor_context && neighbor_context != stream_context)
  96|      6|        ccv_nnc_stream_context_wait_signal(neighbor_context, signal);
  97|      8|    }
  98|      2|  }
  99|  4.81k|  return CCV_NNC_EXEC_SUCCESS;
 100|  4.81k|}
 101|       |
 102|       |static void _ccv_cnnp_model_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
 103|  4.41k|{
 104|  4.41k|  ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)cmd.data;
 105|  4.41k|  ccv_cnnp_model_t* const model = (ccv_cnnp_model_t*)stateful_exec->data;
 106|  4.41k|  const int parallel_count = ccv_max(model->parallel_count, 1);
 107|  4.41k|  const int per_input_size = input_size / parallel_count;
 108|  4.41k|  assert(per_input_size > 0);
 109|  4.41k|  assert((input_size % parallel_count) == 0);
 110|  4.41k|  const int per_output_size = output_size / parallel_count;
 111|  4.41k|  assert(per_output_size > 0);
 112|  4.41k|  assert((output_size % parallel_count) == 0);
 113|  4.41k|  int i, j;
 114|  8.84k|  for (i = 0; i < parallel_count; i++)
 115|  4.43k|  {
 116|  4.43k|    ccv_cnnp_model_tensor_auto(model, outputs + i * per_output_size, per_output_size);
 117|       |    // Set device id to the corresponding inputs' device id.
 118|  4.43k|    const int device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i * per_input_size].type);
 119|  8.86k|    for (j = 0; j < per_output_size; j++)
 120|  4.43k|      CCV_TENSOR_SET_DEVICE_ID(outputs[i * per_output_size + j].type, device_id);
 121|  4.43k|  }
 122|  4.41k|}
 123|       |
 124|       |static void _ccv_cnnp_model_apply_gradients(const ccv_nnc_cmd_t cmd, ccv_nnc_stream_context_t* const stream_context)
 125|  2.40k|{
 126|  2.40k|  ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)cmd.data;
 127|  2.40k|  ccv_cnnp_model_t* const model = (ccv_cnnp_model_t*)stateful_exec->data;
 128|  2.40k|  ccv_cnnp_model_apply_gradients(model, stream_context);
 129|  2.40k|}
 130|       |
 131|       |static ccv_nnc_stateful_cmd_vtab_t ccv_cnnp_model_exec_isa = {
 132|       |  .super = {
 133|       |    .exec = _ccv_cnnp_model_exec,
 134|       |    .tensor_auto = _ccv_cnnp_model_tensor_auto,
 135|       |  },
 136|       |  .apply_gradients = _ccv_cnnp_model_apply_gradients,
 137|       |};
 138|       |
 139|       |void ccv_nnc_dynamic_graph_dry_run(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context)
 140|      0|{
 141|      0|  assert(input_size > 0);
 142|      0|  const int parallel_count = ccv_max(model->parallel_count, 1);
 143|      0|  const int per_input_size = input_size / parallel_count;
 144|      0|  assert(per_input_size > 0);
 145|      0|  assert((input_size % parallel_count) == 0);
 146|      0|  int i, j;
 147|      0|  if (!model->graph)
 148|      0|  {
 149|      0|    ccv_nnc_tensor_param_t input_params[per_input_size];
 150|      0|    for (i = 0; i < per_input_size; i++)
 151|      0|      input_params[i] = inputs[i]->info;
 152|      0|    ccv_cnnp_model_compile(model, input_params, per_input_size, CMD_NOOP(), CMD_NOOP());
 153|      0|  } else {
 154|      0|    assert(per_input_size == model->input_size);
 155|      0|    ccv_nnc_tensor_param_t input_params[per_input_size];
 156|      0|    int flag = 0;
 157|      0|    for (i = 0; i < per_input_size; i++)
 158|      0|    {
 159|      0|      input_params[i] = inputs[i]->info;
 160|      0|      const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(model->graph, model->inputs[i]);
 161|       |      // If these two parameters doesn't match, recompile the graph..
 162|      0|      if (memcmp(&params, &input_params[i], sizeof(params)) != 0)
 163|      0|        flag = 1;
 164|      0|    }
 165|      0|    if (flag) // Recompile the graph.
 166|      0|      ccv_cnnp_model_compile(model, input_params, per_input_size, ccv_cnnp_model_minimizer(model), CMD_NOOP());
 167|      0|  }
 168|      0|  ccv_nnc_tensor_t* input_tensors[input_size];
 169|      0|  for (i = 0; i < input_size; i++)
 170|      0|  {
 171|       |    // Cannot have the parameter be a partial tensor view for model evaluation.
 172|      0|    input_tensors[i] = inputs[i] ? ccv_nnc_tensor_from_variable(dynamic_graph, inputs[i], stream_context) : 0;
 173|      0|    if (input_tensors[i])
 174|      0|      { assert(CCV_IS_TENSOR_CONTIGUOUS(input_tensors[i])); }
 175|      0|  }
 176|      0|  const int per_output_size = ccv_cnnp_model_output_size(model);
 177|      0|  ccv_nnc_tensor_param_t output_params[ccv_max(1, per_output_size)];
 178|      0|  const int output_size = per_output_size * parallel_count;
 179|      0|  ccv_nnc_tensor_variable_t outputs[output_size];
 180|      0|  ccv_nnc_tensor_t* output_tensors[output_size];
 181|      0|  for (i = 0; i < parallel_count; i++)
 182|      0|  {
 183|      0|    for (j = 0; j < per_output_size; j++)
 184|      0|      output_params[j] = ccv_nnc_tensor_auto;
 185|      0|    ccv_cnnp_model_tensor_auto(model, output_params, per_output_size);
 186|      0|    for (j = 0; j < per_output_size; j++)
 187|      0|      if (!ccv_nnc_is_tensor_auto(output_params[j]))
 188|      0|      {
 189|      0|        outputs[i * per_output_size + j] = ccv_nnc_tensor_variable_new(dynamic_graph, output_params[j]);
 190|      0|        output_tensors[i * per_output_size + j] = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i * per_output_size + j], stream_context);
 191|      0|      } else {
 192|      0|        outputs[i * per_output_size + j] = 0;
 193|      0|        output_tensors[i * per_output_size + j] = 0;
 194|      0|      }
 195|      0|  }
 196|      0|  if (dynamic_graph->no_grad)
 197|      0|  {
 198|      0|    ccv_cnnp_model_dry_run(model, (ccv_cnnp_evaluate_param_t){
 199|      0|      .requires_grad = 0,
 200|      0|      .disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL,
 201|      0|      .is_test = is_test,
 202|      0|    }, input_tensors, input_size, output_tensors, output_size);
 203|      0|  } else {
 204|      0|    uint64_t disable_outgrad = 0;
 205|      0|    int count = 0;
 206|      0|    for (i = 0; i < per_input_size; i++)
 207|      0|      if (!inputs[i] || inputs[i]->type == CCV_NNC_TENSOR_CONSTANT)
 208|      0|      {
 209|      0|        disable_outgrad |= ((uint64_t)1 << i);
 210|      0|        ++count;
 211|      0|      }
 212|      0|    if (count == per_input_size)
 213|      0|      disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL;
 214|      0|    ccv_cnnp_model_dry_run(model, (ccv_cnnp_evaluate_param_t){
 215|      0|      .requires_grad = 1,
 216|      0|      .disable_outgrad = disable_outgrad,
 217|      0|      .is_test = is_test,
 218|      0|    }, input_tensors, input_size, output_tensors, output_size);
 219|      0|  }
 220|       |  // Free the allocated variables.
 221|      0|  for (i = 0; i < output_size; i++)
 222|      0|    if (outputs[i])
 223|      0|      ccv_nnc_tensor_variable_free(dynamic_graph, outputs[i]);
 224|      0|}
 225|       |
 226|       |void ccv_nnc_dynamic_graph_evaluate(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
 227|  2.41k|{
 228|  2.41k|  ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, (ccv_nnc_cmd_vtab_t*)&ccv_cnnp_model_exec_isa, (ccv_nnc_cmd_param_t){}, 0);
 229|  2.41k|  assert(input_size > 0);
 230|  2.41k|  const int parallel_count = ccv_max(model->parallel_count, 1);
 231|  2.41k|  const int per_input_size = input_size / parallel_count;
 232|  2.41k|  assert(per_input_size > 0);
 233|  2.41k|  assert((input_size % parallel_count) == 0);
 234|  2.41k|  int i;
 235|  2.41k|  if (!model->graph)
 236|      9|  {
 237|      9|    ccv_nnc_tensor_param_t input_params[per_input_size];
 238|     24|    for (i = 0; i < per_input_size; i++)
 239|     15|      input_params[i] = inputs[i]->info;
 240|      9|    ccv_cnnp_model_compile(model, input_params, per_input_size, CMD_NOOP(), CMD_NOOP());
 241|  2.40k|  } else {
 242|  2.40k|    assert(per_input_size == model->input_size);
 243|  2.40k|    ccv_nnc_tensor_param_t input_params[per_input_size];
 244|  2.40k|    int flag = 0;
 245|  4.81k|    for (i = 0; i < per_input_size; i++)
 246|  2.41k|    {
 247|  2.41k|      input_params[i] = inputs[i]->info;
 248|  2.41k|      const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(model->graph, model->inputs[i]);
 249|       |      // If these two parameters doesn't match, recompile the graph..
 250|  2.41k|      if (memcmp(&params, &input_params[i], sizeof(params)) != 0)
 251|  2.20k|        flag = 1;
 252|  2.41k|    }
 253|  2.40k|    if (flag) // Recompile the graph.
 254|  2.20k|      ccv_cnnp_model_compile(model, input_params, per_input_size, ccv_cnnp_model_minimizer(model), CMD_NOOP());
 255|  2.40k|  }
 256|  4.86k|  for (i = 0; i < input_size; i++)
 257|  2.45k|  {
 258|       |    // Cannot have the parameter be a partial tensor view for model evaluation.
 259|  2.45k|    ccv_nnc_tensor_t* const tensor = inputs[i] ? ccv_nnc_tensor_from_variable(dynamic_graph, inputs[i], stream_context) : 0;
 260|  2.45k|    if (tensor)
 261|  2.45k|      { assert(CCV_IS_TENSOR_CONTIGUOUS(tensor)); }
 262|  2.45k|  }
 263|  2.41k|  if (dynamic_graph->no_grad)
 264|      4|  {
 265|      4|    ccv_nnc_stateful_exec_t stateful_exec = {
 266|      4|      .requires_grad = 0,
 267|      4|      .is_test = is_test,
 268|      4|      .disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL,
 269|      4|      .tensor_tape = tensor_tape,
 270|      4|      .data = model
 271|      4|    };
 272|      4|    cmd.data = &stateful_exec;
 273|       |    // Parallel parameter doesn't make sense here, the parallel is defined inside the model.
 274|      4|    ccv_nnc_dynamic_graph_exec_ret(dynamic_graph, cmd, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, 0, stream_context, 0);
 275|  2.40k|  } else {
 276|  2.40k|    uint64_t disable_outgrad = 0;
 277|  2.40k|    int count = 0;
 278|  4.82k|    for (i = 0; i < per_input_size; i++)
 279|  2.41k|      if (!inputs[i] || inputs[i]->type == CCV_NNC_TENSOR_CONSTANT)
 280|      7|      {
 281|      7|        disable_outgrad |= ((uint64_t)1 << i);
 282|      7|        ++count;
 283|      7|      }
 284|  2.40k|    if (count == per_input_size)
 285|      1|      disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL;
 286|  2.40k|    ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)ccmalloc(sizeof(ccv_nnc_stateful_exec_t));
 287|  2.40k|    cmd.data = stateful_exec;
 288|  2.40k|    stateful_exec->requires_grad = 1;
 289|  2.40k|    stateful_exec->is_test = is_test;
 290|  2.40k|    stateful_exec->did_backward_but_not_apply_gradients = 0;
 291|  2.40k|    stateful_exec->should_free = 0;
 292|  2.40k|    stateful_exec->disable_outgrad = disable_outgrad;
 293|  2.40k|    stateful_exec->tensor_tape = tensor_tape;
 294|  2.40k|    stateful_exec->data = model;
 295|  2.40k|    stateful_exec->cmd = cmd;
 296|  2.40k|    ccv_nnc_graph_exec_symbol_t symbol = {};
 297|  2.40k|    ccv_nnc_dynamic_graph_exec_ret(dynamic_graph, cmd, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, 0, stream_context, &symbol);
 298|  2.40k|    if (!symbol.graph) // This is because inputs are all constants.
 299|      1|      ccfree(stateful_exec); // No one records it, there is no cmd.data refer to it.
 300|  2.40k|    else {
 301|  2.40k|      if (!dynamic_graph->stateful_execs)
 302|     12|      {
 303|     12|        dynamic_graph->stateful_execs = ccv_array_new(sizeof(ccv_nnc_stateful_exec_t*), 1, 0);
 304|     12|        ccv_array_push(dynamic_graph->stateful_execs, &stateful_exec);
 305|     12|        stateful_exec->index = dynamic_graph->stateful_execs->rnum - 1;
 306|  2.39k|      } else {
 307|  2.39k|        if (dynamic_graph->reuse_stateful_exec >= 0)
 308|  2.38k|        {
 309|  2.38k|          *(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, dynamic_graph->reuse_stateful_exec) = stateful_exec;
 310|  2.38k|          stateful_exec->index = dynamic_graph->reuse_stateful_exec;
 311|  2.38k|          int flag = 0;
 312|  4.03k|          for (i = dynamic_graph->reuse_stateful_exec + 1; !flag && i < dynamic_graph->stateful_execs->rnum; i++)
 313|  1.64k|            if (*(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, i) == 0)
 314|  1.64k|              dynamic_graph->reuse_stateful_exec = i, flag = 1;
 315|  2.38k|          if (!flag) // Reset to 1.
 316|    743|            dynamic_graph->reuse_stateful_exec = -1;
 317|  2.38k|        } else {
 318|       |          // Push new, no reuse available.
 319|      9|          ccv_array_push(dynamic_graph->stateful_execs, &stateful_exec);
 320|      9|          stateful_exec->index = dynamic_graph->stateful_execs->rnum - 1;
 321|      9|        }
 322|  2.39k|      }
 323|  2.40k|    }
 324|  2.40k|  }
 325|  2.41k|}
 326|       |
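
The only uncovered code in this file is line 56 and the whole of ccv_nnc_dynamic_graph_dry_run (lines 140-224, all zero counts); ccv_nnc_dynamic_graph_evaluate and the stateful-exec helpers each run a few thousand times. The sketch below shows the kind of caller that drives the covered evaluate path, with the dry-run call that would close the gap left commented out. It is a minimal sketch written against the signatures visible in this listing; the ccv_cnnp_dense() builder arguments, the CPU_TENSOR_NHWC() and TENSOR_VARIABLE_LIST() convenience macros, and the include paths are assumptions that can differ between ccv versions.

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

int main(void)
{
  ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new();
  // Assumed builder and signature: a single dense layer with one output unit.
  ccv_cnnp_model_t* const model = ccv_cnnp_dense(1, 0, 0, "dense");
  // One input variable of shape (1, 4); writing into it materializes the backing tensor.
  ccv_nnc_tensor_variable_t const x = ccv_nnc_tensor_variable_new(graph, CPU_TENSOR_NHWC(32F, 1, 4));
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_variable(graph, x, 0);
  int i;
  for (i = 0; i < 4; i++)
    x_tensor->data.f32[i] = i;
  // Output variable left auto-shaped; _ccv_cnnp_model_tensor_auto fills in its parameters.
  ccv_nnc_tensor_variable_t const y = ccv_nnc_tensor_variable_new(graph);
  // Covered path: compiles the model lazily on first use (listing lines 235-240), then
  // records a stateful CCV_NNC_CUSTOM_FORWARD exec that dispatches into _ccv_cnnp_model_exec.
  ccv_nnc_dynamic_graph_evaluate(graph, model, 1 /* is_test */, TENSOR_VARIABLE_LIST(x), TENSOR_VARIABLE_LIST(y), 0, 0);
  // Uncovered path: the same lazy compile and shape inference without executing the model.
  // ccv_nnc_dynamic_graph_dry_run(graph, model, 1, TENSOR_VARIABLE_LIST(x), 0);
  ccv_nnc_tensor_variable_free(graph, y);
  ccv_nnc_tensor_variable_free(graph, x);
  ccv_cnnp_model_free(model);
  ccv_nnc_dynamic_graph_free(graph);
  return 0;
}

Per the listing, a no_grad dynamic graph (lines 263-274) keeps the stateful exec on the stack, so nothing is retained for a later backward pass; with gradients enabled the exec is heap-allocated and tracked in dynamic_graph->stateful_execs with slot reuse (lines 286-323).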