Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_dynamic_graph_apply_gradients.c
Note: bracketed values such as [i++: 2.40k] are execution counts for sub-line regions (loop increments, ternary branches, short-circuited operands). First uncovered line: 153.

Line   Count  Source
   1          #include "ccv_nnc.h"
   2          #include "ccv_nnc_easy.h"
   3          #include "ccv_nnc_internal.h"
   4          #include "ccv_nnc_easy.h"
   5          #include "ccv_internal.h"
   6          #include "_ccv_nnc_dynamic_graph.h"
   7
   8          // MARK - Level-4.5 API
   9
  10          void ccv_nnc_dynamic_graph_apply_gradients(ccv_nnc_dynamic_graph_t* const dynamic_graph, const ccv_nnc_cmd_t minimizer, const ccv_nnc_tensor_variable_t* const gradients, const int gradient_size, ccv_nnc_tensor_variable_t* const parameters, const int parameter_size, ccv_nnc_tensor_variable_t* const saved_aux, const int parallel, ccv_nnc_stream_context_t* const stream_context)
  11   1.75k  {
  12   1.75k    assert(gradient_size == parameter_size);
  13   1.75k    assert(!dynamic_graph->no_grad);
  14            // Call apply gradients to stateful execs first.
  15   1.75k    int i, j;
  16   1.75k    if (dynamic_graph->stateful_execs)
  17     753    {
  18   3.15k      for (i = 0; i < dynamic_graph->stateful_execs->rnum; i++)  [i++: 2.40k]
  19   2.40k      {
  20   2.40k        ccv_nnc_stateful_exec_t* const stateful_exec = *(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, i);
  21                // We only apply gradients when the backward round is done.
  22   2.40k        if (stateful_exec && stateful_exec->did_backward_but_not_apply_gradients)
  23   2.40k        {
  24   2.40k          const ccv_nnc_stateful_cmd_vtab_t* const isa = (ccv_nnc_stateful_cmd_vtab_t*)stateful_exec->cmd.isa;
  25   2.40k          if (isa->apply_gradients)
  26   2.40k            isa->apply_gradients(stateful_exec->cmd, stream_context);
  27   2.40k          stateful_exec->did_backward_but_not_apply_gradients = 0;
  28   2.40k          if (stateful_exec->should_free)
  29   2.10k          {
  30   2.10k            ccfree(stateful_exec);
  31   2.10k            *(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, i) = 0;
  32   2.10k            if (i < dynamic_graph->reuse_stateful_exec || dynamic_graph->reuse_stateful_exec < 0)
  33     450              dynamic_graph->reuse_stateful_exec = i;
  34   2.10k          }
  35   2.40k        }
  36   2.40k      }
  37     753    }
  38   1.75k    if (parameter_size == 0)
  39     349      return;
  40   1.40k    const int aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
  41   1.40k    const int saved_aux_size = parameter_size * aux_size;
  42   1.40k    ccv_nnc_tensor_symbol_t update_inputs[aux_size + 2];
  43   1.40k    ccv_nnc_tensor_symbol_t update_outputs[aux_size + 1];
  44   1.40k    int freeable_size = 0;
  45   1.40k    ccv_nnc_graph_exec_symbol_t sources[parameter_size];
  46   1.40k    ccv_nnc_graph_exec_symbol_t minimizes[parameter_size];
  47   1.40k    ccv_nnc_tensor_variable_t freeables[parameter_size + saved_aux_size];
  48   1.40k    ccv_array_t* const symbol_stack = ccv_array_new(sizeof(ccv_nnc_tape_symbol_t), 1, 0);
  49   1.40k    ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol, symbol_stack, 0);
  50   1.40k    ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol_alias, symbol_stack, 0);
  51   1.40k    ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_graph_exec_symbol, symbol_stack, 0);
  52   1.40k    ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), parameter_size * 3 + saved_aux_size * 2, 0);
  53   1.40k    const int parallel_count = ccv_max(parallel, 1);
  54   1.40k    const int per_parameter_size = parameter_size / parallel_count;
  55   1.40k    assert((parameter_size % parallel_count) == 0);
  56   1.40k    ccv_nnc_tensor_symbol_t* const allreduce_inputs = parallel_count > 1 ? (ccv_nnc_tensor_symbol_t*)alloca(sizeof(ccv_nnc_tensor_symbol_t) * parallel_count * 2 + sizeof(ccv_nnc_graph_exec_symbol_t) * per_parameter_size) : 0;  [true: 4, false: 1.40k]
  57   1.40k    ccv_nnc_tensor_symbol_t* const allreduce_outputs = allreduce_inputs ? allreduce_inputs + parallel_count : 0;  [true: 4, false: 1.40k]
  58   1.40k    ccv_nnc_graph_exec_symbol_t* const allreduces = allreduce_outputs ? (ccv_nnc_graph_exec_symbol_t*)(allreduce_outputs + parallel_count) : 0;  [true: 4, false: 1.40k]
  59   1.40k    if (parallel_count > 1) // Doing allreduce first.
  60       4    {
  61      10      for (i = 0; i < per_parameter_size; i++)  [i++: 6]
  62       6      {
  63      26        for (j = 0; j < parallel_count; j++)  [j++: 20]
  64      20        {
  65      20          const int idx = i + j * per_parameter_size;
  66      20          assert(parameters[idx]->symbol.d >= 0);
  67      20          const ccv_nnc_tensor_param_t info = parameters[idx]->info;
  68      20          const ccv_nnc_tensor_symbol_t gradient = allreduce_inputs[j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
  69      20          const ccv_nnc_tensor_bind_t bind = {
  70      20            .symbol = gradient,
  71      20            .tensor = ccv_nnc_tensor_from_variable(dynamic_graph, gradients[idx], stream_context)
  72      20          };
  73      20          ccv_array_push(tensor_binds, &bind);
  74      20          allreduce_outputs[j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
  75      20        }
  76       6        allreduces[i] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_COMM_ALLREDUCE_FORWARD(), allreduce_inputs, parallel_count, allreduce_outputs, parallel_count, 0);
  77      26        for (j = 0; j < parallel_count; j++)  [j++: 20]
  78      20        {
  79      20          const int idx = i + j * per_parameter_size;
  80      20          assert(parameters[idx]->symbol.d >= 0);
  81      20          const ccv_nnc_tensor_param_t info = parameters[idx]->info;
  82      20          update_inputs[0] = allreduce_outputs[j];
  83      20          update_inputs[1] = parameters[idx]->symbol;
  84      20          ccv_nnc_tensor_bind_t bind = {
  85      20            .symbol = parameters[idx]->symbol,
  86      20            .tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[idx], stream_context)
  87      20          };
  88      20          ccv_array_push(tensor_binds, &bind);
  89      20          freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, parameters[idx]);
  90      20          bind.symbol = update_outputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
  91      20          bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[idx], stream_context);
  92      20          ccv_array_push(tensor_binds, &bind);
  93      20          int k;
  94      20          ccv_nnc_tensor_symbol_t set_zeros[aux_size];
  95      20          int set_zero_size = 0;
  96      52          for (k = 0; k < aux_size; k++)  [k++: 32]
  97      32            update_inputs[2 + k] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
  98      52          for (k = 0; k < aux_size; k++)  [k++: 32]
  99      32            if (!ccv_nnc_tensor_variable_contains_value(saved_aux[idx * aux_size + k])) // Need to 0 init the saved aux in this case.
 100      32            {
 101      32              if (ccv_nnc_is_tensor_auto(saved_aux[idx * aux_size + k]->info))
 102      16                saved_aux[idx * aux_size + k]->info = info;
 103      32              set_zeros[set_zero_size++] = update_inputs[2 + k];
 104      32            }
 105      52          for (k = 0; k < aux_size; k++)  [k++: 32]
 106      32          {
 107      32            bind.symbol = update_inputs[2 + k];
 108      32            bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[idx * aux_size + k], stream_context);
 109      32            ccv_array_push(tensor_binds, &bind);
 110      32            freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, saved_aux[idx * aux_size + k]);
 111      32            bind.symbol = update_outputs[1 + k] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
 112      32            bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[idx * aux_size + k], stream_context);
 113      32            ccv_array_push(tensor_binds, &bind);
 114      32          }
 115      20          sources[idx] = minimizes[idx] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, minimizer, update_inputs, aux_size + 2, update_outputs, aux_size + 1, 0);
 116      20          if (set_zero_size > 0)
 117      16          {
 118      16            const ccv_nnc_graph_exec_symbol_t set_zero = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(0), 0, 0, set_zeros, set_zero_size, 0);
 119      16            ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_zero, minimizes[idx]);
 120      16            ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, allreduces[i], set_zero);
 121      16            sources[idx] = set_zero;
 122      16          } else
 123       4            ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, allreduces[i], minimizes[idx]);
 124      20        }
 125       6      }
 126   1.40k    } else {
 127   2.80k      for (i = 0; i < per_parameter_size; i++)  [i++: 1.40k]
 128   1.40k      {
 129   1.40k        assert(parameters[i]->symbol.d >= 0);
 130   1.40k        const ccv_nnc_tensor_param_t info = parameters[i]->info;
 131   1.40k        const ccv_nnc_tensor_symbol_t gradient = update_inputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
 132   1.40k        ccv_nnc_tensor_bind_t bind = {
 133   1.40k          .symbol = gradient,
 134   1.40k          .tensor = ccv_nnc_tensor_from_variable(dynamic_graph, gradients[i], stream_context)
 135   1.40k        };
 136   1.40k        ccv_array_push(tensor_binds, &bind);
 137   1.40k        update_inputs[1] = parameters[i]->symbol;
 138   1.40k        bind.symbol = parameters[i]->symbol;
 139   1.40k        bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[i], stream_context);
 140   1.40k        ccv_array_push(tensor_binds, &bind);
 141   1.40k        freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, parameters[i]);
 142   1.40k        bind.symbol = update_outputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
 143   1.40k        bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[i], stream_context);
 144   1.40k        ccv_array_push(tensor_binds, &bind);
 145   1.40k        ccv_nnc_tensor_symbol_t set_zeros[aux_size];
 146   1.40k        int set_zero_size = 0;
 147   2.80k        for (j = 0; j < aux_size; j++)  [j++: 1.40k]
 148   1.40k          update_inputs[2 + j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
 149   2.80k        for (j = 0; j < aux_size; j++)  [j++: 1.40k]
 150   1.40k          if (!ccv_nnc_tensor_variable_contains_value(saved_aux[i * aux_size + j])) // Need to 0 init the saved aux in this case.
 151       4          {
 152       4            if (ccv_nnc_is_tensor_auto(saved_aux[i * aux_size + j]->info))
 153       0              saved_aux[i * aux_size + j]->info = info;
 154       4            set_zeros[set_zero_size++] = update_inputs[2 + j];
 155       4          }
 156   2.80k        for (j = 0; j < aux_size; j++)  [j++: 1.40k]
 157   1.40k        {
 158   1.40k          bind.symbol = update_inputs[2 + j];
 159   1.40k          bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[i * aux_size + j], stream_context);
 160   1.40k          ccv_array_push(tensor_binds, &bind);
 161   1.40k          freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, saved_aux[i * aux_size + j]);
 162   1.40k          bind.symbol = update_outputs[1 + j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
 163   1.40k          bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[i * aux_size + j], stream_context);
 164   1.40k          ccv_array_push(tensor_binds, &bind);
 165   1.40k        }
 166   1.40k        sources[i] = minimizes[i] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, minimizer, update_inputs, aux_size + 2, update_outputs, aux_size + 1, 0);
 167   1.40k        if (set_zero_size > 0)
 168       3        {
 169       3          const ccv_nnc_graph_exec_symbol_t set_zero = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(0), 0, 0, set_zeros, set_zero_size, 0);
 170       3          ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_zero, minimizes[i]);
 171       3          sources[i] = set_zero;
 172       3        }
 173   1.40k      }
 174   1.40k    }
 175   1.40k    ccv_nnc_dy_xpu_alloc_t xpu_alloc = {
 176   1.40k      .xpu_alloc = &dynamic_graph->xpu_alloc,
 177   1.40k      .stream = stream_context
 178   1.40k    };
 179   1.40k    ccv_nnc_symbolic_graph_compile_param_t compile_params = {
 180   1.40k      .allocator = {
 181   1.40k        .isa = &ccv_nnc_dy_allocator_isa,
 182   1.40k        .context = {
 183   1.40k          .alloc = &xpu_alloc,
 184   1.40k          .free = &dynamic_graph->xpu_alloc,
 185   1.40k        }
 186   1.40k      }
 187   1.40k    };
 188   1.40k    ccv_nnc_graph_t* graph = 0;
 189   1.40k    ccv_nnc_tensor_arena_t* tensor_arena = 0;
 190   1.40k    ccv_nnc_graph_exec_arena_t* exec_arena = 0;
 191   1.40k    ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
 192   1.40k      (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
 193   1.40k      0, 0,
 194   1.40k      parallel_count > 1 ? allreduces : sources, parallel_count > 1 ? per_parameter_size : parameter_size,  [allreduces: 4, sources: 1.40k; per_parameter_size: 4, parameter_size: 1.40k]
 195   1.40k      minimizes, parameter_size,
 196   1.40k      &graph, &tensor_arena, &exec_arena);
 197   1.40k    ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
 198   1.40k    ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, 0, 0, 0);
 199   1.40k    ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
 200   1.40k    ccv_array_free(tensor_binds);
 201            // Remove newly added symbols to restore the graph.
 202   8.58k    for (i = 0; i < symbol_stack->rnum; i++)  [i++: 7.18k]
 203   7.18k    {
 204   7.18k      const ccv_nnc_tape_symbol_t* const symbol = (ccv_nnc_tape_symbol_t*)ccv_array_get(symbol_stack, i);
 205   7.18k      if (symbol->type == CCV_NNC_SYMBOL_TENSOR || symbol->type == CCV_NNC_SYMBOL_TENSOR_ALIAS)  [second operand: 1.44k]
 206   5.73k        ccv_nnc_tensor_symbol_free(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
 207   5.73k          .d = symbol->d,
 208   5.73k          .graph = dynamic_graph->tape
 209   5.73k        });
 210   1.44k      else if (symbol->type == CCV_NNC_SYMBOL_GRAPH_EXEC)
 211   1.44k        ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
 212   1.44k          .d = symbol->d,
 213   1.44k          .graph = dynamic_graph->tape
 214   1.44k        });
 215   7.18k    }
 216   1.40k    ccv_array_free(symbol_stack);
 217   1.40k    if (stream_context)
 218       3    {
 219       3      ccv_nnc_graph_set_default_static_schedule(graph, ccv_nnc_stream_context_type(stream_context), dynamic_graph->max_stream_count);
 220       3      ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
 221       3      ccv_nnc_tensor_arena_buffer_free(tensor_arena);
 222       3      ccv_nnc_compilation_artifact_t* const artifact = ccv_nnc_compilation_artifact_new(graph, tensor_arena, exec_arena);
 223       3      ccv_nnc_stream_context_add_callback(stream_context, (ccv_nnc_callback_f)ccv_nnc_compilation_artifact_free, artifact);
 224   1.40k    } else {
 225   1.40k      if (parallel_count > 1)
 226       1      { // We need to schedule it; now figure out which stream type we are on.
 227       1        int flag = 0;
 228       2        for (i = 0; !flag && i < parameter_size; i++)  [i < parameter_size: 1, i++: 1]
 229       1          flag = (CCV_TENSOR_GET_MEMORY(parameters[i]->info.type) == CCV_TENSOR_GPU_MEMORY);
 230       1        const int stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;  [CCV_STREAM_CONTEXT_CPU: 0]
 231       1        ccv_nnc_graph_set_default_static_schedule(graph, stream_type, dynamic_graph->max_stream_count);
 232       1        ccv_nnc_stream_context_t* const default_stream = ccv_nnc_graph_default_stream(graph);
 233       1        ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, default_stream);
 234       1        ccv_nnc_stream_context_wait(default_stream);
 235       1      } else
 236   1.40k        ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
 237   1.40k      ccv_nnc_graph_free(graph);
 238   1.40k      ccv_nnc_tensor_arena_free(tensor_arena);
 239   1.40k      ccv_nnc_graph_exec_arena_free(exec_arena);
 240   1.40k    }
 241            // Now we are able to free some of the reused outputs. This needs to be the last step, otherwise some of the
 242            // exec symbols above may be freed by this operation.
 243   4.26k    for (i = 0; i < freeable_size; i++)  [i++: 2.85k]
 244   2.85k      ccv_nnc_tensor_variable_free(dynamic_graph, freeables[i]);
 245   1.40k  }
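
For reference, a minimal caller sketch for the entry point measured above. It is illustrative only: dgraph, gradients, and parameters are assumed to come from earlier dynamic graph forward/backward calls, and the helper name apply_sgd_step plus the CMD_SGD_FORWARD hyperparameters are assumptions, not taken from this report. The saved_aux sizing mirrors source lines 40-41, and passing parallel = 0 with stream_context = 0 exercises the serial, synchronous path (lines 126-174 and 224-240).

    #include "ccv_nnc.h"
    #include "ccv_nnc_easy.h"

    /* Hypothetical caller: one optimizer step over parameters whose gradients
     * were already computed on dgraph. Hyperparameters are illustrative. */
    static void apply_sgd_step(ccv_nnc_dynamic_graph_t* const dgraph,
      ccv_nnc_tensor_variable_t* const gradients,
      ccv_nnc_tensor_variable_t* const parameters,
      const int parameter_size)
    {
      const ccv_nnc_cmd_t minimizer = CMD_SGD_FORWARD(0, 0.001, 1, 0.999, 0.9, 0.9);
      /* Per source lines 40-41: aux_size bookkeeping tensors per parameter. */
      const int aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
      ccv_nnc_tensor_variable_t saved_aux[parameter_size * aux_size];
      int i;
      for (i = 0; i < parameter_size * aux_size; i++)
        /* Empty variables; the SET_FORWARD exec seen at source lines 116-121
         * and 167-172 zero-initializes them on first use. */
        saved_aux[i] = ccv_nnc_tensor_variable_new(dgraph);
      /* parallel = 0 (single replica), stream_context = 0 (run synchronously). */
      ccv_nnc_dynamic_graph_apply_gradients(dgraph, minimizer, gradients, parameter_size, parameters, parameter_size, saved_aux, 0, 0);
    }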