Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_backward.c
 Line|  Count|Source
    1|       |#include "ccv_nnc.h"
    2|       |#include "ccv_nnc_easy.h"
    3|       |#include "ccv_nnc_internal.h"
    4|       |#include "ccv_internal.h"
    5|       |#include "_ccv_nnc_symbolic_graph.h"
    6|       |
    7|       |// MARK - Level-3.5 API
    8|       |
    9|       |typedef struct {
   10|       |  int f_wrt; // Check if both f_symbols and wrt_symbols flow through this node.
   11|       |  ccv_array_t* outgoings; // backward traverse nodes.
   12|       |  uint64_t* input_bitmasks;
   13|       |  int input_bitmask_size;
   14|       |  uint64_t* output_bitmasks;
   15|       |  int output_bitmask_size;
   16|       |} ccv_nnc_graph_backward_info_t;
   17|       |
   18|       |typedef struct {
   19|       |  int input_size;
   20|       |  int* inputs;
   21|       |  int output;
   22|       |  ccv_array_t* outgoings;
   23|       |  float value;
   24|       |  ccv_nnc_graph_exec_symbol_t symbol;
   25|       |} ccv_nnc_sum_or_set_graph_exec_symbol_t;
   26|       |
   27|       |typedef struct {
   28|       |  int input_size;
   29|       |  int output_size;
   30|       |  int* inputs;
   31|       |  int* outputs;
   32|       |  ccv_array_t* outgoings;
   33|       |  ccv_nnc_cmd_t cmd;
   34|       |  ccv_nnc_graph_exec_symbol_t symbol;
   35|       |} ccv_nnc_autograd_graph_exec_symbol_t;
   36|       |
   37|       |typedef struct {
   38|       |  int d; // The pointer to the forward level object.
   39|       |  int alias_ref; // The alias ref to itself (autograd_tensor_symbols array).
   40|       |  int flags; // Flags for this symbol.
   41|       |  ccv_nnc_tensor_symbol_t symbol;
   42|       |} ccv_nnc_autograd_tensor_symbol_t;
   43|       |
   44|       |typedef struct {
   45|       |  int d; // The tensor symbol ref.
   46|       |  int x; // The exec symbol ref.
   47|       |  ccv_array_t* exec_registry; // Additional exec symbol refs, similar to x, only useful for aliasing.
   48|       |  ccv_array_t* alias_registry; // int point to all the alias (if this is not an alias). The alias is the object in autograd_tensor_symbols, you need another level of indirection to get the actual forward level alias.
   49|       |} ccv_nnc_tensor_ref_t;
   50|       |
   51|       |typedef struct {
   52|       |  int c; // The start non-accumulated version.
   53|       |  ccv_array_t* ref_version; // tensor ref point to the reverse tensor symbol.
   54|       |} ccv_nnc_autograd_tensor_version_t;
   55|       |
   56|       |typedef struct {
   57|       |  int d;
   58|       |  int alias_ref;
   59|       |} ccv_nnc_sum_variable_t;
   60|       |
   61|       |// This method tries to figure out if a set of aliases can cover the whole tensor dim.
   62|       |// This is not a precise implementation though. The requirement is to answer this question
   63|       |// with a given memory constraint, therefore, only allow up to 65536 different tensor locations.
   64|       |// If you have more than that, it will assume that it doesn't have fully assigned aliases,
   65|       |// and will return 0.
   66|       |
   67|       |// Return 1 if inserted successfully.
   68|       |static inline int _ccv_nnc_try_mix(int* const md, const int ins, const int c)
   69|     43|{
   70|     43|  if (!c)
   71|     25|  {
   72|     25|    md[0] = ins;
   73|     25|    return 1;
   74|     25|  }
   75|     18|  int ll = 0, uu = c - 1;
   76|     18|  int mm;
   77|     20|  do {
   78|     20|    mm = ll + ((uu - ll) >> 1);
   79|     20|    if (ins == md[mm])
   80|     16|      return 0;
   81|      4|    else if (ins < md[mm])
   82|      2|      uu = mm - 1;
   83|      2|    else if (ins > md[mm])
   84|      2|      ll = mm + 1;
   85|     20|  } while (ll <= uu);
   86|      2|  if (ll < c)
   87|      2|    memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll));
   88|      2|  md[ll] = ins;
   89|      2|  return 1;
   90|     18|}
   91|       |
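Aside: _ccv_nnc_try_mix above is a binary-search insert into a sorted, deduplicated coordinate list; it returns 1 only when ins was not already present, so callers grow their length counter by the return value. A minimal standalone sketch of the same routine (editorial, not part of the covered file; try_mix is a local copy):

#include <stdio.h>
#include <string.h>

static int try_mix(int* const md, const int ins, const int c)
{
  if (!c)
  {
    md[0] = ins;
    return 1;
  }
  int ll = 0, uu = c - 1;
  int mm;
  do {
    mm = ll + ((uu - ll) >> 1);
    if (ins == md[mm])
      return 0; // Already present, nothing inserted.
    else if (ins < md[mm])
      uu = mm - 1;
    else
      ll = mm + 1;
  } while (ll <= uu);
  if (ll < c) // Shift the tail to make room at the insertion point.
    memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll));
  md[ll] = ins;
  return 1;
}

int main(void)
{
  int md[8];
  int c = 0;
  const int xs[] = { 4, 2, 4, 7, 2 };
  for (int i = 0; i < 5; i++)
    c += try_mix(md, xs[i], c); // c grows only on a new coordinate.
  for (int i = 0; i < c; i++)
    printf("%d ", md[i]); // Prints: 2 4 7
  printf("\n");
  return 0;
}
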
   92|       |static inline int _ccv_nnc_mix_idx(const int* const md, const int ins, const int c)
   93|     30|{
   94|     30|  if (c <= 1)
   95|     22|    return 0;
   96|      8|  int ll = 0, uu = c - 1;
   97|      8|  int mm;
   98|     14|  do {
   99|     14|    mm = ll + ((uu - ll) >> 1);
  100|     14|    if (ins == md[mm])
  101|      8|      return mm;
  102|      6|    else if (ins < md[mm])
  103|      0|      uu = mm - 1;
  104|      6|    else if (ins > md[mm])
  105|      6|      ll = mm + 1;
  106|     14|  } while (ll <= uu);
  107|      8|  assert(0 && "Shouldn't reach here");
  108|      0|  return -1;
  109|      0|}
  110|       |
  111|       |static inline void _ccv_nnc_try_set_pix_0(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset)
  112|      6|{
  113|      6|  const int s = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
  114|      6|  const int d = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
  115|      6|  assert(s >= 0 && d > s);
  116|      6|  int i;
  117|     12|  for (i = s; i < d; i++)
  118|       |    // Fill this pix. I can make this faster by loop through full ones (divided by 8), but too lazy.
  119|      6|    cube[(offset + i) >> 5] |= (1u << ((offset + i) & 0x1f));
  120|      6|}
  121|       |
  122|       |static inline void _ccv_nnc_try_set_pix_1(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset)
  123|     16|{
  124|     16|  const int s0 = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
  125|     16|  const int d0 = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
  126|     16|  assert(s0 >= 0 && d0 > s0);
  127|     16|  const int s1 = (ofs[1] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[1], ofs[1], cube_dim[1]) + 1;
  128|     16|  const int d1 = ((ofs[1] + dim[1] == tensor_dim[1]) ? cube_dim[1] : _ccv_nnc_mix_idx(scmd[1], ofs[1] + ccv_max(1, dim[1]), cube_dim[1])) + 1;
  129|     16|  assert(s1 >= 0 && d1 > s1);
  130|     16|  int i, j;
  131|     16|  const int step1 = cube_step[1];
  132|     16|  if (step1 == d0 - s0)
  133|     12|  {
  134|       |    // Faster one, we can simply loop through.
  135|     26|    for (i = s1 * step1; i < d1 * step1; i++)
  136|     14|      cube[(offset + i) >> 5] |= (1u << ((offset + i) & 0x1f));
  137|     12|  } else {
  138|      4|    offset += s1 * step1;
  139|       |    // There are gaps, slow one.
  140|      8|    for (i = s1; i < d1; i++, offset += step1)
  141|      8|      for (j = s0; j < d0; j++)
  142|      4|        cube[(offset + j) >> 5] |= (1u << ((offset + j) & 0x1f));
  143|      4|  }
  144|     16|}
  145|       |
  146|       |static inline void _ccv_nnc_try_set_pix(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset, const int dim_idx)
  147|     30|{
  148|     30|  switch (dim_idx)
  149|     30|  {
  150|     16|    case 1:
  151|     16|      _ccv_nnc_try_set_pix_1(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
  152|     16|      return;
  153|      6|    case 0:
  154|      6|      _ccv_nnc_try_set_pix_0(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
  155|      6|      return;
  156|     30|  }
  157|      8|  int i;
  158|      8|  const int s = (ofs[dim_idx] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx], cube_dim[dim_idx]) + 1;
  159|      8|  const int d = ((ofs[dim_idx] + dim[dim_idx] == tensor_dim[dim_idx]) ? cube_dim[dim_idx] : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx] + ccv_max(1, dim[dim_idx]), cube_dim[dim_idx])) + 1;
  160|      8|  assert(s >= 0 && d > s);
  161|     16|  for (i = s; i < d; i++)
  162|      8|    _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset + i * cube_step[dim_idx], dim_idx - 1);
  163|      8|}
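
Aside: the three _ccv_nnc_try_set_pix* helpers mark one alias's mapped extent in a flat bit cube, with cube_step[k] acting as the stride of dimension k. A minimal sketch of the same bookkeeping for a 3x2 cube (editorial; the extents below are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  enum { W = 3, H = 2 }; // cube_dim-like extents; the row stride is W
  uint32_t cube[(W * H + 31) >> 5] = { 0 };
  const int s0 = 1, d0 = 3, s1 = 0, d1 = 2; // one alias's mapped extent
  for (int i = s1; i < d1; i++)
    for (int j = s0; j < d0; j++)
    {
      const int offset = i * W + j;
      cube[offset >> 5] |= (1u << (offset & 0x1f)); // set one pix
    }
  // Bits 1, 2, 4, 5 are set; bits 0 and 3 (column 0) stay empty.
  printf("%x\n", (unsigned)cube[0]); // Prints: 36
  return 0;
}
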
  164|       |
  165|       |static int _ccv_nnc_tensor_ref_fully_assigned_with_aliases(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info)
  166|  2.11k|{
  167|       |  // Only work with tensor_ref of aliases.
  168|  2.11k|  assert(tensor_ref->alias_registry);
  169|  2.11k|  const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
  170|  2.11k|  assert(tensor_symbol_info[autograd->d].alias_ref == 0);
  171|  2.11k|  const int* tensor_dim = tensor_symbol_info[autograd->d].info.dim;
  172|  2.11k|  const size_t tensor_count = ccv_nnc_dimension_count(tensor_dim);
  173|  2.11k|  int i, j;
  174|  2.15k|  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  175|  2.12k|  {
  176|  2.12k|    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
  177|  2.12k|    assert(d < autograd_tensor_symbols->rnum);
  178|  2.12k|    const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  179|  2.12k|    assert(tensor_symbol_info[autograd->d].alias_ref);
  180|  2.12k|    const int* stride = tensor_symbol_info[autograd->d].stride;
  181|  2.12k|    const int* dim = tensor_symbol_info[autograd->d].info.dim;
  182|       |    // If this is just reshaped (i.e., dimension is the same, and inc covers the whole). We have fully assigned.
  183|  2.12k|    if (ccv_nnc_is_tensor_stride_packed(stride, dim) && ccv_nnc_dimension_count(dim) == tensor_count)
  184|  2.09k|      return 1;
  185|  2.12k|  }
  186|     26|  int tensor_nd_reshaped = 0;
  187|     26|  int tensor_dim_reshaped[CCV_NNC_MAX_DIM_ALLOC] = {0};
  188|     65|  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  189|     39|  {
  190|     39|    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
  191|     39|    assert(d < autograd_tensor_symbols->rnum);
  192|     39|    const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  193|     39|    assert(tensor_symbol_info[autograd->d].alias_ref);
  194|     39|    const int* stride = tensor_symbol_info[autograd->d].stride;
  195|     39|    const int nd = ccv_nnc_tensor_nd(stride);
  196|     39|    if (i == 0) // Derive a tensor dim from the first one, by doing divisions on strides.
  197|     26|    {
  198|     26|      if (nd > 0)
  199|     26|      {
  200|     26|        tensor_dim_reshaped[0] = tensor_count / stride[0];
  201|     56|        for (j = 1; j < nd; j++)
  202|     30|          tensor_dim_reshaped[j] = stride[j - 1] / stride[j];
  203|     26|        tensor_nd_reshaped = nd;
  204|     26|      }
  205|     26|      continue;
  206|     26|    }
  207|       |    // If reshaped differently, we cannot run out fill algorithm, do this conservatively.
  208|     13|    if (nd != tensor_nd_reshaped)
  209|      0|      return 0;
  210|       |    // Otherwise if inc doesn't match original dim, it is not covered.
  211|     13|    if (!ccv_nnc_is_tensor_stride_packed(stride, tensor_dim_reshaped))
  212|      0|      return 0;
  213|     13|  }
  214|     26|  if (tensor_nd_reshaped > 0)
  215|     26|    tensor_dim = tensor_dim_reshaped;
  216|       |  /* We need a solid cube (potentially hyper dimensional) to compute if there are overlaps.
  217|       |   * To make this cube as small as possible, we need to map the actual tensor dimension
  218|       |   * (therefore, we don't actually allocate the whole tensor to compute overlaps) to a smaller
  219|       |   * cube given the ofs and dim size of its aliases.
  220|       |   *
  221|       |   * The following code generated the dimension mapping (using scratch space) with binary search + insertion
  222|       |   * and then we fill the cube with a given tensor alias's dimensional information (ofs, dim).
  223|       |   * Afterwards, we simply need to check if the cube is totally filled up to know if this tensor
  224|       |   * is fully assigned with its aliases (if that is the case, we can skip zeroing for this tensor).
  225|       |   *
  226|       |   * There are several restrictions though to make this faster: 1). I cannot handle any cube that all side
  227|       |   * lengths combined larger than 1023 (scm only have 1024 scratch space). 2). I cannot handle any cube
  228|       |   * that the total volume is larger than 2048 * 8 (I only allocate 2K on stack for this).
  229|       |   * */
  230|     26|  int scm[1024]; // Having 1024 int scratch space for mapping dimensions. (Or sparse coordinate mapping).
  231|     26|  int cube_dim[CCV_NNC_MAX_DIM_ALLOC] = {}; // Mapping dimension size.
  232|     26|  int cube_size = 1;
  233|     26|  int* scmptr = scm;
  234|     50|  for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
  235|     40|  {
  236|     40|    int head = 0, tail = 0; // Note that we touched both the head and tail (otherwise this dimension is not fully covered).
  237|     40|    int len = 0;
  238|    105|    for (j = 0; j < tensor_ref->alias_registry->rnum; j++)
  239|     65|    {
  240|     65|      const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, j);
  241|     65|      assert(d < autograd_tensor_symbols->rnum);
  242|     65|      const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  243|     65|      assert(tensor_symbol_info[autograd->d].alias_ref);
  244|     65|      const int* ofs = tensor_symbol_info[autograd->d].ofs;
  245|     65|      const int* dim = tensor_symbol_info[autograd->d].info.dim;
  246|     65|      head = head || (ofs[i] == 0);
  247|     65|      tail = tail || (ofs[i] + ccv_max(1, dim[i]) == tensor_dim[i]);
  248|     65|      if (ofs[i] != 0)
  249|     14|        len += _ccv_nnc_try_mix(scmptr, ofs[i], len);
  250|     65|      if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
  251|      0|        return 0;
  252|     65|      if (ofs[i] + ccv_max(1, dim[i]) < tensor_dim[i])
  253|     29|        len += _ccv_nnc_try_mix(scmptr, ofs[i] + ccv_max(1, dim[i]), len);
  254|     65|      if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
  255|      0|        return 0;
  256|     65|    }
  257|     40|    if (!head || !tail)
  258|     16|      return 0;
  259|     24|    cube_size *= (len + 1);
  260|     24|    cube_dim[i] = len;
  261|     24|    scmptr += len; // Moving to next level.
  262|     24|  }
  263|       |  // The cube map is too large, cannot do the computation, assume it is not fully assigned.
  264|     10|  if (cube_size > 2048 * 8)
  265|      0|    return 0;
  266|       |  // binary map to see if it fills up.
  267|     10|  uint32_t cube[(cube_size + 31) >> 5];
  268|     10|  memset(cube, 0, sizeof(uint32_t) * ((cube_size + 31) >> 5));
  269|     10|  int* scmd[CCV_NNC_MAX_DIM_ALLOC] = {}; // Sparse coordinate map at dimension x.
  270|     10|  int cube_step[CCV_NNC_MAX_DIM_ALLOC] = {};
  271|     32|  for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
  272|     22|  {
  273|     22|    cube_step[i] = (i > 0) ? cube_step[i - 1] * (cube_dim[i - 1] + 1) : 1;
  274|     22|    scmd[i] = (i > 0) ? scmd[i - 1] + cube_dim[i - 1] : scm;
  275|     22|  }
  276|     10|  const int max_dim = i;
  277|     32|  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  278|     22|  {
  279|     22|    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
  280|     22|    assert(d < autograd_tensor_symbols->rnum);
  281|     22|    const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  282|     22|    assert(tensor_symbol_info[autograd->d].alias_ref);
  283|     22|    const int* ofs = tensor_symbol_info[autograd->d].ofs;
  284|     22|    const int* dim = tensor_symbol_info[autograd->d].info.dim;
  285|     22|    _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, 0, max_dim - 1);
  286|     22|  }
  287|       |  // Compare to see now if the binary map filled up. If it filled up, we know it is fully assigned.
  288|     10|  for (i = 0; i < (cube_size >> 5); i++)
  289|      0|    if (cube[i] < 0xffffffff)
  290|      0|      return 0;
  291|     10|  if ((cube_size & 0x1f) > 0)
  292|     10|  {
  293|       |    // Fetch the rest.
  294|     10|    uint32_t r = 0;
  295|     32|    for (i = 0; i < (cube_size & 0x1f); i++)
  296|     22|      r |= (1u << i);
  297|     10|    assert(cube[((cube_size + 31) >> 5) - 1] <= r);
  298|     10|    if (cube[((cube_size + 31) >> 5) - 1] < r)
  299|      0|      return 0;
  300|     10|  }
  301|     10|  return 1;
  302|     10|}
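
Aside: the final check above (source lines 288-300) treats the cube as packed 32-bit words: every full word must be all ones, and the last partial word is compared against a mask of the low cube_size & 0x1f bits. A standalone sketch of that check (editorial; fully_filled is a hypothetical extraction of those lines):

#include <stdint.h>
#include <stdio.h>

static int fully_filled(const uint32_t* const cube, const int cube_size)
{
  int i;
  for (i = 0; i < (cube_size >> 5); i++) // full 32-bit words
    if (cube[i] < 0xffffffff)
      return 0;
  if ((cube_size & 0x1f) > 0) // partial tail word
  {
    uint32_t r = 0;
    for (i = 0; i < (cube_size & 0x1f); i++)
      r |= (1u << i); // mask of the meaningful low bits
    if (cube[((cube_size + 31) >> 5) - 1] < r)
      return 0;
  }
  return 1;
}

int main(void)
{
  uint32_t cube[1] = { 0x3f }; // 6 bits set
  printf("%d\n", fully_filled(cube, 6)); // Prints: 1
  printf("%d\n", fully_filled(cube, 7)); // Prints: 0 (bit 6 missing)
  return 0;
}
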
  303|       |
  304|       |static int _ccv_nnc_tensor_ref_version_find_init(const ccv_nnc_autograd_tensor_version_t* const tensor_ver)
  305|      5|{
  306|      5|  int i;
  307|     10|  for (i = 0; i < tensor_ver->ref_version->rnum; i++)
  308|      7|    if (((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->x < 0)
  309|      2|      return i;
  310|      3|  return -1;
  311|      5|}
  312|       |
  313|       |static void _ccv_nnc_graph_sum_autograd_tensor_versions(const int idx, const int d, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_or_set_execs)
  314|  4.27k|{
  315|  4.27k|  int i, j;
  316|  4.27k|  assert(tensor_ver->c < tensor_ver->ref_version->rnum);
  317|  4.27k|  const int input_size = tensor_ver->ref_version->rnum - tensor_ver->c;
  318|  4.27k|  int* inputs = (int*)ccmalloc(sizeof(int) * input_size);
  319|  12.8k|  for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
  320|  8.57k|    inputs[i] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->d;
  321|  4.27k|  const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
  322|  4.27k|    .d = d
  323|  4.27k|  };
  324|  4.27k|  ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  325|  4.27k|  ccv_nnc_sum_or_set_graph_exec_symbol_t sum_exec = {
  326|  4.27k|    .input_size = input_size,
  327|  4.27k|    .inputs = inputs,
  328|  4.27k|    .output = autograd_tensor_symbols->rnum - 1
  329|  4.27k|  };
  330|  4.27k|  if (idx >= 0)
  331|  4.24k|  {
  332|  4.24k|    sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
  333|  4.24k|    ccv_array_push(sum_exec.outgoings, &idx);
  334|  4.24k|  }
  335|  4.27k|  ccv_array_push(sum_or_set_execs, &sum_exec);
  336|  4.27k|  const int outgoing = exec_symbol_info_size + sum_or_set_execs->rnum - 1;
  337|  12.8k|  for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
  338|  8.57k|  {
  339|  8.57k|    const ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
  340|  8.57k|    const int x = tensor_ref->x;
  341|  8.57k|    if (x < 0) /* This is initialization tensor, it has to be occurred before the execution anyway. */
  342|      1|    {
  343|       |      // No alias.
  344|      1|      assert(!tensor_ref->alias_registry);
  345|       |      // No associated additional execs.
  346|      1|      assert(!tensor_ref->exec_registry);
  347|      1|      continue;
  348|      1|    }
  349|  8.57k|    if (x < exec_symbol_info_size)
  350|  8.57k|    {
  351|  8.57k|      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
  352|  8.57k|      if (!back_exec->outgoings)
  353|     39|        back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
  354|  8.57k|      ccv_array_replace_unique_int(back_exec->outgoings, idx, outgoing);
  355|  8.57k|    } else {
  356|       |      // This tensor_ref is generated by the sum operation.
  357|      0|      ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size);
  358|      0|      ccv_array_replace_unique_int(sum_or_set->outgoings, idx, outgoing);
  359|      0|    }
  360|       |    // If this tensor have associated alias, we need to init it to zeros when it is allocated (we only need to set a flag here)
  361|       |    // it is handled at compilation phase.
  362|  8.57k|    if (tensor_ref->alias_registry &&
  363|       |      // Loop over to see if this tensor is fully occupied to avoid extra zero step.
  364|  8.57k|      !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
  365|      8|    {
  366|      8|      ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
  367|       |      // By having alias_registry, what this symbol represents must not by an alias.
  368|      8|      assert(tensor_sym->alias_ref == 0);
  369|      8|      tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
  370|      8|    }
  371|  8.57k|    if (tensor_ref->exec_registry)
  372|      4|      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
  373|      2|      {
  374|      2|        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
  375|      2|        assert(x >= 0);
  376|       |        // The exec_registry can only be generated by alias registry, therefore, it cannot reference to a sum operation.
  377|      2|        assert(x < exec_symbol_info_size);
  378|      2|        ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
  379|      2|        if (!back_exec->outgoings)
  380|      1|          back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
  381|      2|        ccv_array_replace_unique_int(back_exec->outgoings, idx, outgoing);
  382|      2|      }
  383|  8.57k|  }
  384|  4.27k|  const ccv_nnc_tensor_ref_t tensor_ref = {
  385|  4.27k|    .d = autograd_tensor_symbols->rnum - 1,
  386|  4.27k|    .x = outgoing
  387|  4.27k|  };
  388|  4.27k|  ccv_array_push(tensor_ver->ref_version, &tensor_ref);
  389|       |  /* Move the c pointer up to the latest summed result. */
  390|  4.27k|  tensor_ver->c = tensor_ver->ref_version->rnum - 1;
  391|  4.27k|}
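
Aside: sum nodes appended by this function are addressed with indices at or past exec_symbol_info_size, which is how the test x < exec_symbol_info_size above distinguishes a backward exec from a sum/set node. A small sketch of that indexing convention (editorial; the sizes below are made up):

#include <stdio.h>

int main(void)
{
  const int exec_symbol_info_size = 4; // forward exec symbol count (assumed)
  const int sum_or_set_rnum = 2;       // sum nodes appended so far (assumed)
  // The index of the most recently appended sum node:
  const int outgoing = exec_symbol_info_size + sum_or_set_rnum - 1;
  if (outgoing < exec_symbol_info_size)
    printf("autograd_execs[%d]\n", outgoing);
  else
    printf("sum_or_set_execs[%d]\n", outgoing - exec_symbol_info_size);
  // Prints: sum_or_set_execs[1]
  return 0;
}
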
  392|       |
  393|       |static int _ccv_nnc_tensor_ref_version_involve_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
  394|     69|{
  395|     69|  assert(alias->alias_ref > 0);
  396|       |  // No alias_registry, must conflict (owns the whole band).
  397|     69|  if (!tensor_ref->alias_registry)
  398|     25|    return 1;
  399|     44|  int i;
  400|     63|  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  401|     54|  {
  402|     54|    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
  403|     54|    assert(d < autograd_tensor_symbols->rnum);
  404|     54|    ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  405|     54|    if (ccv_nnc_over_tensor_symbol_aliases(tensor_symbol_info + autograd->d, alias))
  406|     35|      return 1;
  407|     54|  }
  408|       |  // All aliases referenced by this ref_version doesn't overlap with the provided one, thus, there is no conflict at all.
  409|      9|  return 0;
  410|     44|}
  411|       |
  412|       |static int _ccv_nnc_tensor_ref_version_find_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
  413|     30|{
  414|     30|  assert(alias->alias_ref > 0);
  415|       |  // No alias_registry, thus, cannot find the exact matched alias.
  416|     30|  if (!tensor_ref->alias_registry)
  417|     11|    return -1;
  418|     19|  int i;
  419|     34|  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  420|     26|  {
  421|     26|    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
  422|     26|    assert(d < autograd_tensor_symbols->rnum);
  423|     26|    ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  424|       |    // This must reference to an alias.
  425|     26|    assert(tensor_symbol_info[autograd->d].alias_ref);
  426|     26|    const int* stride = tensor_symbol_info[autograd->d].stride;
  427|     26|    const int* ofs = tensor_symbol_info[autograd->d].ofs;
  428|     26|    const int* dim = tensor_symbol_info[autograd->d].info.dim;
  429|       |    // If everything matches, this is the required alias.
  430|     26|    if (memcmp(stride, alias->stride, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
  431|     26|      memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
  432|     26|      memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
  433|     11|      return d;
  434|     26|  }
  435|      8|  return -1;
  436|     19|}
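
Aside: both lookups above decide alias identity by memcmp over the full CCV_NNC_MAX_DIM_ALLOC-padded stride / ofs / dim records, so two views match only if all three arrays are bitwise equal. A standalone sketch (editorial; MAX_DIM_ALLOC and alias_t are stand-ins for the real types):

#include <stdio.h>
#include <string.h>

#define MAX_DIM_ALLOC 8 /* stand-in for CCV_NNC_MAX_DIM_ALLOC */

typedef struct {
  int stride[MAX_DIM_ALLOC];
  int ofs[MAX_DIM_ALLOC];
  int dim[MAX_DIM_ALLOC];
} alias_t;

static int same_alias(const alias_t* const a, const alias_t* const b)
{
  return memcmp(a->stride, b->stride, sizeof(int) * MAX_DIM_ALLOC) == 0 &&
    memcmp(a->ofs, b->ofs, sizeof(int) * MAX_DIM_ALLOC) == 0 &&
    memcmp(a->dim, b->dim, sizeof(int) * MAX_DIM_ALLOC) == 0;
}

int main(void)
{
  alias_t a = { .stride = { 4, 1 }, .ofs = { 0, 0 }, .dim = { 2, 4 } };
  alias_t b = a;
  b.ofs[0] = 2; // same shape, different offset: a different view
  printf("%d %d\n", same_alias(&a, &a), same_alias(&a, &b)); // Prints: 1 0
  return 0;
}
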
  437|       |
  438|       |static int _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
  439|      4|{
  440|      4|  assert(alias->alias_ref > 0);
  441|       |  // No alias_registry, thus, cannot find the exact matched alias.
  442|      4|  if (!tensor_ref->alias_registry)
  443|      0|    return 0;
  444|      4|  int i;
  445|      8|  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  446|      5|  {
  447|      5|    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
  448|      5|    assert(d < autograd_tensor_symbols->rnum);
  449|      5|    ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
  450|       |    // This must reference to an alias.
  451|      5|    assert(tensor_symbol_info[autograd->d].alias_ref);
  452|      5|    const int* stride = tensor_symbol_info[autograd->d].stride;
  453|      5|    const int* ofs = tensor_symbol_info[autograd->d].ofs;
  454|      5|    const int* dim = tensor_symbol_info[autograd->d].info.dim;
  455|      5|    if (memcmp(stride, alias->stride, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
  456|      5|      memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
  457|      5|      memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
  458|      1|      return 0;
  459|      5|  }
  460|       |  // If everything matches for every alias in registry, we can use any of the alias directly.
  461|      3|  return 1;
  462|      4|}
  463|       |
  464|       |static int _ccv_nnc_graph_sum_autograd_tensor_versions_alias(const int idx, const int d, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const alias, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_or_set_execs)
  465|     21|{
  466|     21|  assert(tensor_ver->c < tensor_ver->ref_version->rnum);
  467|     21|  int i, j = 0;
  468|     21|  struct {
  469|     21|    int k;
  470|     21|    int i;
  471|     21|  } kd[tensor_ver->ref_version->rnum - tensor_ver->c];
  472|     51|  for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
  473|     30|  {
  474|     30|    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
  475|     30|    const int k = _ccv_nnc_tensor_ref_version_find_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias);
  476|     30|    if (k >= 0)
  477|     11|      kd[j++] = (typeof(kd[0])){
  478|     11|        .k = k, .i = i
  479|     11|      };
  480|     19|    else if (_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
  481|     19|      kd[j++] = (typeof(kd[0])) {
  482|     19|        .k = -1, .i = i // It has dependency to the original tensor (non-alias) now, label this with highest bit.
  483|     19|      };
  484|     30|  }
  485|       |  // Can only find one. This is the easy case, we can simply return that symbol (or its alias).
  486|     21|  if (j == 1)
  487|     15|  {
  488|     15|    if (kd[0].k >= 0)
  489|      4|      return kd[0].k; // Only can find one alias, that is the one.
  490|       |    // Otherwise, need to create a new alias.
  491|     11|    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[0].i);
  492|     11|    ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
  493|       |    // Since we create new alias, we need to set the referenced one to be allocated with 0s.
  494|     11|    if (ref->alias_ref) // If this is an alias, it has to be zero initialized.
  495|      0|    {
  496|      0|      ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, ref->alias_ref - 1);
  497|      0|      assert(ref->alias_ref == 0); // This is original.
  498|      0|      ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
  499|     11|    } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
  500|       |        // Loop over to see if this tensor is fully occupied to avoid extra zero step.
  501|     11|        !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
  502|      1|      ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
  503|      1|    }
  504|     11|    ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
  505|     11|      .d = d,
  506|     11|      .alias_ref = tensor_ref->d + 1
  507|     11|    };
  508|     11|    ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  509|     11|    const int ad = autograd_tensor_symbols->rnum - 1;
  510|     11|    if (tensor_ref->alias_registry) // Only push this when it has an alias registry (otherwise it already conflict with everyone).
  511|      3|      ccv_array_push(tensor_ref->alias_registry, &ad);
  512|     11|    if (tensor_ref->x >= exec_symbol_info_size && idx >= 0)
  513|      2|    {
  514|      2|      ccv_nnc_sum_or_set_graph_exec_symbol_t* const sum_or_set_exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, tensor_ref->x - exec_symbol_info_size);
  515|       |      // This may be summed, thus, we need to create a connection between this and the sum.
  516|      2|      if (!sum_or_set_exec->outgoings)
  517|      0|        sum_or_set_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
  518|      2|      ccv_array_push(sum_or_set_exec->outgoings, &idx);
  519|      2|    }
  520|       |    // The newly inserted tensor symbol.
  521|     11|    return ad;
  522|     11|  }
  523|       |  // Otherwise, we need to create the sum operation out of these.
  524|      6|  const int input_size = j;
  525|      6|  int has_this_alias_exclusively = 1;
  526|      6|  int* inputs = input_size > 0 ? (int*)ccmalloc(sizeof(int) * input_size) : 0;
  527|     21|  for (i = 0; i < input_size; i++)
  528|     15|  {
  529|     15|    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
  530|       |    // Can take a fast path if every ref involved has the same alias, our sum operation can be faster (using alias directly).
  531|     15|    if (has_this_alias_exclusively && kd[i].k >= 0 && _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
  532|      3|      inputs[i] = *(int*)ccv_array_get(tensor_ref->alias_registry, 0); // Assigning the alias.
  533|     12|    else {
  534|     12|      if (has_this_alias_exclusively)
  535|      5|      {
  536|      5|        has_this_alias_exclusively = 0;
  537|      5|        for (j = 0; j < i; j++)
  538|      0|          inputs[j] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[j].i))->d;
  539|      5|      }
  540|     12|      inputs[i] = tensor_ref->d;
  541|     12|    }
  542|     15|  }
  543|      6|  ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
  544|      6|    .d = alias->alias_ref - 1
  545|      6|  };
  546|      6|  ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  547|      6|  const int tensor_ref_d = autograd_tensor_symbols->rnum - 1;
  548|      6|  tensor_sym.d = d;
  549|      6|  tensor_sym.alias_ref = tensor_ref_d + 1;
  550|      6|  ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  551|      6|  const int ad = autograd_tensor_symbols->rnum - 1;
  552|      6|  ccv_nnc_sum_or_set_graph_exec_symbol_t sum_exec = {
  553|      6|    .input_size = input_size,
  554|      6|    .inputs = inputs,
  555|      6|    .output = has_this_alias_exclusively ? ad : tensor_ref_d /* If has this alias exclusively, the output should be alias as well. Otherwise the output is the real tensor. */
  556|      6|  };
  557|      6|  if (idx >= 0)
  558|      6|  {
  559|      6|    sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
  560|      6|    ccv_array_push(sum_exec.outgoings, &idx);
  561|      6|  }
  562|      6|  ccv_array_push(sum_or_set_execs, &sum_exec);
  563|      6|  const int outgoing = exec_symbol_info_size + sum_or_set_execs->rnum - 1;
  564|      6|  int no_alias_registry = 0;
  565|     21|  for (i = 0; i < input_size; i++)
  566|     15|  {
  567|     15|    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
  568|     15|    if (!has_this_alias_exclusively)
  569|     12|    {
  570|       |      // If the sum operation is not operating on one alias. I need to zero this tensor out when it is first
  571|       |      // allocated (see discussions around the flags I use).
  572|     12|      ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
  573|     12|      if (tensor_sym->alias_ref)
  574|      0|      {
  575|       |        // Find the original tensor_sym and set its flags (I prefer to set flags on its original).
  576|      0|        ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_sym->alias_ref - 1);
  577|      0|        assert(ref->alias_ref == 0); // This is original.
  578|      0|        ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
  579|     12|      } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
  580|       |          // Loop over to see if this tensor is fully occupied to avoid extra zero step.
  581|     12|          !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
  582|      6|        tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
  583|      6|      }
  584|     12|    }
  585|       |    // Check to see if any of these tensors doesn't have alias.
  586|     15|    no_alias_registry |= (!tensor_ref->alias_registry);
  587|     15|    const int x = tensor_ref->x;
  588|     15|    assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
  589|     15|    if (x < exec_symbol_info_size)
  590|     15|    {
  591|     15|      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
  592|     15|      if (!back_exec->outgoings)
  593|      0|        back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
  594|     15|      ccv_array_push(back_exec->outgoings, &outgoing);
  595|     15|    } else {
  596|      0|      ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size);
  597|      0|      ccv_array_push(sum_or_set->outgoings, &outgoing);
  598|      0|    }
  599|     15|    if (tensor_ref->exec_registry)
  600|      6|      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
  601|      3|      {
  602|      3|        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
  603|      3|        assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
  604|      3|        assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference to a sum operation.
  605|      3|        ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
  606|      3|        if (!back_exec->outgoings)
  607|      0|          back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
  608|      3|        ccv_array_push(back_exec->outgoings, &outgoing);
  609|      3|      }
  610|     15|  }
  611|      6|  const ccv_nnc_tensor_ref_t tensor_ref = {
  612|      6|    .d = tensor_ref_d,
  613|      6|    .x = outgoing,
  614|      6|    .exec_registry = 0, // I don't need to take execution dependencies because this tensor is generated by sum, therefore, we already take that dependency.
  615|      6|    .alias_registry = !no_alias_registry || has_this_alias_exclusively ? ccv_array_new(sizeof(int), 1, 0) : 0
  616|      6|  };
  617|       |  // If there is no alias registry, then we take the whole tensor ref as one.
  618|      6|  if (!no_alias_registry || has_this_alias_exclusively)
  619|      4|  {
  620|       |    // If this tensor ref contains multiple different types of alias, have to add them together (otherwise
  621|       |    // the computation for if there is an empty slot in this tensor ref is not correct without all the
  622|       |    // occupancy availability information).
  623|      4|    if (!has_this_alias_exclusively)
  624|     10|      for (i = 0; i < input_size; i++)
  625|      7|      {
  626|      7|        ccv_nnc_tensor_ref_t* ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
  627|      7|        assert(ref->alias_registry);
  628|       |        // It may get duplicates. But whatever, won't matter the computation.
  629|     19|        for (j = 0; j < ref->alias_registry->rnum; j++)
  630|     12|          ccv_array_push(tensor_ref.alias_registry, ccv_array_get(ref->alias_registry, j));
  631|      7|      }
  632|      4|    ccv_array_push(tensor_ref.alias_registry, &ad);
  633|      4|  }
  634|      6|  assert(input_size <= tensor_ver->ref_version->rnum - tensor_ver->c);
  635|      6|  ccv_nnc_tensor_ref_t x;
  636|     21|  for (i = 0; i < input_size; i++)
  637|       |    // If the current one (i + tensor_ver->c) is smaller than the one referenced to, exchange.
  638|     15|    if (kd[i].i > i + tensor_ver->c)
  639|      0|      CCV_SWAP(*(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i + tensor_ver->c), *(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i), x);
  640|      6|  ccv_array_push(tensor_ver->ref_version, &tensor_ref);
  641|       |  // We've consumed input_size tensor refs, now move c up to the pointer of non-consumed tensors.
  642|      6|  tensor_ver->c += input_size;
  643|      6|  return ad;
  644|      6|}
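
Aside: the compaction at source lines 636-639 swaps consumed refs down to the front of ref_version so tensor_ver->c can be advanced past them in one step. A sketch with a stand-in swap macro (editorial; the real CCV_SWAP lives in ccv's headers and is assumed here to take a temporary as its third argument):

#include <stdio.h>

#define SWAP(a, b, t) ((t) = (a), (a) = (b), (b) = (t))

int main(void)
{
  int refs[] = { 7, 8, 9, 10 };
  int c = 1;          // refs[0] was consumed earlier
  const int kd_i = 3; // the ref just consumed sits at index 3
  int x;
  if (kd_i > 0 + c)   // move it down to position c, like the loop above
    SWAP(refs[0 + c], refs[kd_i], x);
  for (int i = 0; i < 4; i++)
    printf("%d ", refs[i]); // Prints: 7 10 9 8
  printf("\n");
  return 0;
}
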
  645|       |
  646|       |typedef struct ccv_nnc_symbolic_graph_backward_prep_s {
  647|       |  int exec_symbol_info_size; // Number of graph exec symbols before adding any new symbols related to automatic differentiation.
  648|       |  int tensor_symbol_info_size; // Number of tensor symbols before adding anything new.
  649|       |  int sub_prep_size;
  650|       |  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
  651|       |  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
  652|       |  ccv_nnc_graph_backward_info_t* backward_info; // Corresponding to forward graph exec symbol info, it is exactly in reverse.
  653|       |  ccv_nnc_graph_visit_t* forward_visit; // The visitor structure (top sorted index) when doing traversal.
  654|       |  ccv_nnc_graph_visit_t* backward_visit; // The visitor structure (top sorted index) when doing reverse traversal.
  655|       |  ccv_nnc_autograd_graph_exec_symbol_t* autograd_execs; // The graph exec symbols we need for automatic differentiation. This is a 1:1 mapping for forward graph exec symbols, however, unlike backward_info, its outgoings may be more complex (may contain outgoing flows to sum nodes).
  656|       |  ccv_nnc_autograd_tensor_version_t* autograd_tensor_versions; // Corresponding to forward tensor symbols, each may contain multiple versions (due to multi-write).
  657|       |  ccv_array_t* autograd_tensor_symbols; // The tensor symbols we need for automatic differentiation (it may not be 1:1 mapping).
  658|       |  ccv_array_t* sum_or_set_execs; // The sum nodes, because in reverse mode, a tensor could have multiple versions, we need to sum them up before use.
  659|       |  struct ccv_nnc_symbolic_graph_backward_prep_s* sub_preps; // The preps of its sub-graphs.
  660|       |  // Pointers not managed by this struct
  661|       |  ccv_nnc_symbolic_graph_t* graph;
  662|       |} ccv_nnc_symbolic_graph_backward_prep_t;
  663|       |
  664|       |static ccv_nnc_symbolic_graph_backward_prep_t _ccv_nnc_symbolic_graph_backward_prep(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
  665|  6.78k|{
  666|  6.78k|  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
  667|  6.78k|  assert(exec_symbol_info_size > 0);
  668|  6.78k|  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
  669|  6.78k|  assert(tensor_symbol_info_size > 0);
  670|  6.78k|  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * exec_symbol_info_size);
  671|  6.78k|  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * tensor_symbol_info_size);
  672|  13.5k|  ccv_nnc_graph_visit_t* forward_visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), exec_symbol_info_size, sources, source_size, destinations, destination_size, 0);
  673|      0|  ccv_nnc_symbolic_graph_symbol_infer(graph, forward_visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
  674|  13.5k|  int i;
  675|       |  // Now, for each one of these, find a reverse graph.
  676|  13.5k|  ccv_nnc_graph_backward_info_t* backward_info = (ccv_nnc_graph_backward_info_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_graph_backward_info_t));
  677|  19.1k|  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
  678|  19.1k|    assert(ccv_nnc_cmd_is_forward(node->cmd) || node->cmd.cmd == CCV_NNC_NOOP);
  679|  19.1k|    if (node->outgoings)
  680|  24.7k|      for (i = 0; i < node->outgoings->rnum; i++)
  681|  12.4k|      {
  682|  12.4k|        int d = *(int*)ccv_array_get(node->outgoings, i);
  683|  12.4k|        if (!backward_info[d].outgoings)
  684|  12.3k|          backward_info[d].outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
  685|  12.4k|        ccv_array_push(backward_info[d].outgoings, &idx);
  686|  12.4k|      }
  687|  19.1k|  } ccv_nnc_graph_visit_endfor
  688|       |  // Also mark only the output bits that we use.
  689|  25.9k|  for (i = 0; i < exec_symbol_info_size; i++)
  690|  19.1k|  {
  691|  19.1k|    backward_info[i].input_bitmask_size = ((exec_symbol_info[i].output_size * 2 + exec_symbol_info[i].input_size + 63) >> 6);
  692|  19.1k|    backward_info[i].output_bitmask_size = ((exec_symbol_info[i].input_size + 63) >> 6);
  693|       |    // Allocate input / output bitmasks
  694|  19.1k|    if (backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size > 0)
  695|  19.1k|    {
  696|  19.1k|      backward_info[i].input_bitmasks = (uint64_t*)cccalloc(backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size, sizeof(uint64_t));
  697|  19.1k|      if (backward_info[i].output_bitmask_size)
  698|  19.1k|        backward_info[i].output_bitmasks = backward_info[i].input_bitmasks + backward_info[i].input_bitmask_size;
  699|  19.1k|    }
  700|  19.1k|  }
  701|  6.78k|  ccv_nnc_graph_visit_t* backward_visit = ccv_nnc_graph_visit_new(graph, backward_info, exec_symbol_info_size, destinations, destination_size, sources, source_size, 0);
  702|  6.78k|  const int sub_prep_size = graph->sub_graphs ? graph->sub_graphs->rnum : 0;
  703|  6.78k|  ccv_nnc_symbolic_graph_backward_prep_t* sub_preps = sub_prep_size > 0 ? (ccv_nnc_symbolic_graph_backward_prep_t*)cccalloc(sub_prep_size, sizeof(ccv_nnc_symbolic_graph_backward_prep_t)) : 0;
  704|  6.78k|  for (i = 0; i < sub_prep_size; i++)
  705|      4|  {
  706|      4|    const ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(graph->sub_graphs, i);
  707|      4|    sub_preps[i] = _ccv_nnc_symbolic_graph_backward_prep(sub_graph, ccv_nnc_symbolic_graph_sources(sub_graph), ccv_nnc_symbolic_graph_source_size(sub_graph), ccv_nnc_symbolic_graph_destinations(sub_graph), ccv_nnc_symbolic_graph_destination_size(sub_graph));
  708|      4|  }
  709|  6.78k|  return (ccv_nnc_symbolic_graph_backward_prep_t){
  710|  6.78k|    .exec_symbol_info_size = exec_symbol_info_size,
  711|  6.78k|    .tensor_symbol_info_size = tensor_symbol_info_size,
  712|  6.78k|    .sub_prep_size = sub_prep_size,
  713|  6.78k|    .exec_symbol_info = exec_symbol_info,
  714|  6.78k|    .tensor_symbol_info = tensor_symbol_info,
  715|  6.78k|    .backward_info = backward_info,
  716|  6.78k|    .forward_visit = forward_visit,
  717|  6.78k|    .backward_visit = backward_visit,
  718|  6.78k|    .sub_preps = sub_preps,
  719|  6.78k|    .graph = (ccv_nnc_symbolic_graph_t*)graph,
  720|  6.78k|  };
  721|  6.78k|}
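
Aside: the backward_info construction above (source lines 677-687) is a plain adjacency-list reversal of the forward graph: every forward edge idx -> d is recorded as d -> idx. The bitmask sizing on lines 691-692 is the usual ceiling divide, (n + 63) >> 6 words of 64 bits for n bits. A minimal sketch of the reversal (editorial; fixed-size arrays replace ccv_array_t):

#include <stdio.h>

int main(void)
{
  enum { N = 3 };
  // Forward edges as (from, to) pairs: 0->1, 0->2, 1->2.
  const int edges[][2] = { { 0, 1 }, { 0, 2 }, { 1, 2 } };
  int outgoings[N][N]; // reversed adjacency lists
  int rnum[N] = { 0 }; // entries per node
  for (int e = 0; e < 3; e++)
  {
    const int idx = edges[e][0], d = edges[e][1];
    outgoings[d][rnum[d]++] = idx; // backward: d points back at idx
  }
  for (int i = 0; i < N; i++)
  {
    printf("backward_info[%d].outgoings:", i);
    for (int j = 0; j < rnum[i]; j++)
      printf(" %d", outgoings[i][j]);
    printf("\n");
  }
  // Prints:
  // backward_info[0].outgoings:
  // backward_info[1].outgoings: 0
  // backward_info[2].outgoings: 0 1
  return 0;
}
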
  722|       |
  723|       |static void _ccv_nnc_symbolic_graph_backward_exec_io(const ccv_nnc_graph_exec_symbol_info_t* const node, int** const back_input_map, int** const back_output_map, int* const back_input_size, int* const back_output_size)
  724|  19.1k|{
  725|  19.1k|  int i;
  726|  19.1k|  if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
  727|      7|  {
  728|      7|    *back_input_map = node->outputs;
  729|      7|    *back_input_size = node->output_size;
  730|     14|    for (i = 0; i < node->case_of.argument.offset; i++)
  731|      7|      (*back_output_map)[i] = node->inputs[i];
  732|      7|    const int argument_offset = node->case_of.argument.offset;
  733|      7|    const int argument_size = node->case_of.argument.size;
  734|       |    // Skip the argument range.
  735|      7|    for (i = argument_offset + argument_size; i < node->input_size; i++)
  736|      0|      (*back_output_map)[i - argument_size] = node->inputs[i];
  737|      7|    *back_output_size = node->input_size - node->case_of.argument.size;
  738|  19.1k|  } else { // if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
  739|  19.1k|    *back_input_map = node->outputs;
  740|  19.1k|    *back_input_size = node->output_size;
  741|  19.1k|    *back_output_map = node->inputs;
  742|  19.1k|    *back_output_size = node->input_size;
  743|  19.1k|  }
  744|  19.1k|}
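
Aside: for an ordinary node the backward op simply swaps the forward io: gradients come in for the forward outputs and go out for the forward inputs; only the case-of path additionally carves out the argument range. A sketch of the plain swap (editorial; node_t is a stand-in for the real exec symbol info):

#include <stdio.h>

typedef struct {
  int* inputs;
  int input_size;
  int* outputs;
  int output_size;
} node_t;

int main(void)
{
  int fwd_in[] = { 10, 11, 12 }, fwd_out[] = { 20 };
  node_t node = { fwd_in, 3, fwd_out, 1 };
  int* back_input_map = node.outputs;   // gradients flow in from forward outputs
  int back_input_size = node.output_size;
  int* back_output_map = node.inputs;   // gradients flow out to forward inputs
  int back_output_size = node.input_size;
  printf("%d in, %d out, first in = %d, first out = %d\n",
    back_input_size, back_output_size, back_input_map[0], back_output_map[0]);
  // Prints: 1 in, 3 out, first in = 20, first out = 10
  return 0;
}
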
  745|       |
  746|       |static void _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(const ccv_nnc_graph_exec_symbol_info_t* const forw_exec, const ccv_nnc_symbolic_graph_t* const sub_graph, const int graph_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const uint64_t* const input_bitmasks, const uint64_t* const output_bitmasks, ccv_array_t* const sub_f_symbols, ccv_array_t* const sub_wrt_symbols)
  747|      8|{
  748|      8|  int i, j;
  749|      8|  ccv_array_clear(sub_wrt_symbols);
  750|      8|  int forw_outputs[ccv_max(1, forw_exec->output_size)];
  751|      8|  int forw_inputs[ccv_max(1, forw_exec->input_size)];
  752|      8|  int* back_input_map = forw_outputs;
  753|      8|  int* back_output_map = forw_inputs;
  754|      8|  int back_input_size, back_output_size;
  755|      8|  _ccv_nnc_symbolic_graph_backward_exec_io(forw_exec, &back_input_map, &back_output_map, &back_input_size, &back_output_size);
  756|     18|  for (i = 0; i < back_output_size; i++)
  757|     10|    if (output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
  758|      8|    {
  759|      8|      const int d = back_output_map[i];
  760|      8|      const ccv_array_t* const s_refs = tensor_symbol_info[d].s_ref;
  761|      8|      const int s_ref = s_refs && s_refs->rnum > graph_ref ? *(int*)ccv_array_get(s_refs, graph_ref) - 1 : -1;
  762|      8|      if (s_ref >= 0)
  763|      4|      {
  764|      4|        ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
  765|      4|          .d = s_ref,
  766|      4|          .graph = sub_graph,
  767|      4|        };
  768|      4|        ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
  769|      4|      } else
  770|      4|        ccv_array_push(sub_wrt_symbols, &NO_TENSOR_SYMBOL);
  771|      8|    }
  772|      8|  ccv_array_clear(sub_f_symbols);
  773|     16|  for (i = 0; i < back_input_size; i++)
  774|      8|    if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
  775|      8|    {
  776|      8|      const int d = back_input_map[i];
  777|      8|      ccv_nnc_tensor_symbol_t sub_f_symbol = {
  778|      8|        .d = *(int*)ccv_array_get(tensor_symbol_info[d].s_ref, graph_ref) - 1,
  779|      8|        .graph = sub_graph,
  780|      8|      };
  781|      8|      ccv_array_push(sub_f_symbols, &sub_f_symbol);
  782|      8|    }
  783|       |  // Go through all its assignments (parameterized loop), making them either wrt or f.
  784|       |  // The reason is these must flow through the graph, otherwise we cannot form a full
  785|       |  // enclosed loop. Also because they are the additional f / wrt symbols, there is
  786|       |  // no case that we cannot find their corresponding gradients in the backward sub graphs
  787|       |  // (these gradients have to be parameterized to form an enclosed loop as well).
  788|     30|  for (i = 0; i < sub_graph->tensor_symbol_info->rnum; i++)
  789|     22|  {
  790|     22|    const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i);
  791|     22|    if (tensor_symbol_info->assign_ref)
  792|      2|    {
  793|      2|      const int assign_ref = tensor_symbol_info->assign_ref - 1;
  794|       |      // i is the wrt, assign_ref is the f.
  795|      2|      int flag = 0;
  796|      4|      for (j = 0; !flag && j < sub_wrt_symbols->rnum; j++)
  797|      2|        flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, j))->d == i);
  798|      2|      if (!flag)
  799|      2|      {
  800|      2|        ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
  801|      2|          .d = i,
  802|      2|          .graph = sub_graph,
  803|      2|        };
  804|      2|        ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
  805|      2|      }
  806|      2|      flag = 0;
  807|      4|      for (j = 0; !flag && j < sub_f_symbols->rnum; j++)
  808|      2|        flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, j))->d == assign_ref);
  809|      2|      if (!flag)
  810|      0|      {
  811|      0|        ccv_nnc_tensor_symbol_t sub_f_symbol = {
  812|      0|          .d = assign_ref,
  813|      0|          .graph = sub_graph,
  814|      0|        };
  815|      0|        ccv_array_push(sub_f_symbols, &sub_f_symbol);
  816|      0|      }
  817|      2|    }
  818|     22|  }
  819|      8|}
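
Aside: the recurring test bitmask[i >> 6] & ((uint64_t)1 << (i & 63)) used here and throughout the pruning pass below is a bit lookup across an array of 64-bit words: i >> 6 selects the word and i & 63 the bit within it. A standalone sketch (editorial):

#include <stdint.h>
#include <stdio.h>

static int bit_is_set(const uint64_t* const bitmask, const int i)
{
  return !!(bitmask[i >> 6] & ((uint64_t)1 << (i & 63)));
}

int main(void)
{
  uint64_t bitmask[2] = { 0, 0 };
  bitmask[70 >> 6] |= ((uint64_t)1 << (70 & 63)); // set bit 70 (word 1, bit 6)
  printf("%d %d\n", bit_is_set(bitmask, 70), bit_is_set(bitmask, 69));
  // Prints: 1 0
  return 0;
}
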
820
821
// Check whether for a given f_symbol, we can compute wrt_symbols at all, if we can, tag the minimal io and ops (some ops can be replaced with noop) required to do so.
822
static int _ccv_nnc_symbolic_graph_backward_prep_prune_ops(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
823
6.78k
{
824
6.78k
  int i, j, p;
825
6.78k
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
826
6.78k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
827
6.78k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info =backward_prep->tensor_symbol_info;
828
6.78k
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
829
  // Now, for each one of these, find a reverse graph.
830
6.78k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
831
6.78k
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
832
  // Find the f_symbols, and tag its flows.
833
19.1k
  ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
834
19.1k
    int f = node->f_wrt & 0x1;
835
26.2k
    for (i = 0; i < exec_symbol_info[idx].output_size && 
!f19.4k
;
i++7.04k
)
836
7.04k
    {
837
7.04k
      int d = exec_symbol_info[idx].outputs[i];
838
7.04k
      if (d < 0)
839
206
        continue;
840
6.84k
      
while (6.83k
tensor_symbol_info[d].alias_ref)
841
3
        d = tensor_symbol_info[d].alias_ref - 1;
842
13.6k
      for (j = 0; j < f_symbol_size && 
!f6.87k
;
j++6.85k
)
843
6.85k
        if (d == f_symbols[j].d)
844
6.79k
          f = 1;
845
6.83k
    }
846
19.1k
    if (f)
847
19.1k
    {
848
19.1k
      node->f_wrt |= f;
849
19.1k
      if (node->outgoings)
850
24.6k
        
for (i = 0; 12.3k
i < node->outgoings->rnum;
i++12.3k
)
851
12.3k
        {
852
12.3k
          int d = *(int*)ccv_array_get(node->outgoings, i);
853
12.3k
          backward_info[d].f_wrt |= f;
854
12.3k
        }
855
19.1k
    }
856
19.1k
  } ccv_nnc_graph_visit_endfor
857
  // Find the wrt_symbols, and tag its flows.
858
19.1k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
859
19.1k
    int wrt = backward_info[idx].f_wrt & 0x2;
860
30.1k
    for (i = 0; i < node->input_size && 
!wrt27.5k
;
i++10.9k
)
861
10.9k
    {
862
10.9k
      int d = node->inputs[i];
863
10.9k
      if (d < 0)
864
1
        continue;
865
10.9k
      
while (10.9k
tensor_symbol_info[d].alias_ref)
866
7
        d = tensor_symbol_info[d].alias_ref - 1;
867
24.6k
      for (j = 0; j < wrt_symbol_size && 
!wrt13.8k
;
j++13.7k
)
868
13.7k
      {
869
13.7k
        int wrt_d = wrt_symbols[j].d;
870
13.7k
        if (wrt_d < 0)
871
29
          continue;
872
        // Find the root of this tensor alias.
873
13.7k
        if (tensor_symbol_info[wrt_d].alias_ref)
874
2
          wrt_d = tensor_symbol_info[wrt_d].alias_ref - 1;
875
13.7k
        if (d == wrt_d)
876
6.85k
          wrt = 0x2;
877
13.7k
      }
878
10.9k
    }
879
19.1k
    if (wrt)
880
19.1k
    {
881
19.1k
      backward_info[idx].f_wrt |= wrt;
882
19.1k
      if (node->outgoings)
883
24.7k
        
for (i = 0; 12.3k
i < node->outgoings->rnum;
i++12.3k
)
884
12.3k
        {
885
12.3k
          int d = *(int*)ccv_array_get(node->outgoings, i);
886
12.3k
          backward_info[d].f_wrt |= wrt;
887
12.3k
        }
888
19.1k
    }
889
19.1k
  } ccv_nnc_graph_visit_endfor
890
6.78k
  enum {
891
6.78k
    WRT_SYMBOL_USE = 1,
892
6.78k
    F_SYMBOL_USE = 2
893
6.78k
  };
894
6.78k
  uint8_t* used_grad = (uint8_t*)cccalloc(tensor_symbol_info_size, sizeof(uint8_t));
895
  // First, all f_symbols and wrt_symbols are used.
896
13.5k
  for (i = 0; i < f_symbol_size; 
i++6.79k
)
897
6.79k
    if (f_symbols[i].d >= 0)
898
6.79k
      used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? 
tensor_symbol_info[f_symbols[i].d].alias_ref - 10
: f_symbols[i].d] |= F_SYMBOL_USE;
899
16.3k
  for (i = 0; i < wrt_symbol_size; 
i++9.53k
)
900
9.53k
    if (wrt_symbols[i].d >= 0)
901
9.52k
      used_grad[tensor_symbol_info[wrt_symbols[i].d].alias_ref ? 
tensor_symbol_info[wrt_symbols[i].d].alias_ref - 11
:
wrt_symbols[i].d9.52k
] |= WRT_SYMBOL_USE;
902
  // Do optimistic assumption, and then compute used_grad
903
19.1k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
904
19.1k
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
905
    /* Only interested in the ones on the f / wrt flow */
906
19.1k
    if ((node->f_wrt & 0x3) == 0x3)
907
19.1k
    {
908
19.1k
      const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
909
19.1k
      ccv_nnc_cmd_t cmd = forw_exec->cmd;
910
19.1k
      if (cmd.cmd != CCV_NNC_NOOP)
911
19.1k
        cmd.cmd += 1; /* Backward command is the one after forward command. */
912
19.1k
      assert(ccv_nnc_cmd_is_backward(cmd) || cmd.cmd == CCV_NNC_NOOP);
913
92.9k
      
for (i = 0; 19.1k
i < forw_exec->output_size * 2 + forw_exec->input_size;
i++73.8k
)
914
73.8k
        if (!(i >= forw_exec->output_size && 
i < forw_exec->output_size + forw_exec->input_size54.3k
&&
915
73.8k
          
forw_exec->inputs[i - forw_exec->output_size] < 034.7k
) && // If the input is empty, no need.
916
73.8k
          
!(73.8k
i >= forw_exec->output_size + forw_exec->input_size73.8k
&&
i < forw_exec->output_size * 2 + forw_exec->input_size19.5k
&&
917
73.8k
          
forw_exec->outputs[i - forw_exec->output_size - forw_exec->input_size] < 019.5k
) && // If the output is empty, no need.
918
73.8k
          
!(73.6k
i < forw_exec->output_size73.6k
&&
forw_exec->outputs[i] < 019.5k
)) // If the output is empty for gradient, no need.
919
73.4k
          node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
920
53.8k
      for (i = 0; i < forw_exec->input_size; 
i++34.7k
)
921
34.7k
        if (!(forw_exec->inputs[i] < 0)) // If the inputs is empty, no need.
922
34.7k
          node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
923
19.1k
      int maybe_noop = 1;
924
23.1k
      for (i = 0; i < forw_exec->input_size; 
i++4.05k
)
925
        /* See if it is used as wrt, if not, no need to run this node at all. */
926
23.1k
        if (forw_exec->inputs[i] >= 0 && 
used_grad[23.1k
tensor_symbol_info[forw_exec->inputs[i]].alias_ref23.1k
?
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 12.09k
:
forw_exec->inputs[i]21.0k
] & WRT_SYMBOL_USE)
927
19.1k
        {
928
19.1k
          maybe_noop = 0;
929
19.1k
          break;
930
19.1k
        }
931
19.1k
      if (maybe_noop)
932
0
      {
933
0
        for (i = 0; i < node->input_bitmask_size; i++)
934
0
          node->input_bitmasks[i] = 0;
935
0
        for (i = 0; i < node->output_bitmask_size; i++)
936
0
          node->output_bitmasks[i] = 0;
937
0
        node->output_bitmask_size = 0;
938
19.1k
      } else if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) {
939
        // Clear out all potential outputs if we think it is not a wrt symbols.
940
6
        for (i = 0; i < forw_exec->input_size; 
i++4
)
941
4
          if ((node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
942
4
            !(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 10
: forw_exec->inputs[i]] & WRT_SYMBOL_USE))
943
1
            node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
944
        // But for now, assuming we need all input gradients.
945
        // Clear out all inputs / outputs from forward op.
946
8
        for (i = forw_exec->output_size; i < forw_exec->output_size * 2 + forw_exec->input_size; 
i++6
)
947
6
          node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
948
19.1k
      } else if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)) {
949
16.7k
        int flag; /* Only continue if it changed */
950
32.1k
        do {
951
32.1k
          flag = 0;
952
          /* Check if the output first */
953
93.3k
          for (i = 0; i < forw_exec->input_size; 
i++61.2k
)
954
            /* Only try to eliminate the one that is not used. */
955
61.2k
            if ((node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
956
61.2k
              
!(used_grad[52.6k
tensor_symbol_info[forw_exec->inputs[i]].alias_ref52.6k
?
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 13.25k
:
forw_exec->inputs[i]49.3k
] & WRT_SYMBOL_USE))
957
8.62k
            {
958
8.62k
              node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
959
              /* If it worked, mark it as flagged. */
960
8.62k
              if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
961
8.58k
                flag = 1;
962
46
              else /* Refit this with the bit back again. */
963
46
                node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
964
8.62k
            }
965
159k
          for (i = 0; i < forw_exec->output_size * 2 + forw_exec->input_size; 
i++127k
)
966
127k
            if ((node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
967
127k
              
(96.4k
i >= forw_exec->output_size96.4k
||
968
96.4k
               
!(used_grad[32.3k
tensor_symbol_info[forw_exec->outputs[i]].alias_ref32.3k
?
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 143
:
forw_exec->outputs[i]32.3k
] & F_SYMBOL_USE)))
969
82.0k
            { /* Try to eliminate one of the input. */
970
82.0k
              node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
971
              /* If it worked, mark it as flagged. */
972
82.0k
              if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
973
28.8k
                flag = 1;
974
53.2k
              else /* Refit this with the bit back again. */
975
53.2k
                node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
976
82.0k
            }
977
32.1k
        } while (flag);
978
16.7k
      }
979
38.6k
      for (i = 0; i < forw_exec->output_size; 
i++19.5k
)
980
19.5k
        if (node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
981
          /* Mark it as used as wrt. */
982
19.1k
          used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 121
:
forw_exec->outputs[i]19.0k
] |= WRT_SYMBOL_USE;
983
53.8k
      for (i = 0; i < forw_exec->input_size; 
i++34.7k
)
984
          /* Mark it as used as f. */
985
34.7k
        if (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
986
26.1k
          used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 12.12k
:
forw_exec->inputs[i]24.0k
] |= F_SYMBOL_USE;
987
19.1k
    }
988
19.1k
  } ccv_nnc_graph_visit_endfor
989
6.78k
  ccv_array_t* sub_f_symbols = 0;
990
6.78k
  ccv_array_t* sub_wrt_symbols = 0;
991
19.1k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
992
19.1k
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
993
19.1k
    const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
994
    /* Only interested in the ones on the f / wrt flow */
995
19.1k
    if ((node->f_wrt & 0x3) == 0x3 && 
forw_exec->graph_ref_size > 019.1k
)
996
2
    {
997
2
      uint64_t stack_input_bitmasks1[node->input_bitmask_size];
998
2
      uint64_t stack_input_bitmasks2[node->input_bitmask_size];
999
2
      uint64_t* const input_bitmasks = forw_exec->graph_ref_size > 1 ? 
stack_input_bitmasks11
:
node->input_bitmasks1
;
1000
      // We collect input masks into this location.
1001
2
      if (forw_exec->graph_ref_size > 1)
1002
1
        memset(stack_input_bitmasks2, 0, sizeof(uint64_t) * node->input_bitmask_size);
1003
6
      for (p = 0; p < forw_exec->graph_ref_size; 
p++4
)
1004
4
      {
1005
        // Reset the stack input bitmasks.
1006
4
        if (forw_exec->graph_ref_size > 1)
1007
3
          memcpy(stack_input_bitmasks1, node->input_bitmasks, sizeof(uint64_t) * node->input_bitmask_size);
1008
        // Now calling it recursively until we are sure no f_symbols can be removed.
1009
4
        const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[p] - 1;
1010
4
        ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref;
1011
4
        if (!sub_wrt_symbols)
1012
2
          sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1013
2
        else
1014
2
          ccv_array_clear(sub_wrt_symbols);
1015
12
        for (i = 0; i < forw_exec->input_size; 
i++8
)
1016
8
          if (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
1017
7
          {
1018
7
            const ccv_array_t* const s_refs = tensor_symbol_info[forw_exec->inputs[i]].s_ref;
1019
7
            const int s_ref = s_refs && s_refs->rnum > graph_ref ? 
*(int*)5
ccv_array_get5
(s_refs, graph_ref) - 1 :
-12
;
1020
7
            if (s_ref >= 0)
1021
3
            {
1022
3
              ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
1023
3
                .d = s_ref,
1024
3
                .graph = sub_prep->graph,
1025
3
              };
1026
3
              ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
1027
3
            }
1028
7
          }
1029
4
        int flag; /* Only continue if it changed */
1030
4
        do {
1031
4
          flag = 0;
1032
8
          for (i = 0; i < forw_exec->output_size; 
i++4
)
1033
            // Try to reduce the number of inputs for the backward graph. If it is not tagged as F_SYMBOL_USE, we can reduce it.
1034
            // It is reducible because this sub graph may have multiple computation paths, therefore, some of these may not
1035
            // involve our wrt symbols at all.
1036
4
            if (!(used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 10
: forw_exec->outputs[i]] & F_SYMBOL_USE) &&
1037
4
              
input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))0
)
1038
0
            { /* Try to eliminate one of the input. */
1039
0
              input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
1040
0
              if (!sub_f_symbols)
1041
0
                sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1042
0
              else
1043
0
                ccv_array_clear(sub_f_symbols);
1044
0
              for (j = 0; j < forw_exec->output_size; j++)
1045
0
                if (node->input_bitmasks[j >> 6] & ((uint64_t)1 << (j & 63)))
1046
0
                {
1047
0
                  const int s_ref = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[j]].s_ref, graph_ref) - 1;
1048
0
                  assert(s_ref >= 0);
1049
0
                  ccv_nnc_tensor_symbol_t sub_f_symbol = {
1050
0
                    .d = s_ref,
1051
0
                    .graph = sub_prep->graph,
1052
0
                  };
1053
0
                  ccv_array_push(sub_f_symbols, &sub_f_symbol);
1054
0
                }
1055
0
              if (_ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph)))
1056
0
                flag = 1;
1057
0
              else /* Refit this with the bit back again. */
1058
0
                input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
1059
0
            }
1060
4
        } while (flag);
1061
        // I am done, need to redo above for sub_prep, and it has to be successful now.
1062
4
        if (!sub_f_symbols)
1063
2
          sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1064
2
        else
1065
2
          ccv_array_clear(sub_f_symbols);
1066
8
        for (i = 0; i < forw_exec->output_size; 
i++4
)
1067
4
          if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
1068
4
          {
1069
4
            const int s_ref = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[i]].s_ref, graph_ref) - 1;
1070
4
            assert(s_ref >= 0);
1071
4
            ccv_nnc_tensor_symbol_t sub_f_symbol = {
1072
4
              .d = s_ref,
1073
4
              .graph = sub_prep->graph,
1074
4
            };
1075
4
            ccv_array_push(sub_f_symbols, &sub_f_symbol);
1076
4
          }
1077
4
        _ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph));
1078
4
        if (forw_exec->graph_ref_size > 1)
1079
6
          
for (i = 0; 3
i < node->input_bitmask_size;
i++3
)
1080
3
            stack_input_bitmasks2[i] |= input_bitmasks[i];
1081
4
      }
1082
2
      if (forw_exec->graph_ref_size > 1)
1083
1
        memcpy(node->input_bitmasks, stack_input_bitmasks2, sizeof(uint64_t) * node->input_bitmask_size);
1084
2
    }
1085
19.1k
  } ccv_nnc_graph_visit_endfor
1086
6.78k
  if (sub_f_symbols)
1087
2
    ccv_array_free(sub_f_symbols);
1088
6.78k
  if (sub_wrt_symbols)
1089
2
    ccv_array_free(sub_wrt_symbols);
1090
6.78k
  int flag = 1;
1091
13.5k
  for (i = 0; i < f_symbol_size && 
flag6.79k
;
i++6.79k
)
1092
6.79k
    flag = (used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? 
tensor_symbol_info[f_symbols[i].d].alias_ref - 10
: f_symbols[i].d] & WRT_SYMBOL_USE);
1093
6.78k
  ccfree(used_grad);
1094
6.78k
  return flag;
1095
6.78k
}
1096
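The pruning pass above leans on two idioms worth isolating: dense uint64_t bitmasks addressed as word (i >> 6), bit (i & 63), and a greedy clear-and-revalidate loop that keeps a cleared bit only if the command's bitmask check still passes. Below is a minimal, self-contained C sketch of both; still_valid is a hypothetical stand-in for ccv_nnc_cmd_bitmask.

#include <stdint.h>
#include <stdio.h>

/* Bit i of a uint64_t bitmask array lives in word (i >> 6), position (i & 63). */
static inline int bit_test(const uint64_t* const mask, const int i)
{
  return !!(mask[i >> 6] & ((uint64_t)1 << (i & 63)));
}

static inline void bit_clear(uint64_t* const mask, const int i)
{
  mask[i >> 6] &= ~((uint64_t)1 << (i & 63));
}

static inline void bit_set(uint64_t* const mask, const int i)
{
  mask[i >> 6] |= ((uint64_t)1 << (i & 63));
}

/* Greedy elimination in the same shape as the loop above: tentatively clear a
 * bit, keep the change only if the validity predicate still accepts the mask,
 * and repeat until a full pass makes no progress. */
static void prune(uint64_t* const mask, const int n, int (*still_valid)(const uint64_t*))
{
  int i, flag;
  do {
    flag = 0;
    for (i = 0; i < n; i++)
      if (bit_test(mask, i))
      {
        bit_clear(mask, i);
        if (still_valid(mask))
          flag = 1; /* It worked, keep the bit cleared. */
        else
          bit_set(mask, i); /* Put the bit back again. */
      }
  } while (flag);
}

static int needs_bit0(const uint64_t* const mask)
{
  return bit_test(mask, 0);
}

int main(void)
{
  uint64_t mask[1] = { 0x1f }; /* bits 0..4 set */
  prune(mask, 5, needs_bit0);
  printf("%llx\n", (unsigned long long)mask[0]); /* prints 1: only bit 0 survives */
  return 0;
}

The do/while-on-flag structure matters: clearing one bit can make another bit removable, so the loop keeps sweeping until a whole pass changes nothing.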
1097
static void _ccv_nnc_symbolic_graph_backward_prep_gen(ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const int is_while, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
1098
6.78k
{
1099
6.78k
  const int exec_symbol_info_size = backward_prep->exec_symbol_info_size;
1100
6.78k
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
1101
6.78k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1102
6.78k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
1103
6.78k
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
1104
  // Now, for each one of these, find a reverse graph.
1105
6.78k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1106
6.78k
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
1107
6.78k
  int i, j;
1108
  // Now, only the flow from f_symbols back to wrt_symbols is of interest to us.
1109
  // Visit the graph in reverse order, build the AD nodes.
1110
6.78k
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = (ccv_nnc_autograd_graph_exec_symbol_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_autograd_graph_exec_symbol_t));
1111
6.78k
  int max_forw_input_size = 0, max_forw_output_size = 0;
1112
25.9k
  for (i = 0; i < exec_symbol_info_size; 
i++19.1k
)
1113
19.1k
    if ((backward_info[i].f_wrt & 0x3) == 0x3)
1114
19.1k
    {
1115
19.1k
      max_forw_input_size = ccv_max(max_forw_input_size, exec_symbol_info[i].input_size);
1116
19.1k
      max_forw_output_size = ccv_max(max_forw_output_size, exec_symbol_info[i].output_size);
1117
19.1k
      if (backward_info[i].outgoings)
1118
12.3k
      {
1119
        // Copy over the outgoing bits.
1120
12.3k
        autograd_execs[i].outgoings = ccv_array_new(sizeof(int), backward_info[i].outgoings->rnum, 0);
1121
24.6k
        for (j = 0; j < backward_info[i].outgoings->rnum; 
j++12.3k
)
1122
12.3k
        {
1123
12.3k
          const int d = *(int*)ccv_array_get(backward_info[i].outgoings, j);
1124
          // Only push the outgoing node if it is in the f_wrt path.
1125
12.3k
          if ((backward_info[d].f_wrt & 0x3) == 0x3)
1126
12.3k
            ccv_array_push(autograd_execs[i].outgoings, &d);
1127
12.3k
        }
1128
12.3k
      }
1129
19.1k
    }
1130
6.78k
  int max_forw_inputs[ccv_max(1, max_forw_input_size)];
1131
6.78k
  int max_forw_outputs[ccv_max(1, max_forw_output_size)];
1132
6.78k
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = (ccv_nnc_autograd_tensor_version_t*)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_autograd_tensor_version_t));
1133
6.78k
  ccv_array_t* autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), tensor_symbol_info_size, 0);
1134
6.78k
  ccv_array_t* sum_or_set_execs = ccv_array_new(sizeof(ccv_nnc_sum_or_set_graph_exec_symbol_t), 0, 0);
1135
19.1k
  ccv_nnc_graph_visit_for(backward_visit, backward_info, back_info_node, idx) {
1136
    /* This is required by both the f flow and the wrt flow, therefore of interest to us */
1137
19.1k
    if ((back_info_node->f_wrt & 0x3) == 0x3)
1138
19.1k
    {
1139
19.1k
      const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
1140
19.1k
      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + idx;
1141
19.1k
      back_exec->cmd = forw_exec->cmd;
1142
19.1k
      if (back_exec->cmd.cmd != CCV_NNC_NOOP)
1143
19.1k
        back_exec->cmd.cmd += 1; /* Backward command is the one after forward command. */
1144
19.1k
      assert(ccv_nnc_cmd_is_backward(back_exec->cmd) || back_exec->cmd.cmd == CCV_NNC_NOOP);
1145
19.1k
      if (!back_info_node->output_bitmask_size) /* This has no output, can be a noop. */
1146
0
        back_exec->cmd.cmd = CCV_NNC_NOOP;
1147
19.1k
      else {
1148
19.1k
        int* back_input_map = max_forw_outputs;
1149
19.1k
        int* back_output_map = max_forw_inputs;
1150
19.1k
        _ccv_nnc_symbolic_graph_backward_exec_io(forw_exec, &back_input_map, &back_output_map, &back_exec->input_size, &back_exec->output_size);
1151
19.1k
        back_exec->inputs = ccmalloc(sizeof(int) * (back_exec->input_size + back_exec->output_size));
1152
19.1k
        back_exec->outputs = back_exec->inputs + back_exec->input_size;
1153
        /* Need to compute input before we compute output */
1154
38.6k
        for (i = 0; i < back_exec->input_size; 
i++19.5k
)
1155
19.5k
        {
1156
          /* If we can skip this input, do that. */
1157
19.5k
          if (!(back_info_node->input_bitmasks[i >> 6] & ((uint64_t)1 << i)))
1158
424
            continue;
1159
19.1k
          const int d = back_input_map[i];
1160
19.1k
          const int alias_ref = tensor_symbol_info[d].alias_ref;
1161
19.1k
          ccv_nnc_autograd_tensor_version_t* tensor_ver = alias_ref ? 
autograd_tensor_versions + (alias_ref - 1)21
:
autograd_tensor_versions + d19.0k
;
1162
          /* Initialization tensor, should correspond to f symbols */
1163
19.1k
          if (!tensor_ver->ref_version)
1164
6.79k
          {
1165
6.79k
            ccv_nnc_autograd_tensor_symbol_t tensor_sym = {};
1166
6.79k
            if (!alias_ref)
1167
6.79k
            {
1168
6.79k
              tensor_sym.d = d;
1169
6.79k
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1170
6.79k
              const ccv_nnc_tensor_ref_t tensor_ref = {
1171
6.79k
                .d = autograd_tensor_symbols->rnum - 1,
1172
6.79k
                .x = idx,
1173
6.79k
                .alias_registry = 0
1174
6.79k
              };
1175
6.79k
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1176
6.79k
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1177
6.79k
            } else {
1178
2
              tensor_sym.d = alias_ref - 1;
1179
2
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1180
2
              const ccv_nnc_tensor_ref_t tensor_ref = {
1181
2
                .d = autograd_tensor_symbols->rnum - 1,
1182
2
                .x = idx,
1183
2
                .alias_registry = ccv_array_new(sizeof(int), 1, 0)
1184
2
              };
1185
2
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1186
2
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1187
2
              tensor_sym.d = d; /* set back */
1188
2
              tensor_sym.alias_ref = tensor_ref.d + 1;
1189
2
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1190
2
              const int ad = autograd_tensor_symbols->rnum - 1;
1191
2
              ccv_array_push(tensor_ref.alias_registry, &ad);
1192
2
            }
1193
6.79k
          }
1194
          /* The simplest case (most common), it is not an alias. */
1195
19.1k
          if (!alias_ref)
1196
19.0k
          {
1197
            /* Even simpler, this only has one reference tensor; thus, pass it as input. */
1198
19.0k
            if (tensor_ver->c == tensor_ver->ref_version->rnum - 1)
1199
14.8k
            {
1200
14.8k
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1201
              /* There are aliases associated with this tensor ref; zero it out when this tensor is allocated. */
1202
              /* This is required. Consider the case where we have an alias of this tensor used somewhere */
1203
              /* on the forward pass; when we compute backward, we have that alias computed first. However, its */
1204
              /* underlying tensor is not zero initialized, and we would end up with garbage values here. */
1205
14.8k
              if (tensor_ref->alias_registry &&
1206
                /* Loop over to see if this tensor is fully occupied to avoid extra zero step. */
1207
14.8k
                
!_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)2.08k
)
1208
1
              {
1209
1
                ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1210
1
                assert(tensor_sym->alias_ref == 0);
1211
1
                tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
1212
1
              }
1213
14.8k
              back_exec->inputs[i] = tensor_ref->d;
1214
14.8k
            } else {
1215
              /* Otherwise, we need to sum them up, and then pass the summed result to the computation. */
1216
4.24k
              _ccv_nnc_graph_sum_autograd_tensor_versions(idx, d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs);
1217
4.24k
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1218
4.24k
              back_exec->inputs[i] = tensor_ref->d;
1219
4.24k
            }
1220
19.0k
          } else
1221
            /* If this is an alias, go through all available tensor ref versions */
1222
21
            back_exec->inputs[i] = _ccv_nnc_graph_sum_autograd_tensor_versions_alias(idx, d, tensor_symbol_info, exec_symbol_info_size, tensor_symbol_info + d, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs);
1223
19.1k
        }
1224
53.8k
        
for (i = 0; 19.1k
i < back_exec->output_size;
i++34.7k
)
1225
34.7k
        {
1226
          /* If we can skip this output, do that. */
1227
34.7k
          if (!(back_info_node->output_bitmasks[i >> 6] & ((uint64_t)1 << i)))
1228
8.59k
            continue;
1229
26.1k
          const int d = back_output_map[i];
1230
26.1k
          const int alias_ref = tensor_symbol_info[d].alias_ref;
1231
26.1k
          ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1232
26.1k
            .d = d
1233
26.1k
          };
1234
          /* The simplest case (most common), it is not an alias. */
1235
26.1k
          if (!alias_ref)
1236
24.0k
          {
1237
24.0k
            ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1238
24.0k
            const ccv_nnc_tensor_ref_t tensor_ref = {
1239
24.0k
              .d = autograd_tensor_symbols->rnum - 1,
1240
24.0k
              .x = idx,
1241
24.0k
              .exec_registry = 0,
1242
24.0k
              .alias_registry = 0
1243
24.0k
            };
1244
24.0k
            ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d;
1245
24.0k
            if (!tensor_ver->ref_version)
1246
19.7k
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1247
24.0k
            ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1248
24.0k
            back_exec->outputs[i] = tensor_ref.d;
1249
24.0k
          } else {
1250
            /* Otherwise, in case this is an alias, we try to find an existing one (in tensor_ver)
1251
             * and see if it can meet the need (thus, for the tensor info / ofs, it fits). */
1252
2.12k
            ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + (alias_ref - 1);
1253
2.12k
            if (!tensor_ver->ref_version)
1254
2.09k
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1255
            /* If a ref version already exists, check if any of these not-yet-sealed tensors have free space. */
1256
2.12k
            int found = 0;
1257
2.17k
            for (j = tensor_ver->c; !found && 
j < tensor_ver->ref_version->rnum2.16k
;
j++50
)
1258
50
            {
1259
50
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, j);
1260
50
              if (!_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, tensor_symbol_info + d))
1261
9
              {
1262
9
                tensor_sym.alias_ref = tensor_ref->d + 1;
1263
9
                ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1264
9
                const int ad = autograd_tensor_symbols->rnum - 1;
1265
9
                ccv_array_push(tensor_ref->alias_registry, &ad);
1266
9
                if (!tensor_ref->exec_registry)
1267
7
                  tensor_ref->exec_registry = ccv_array_new(sizeof(int), 1, 0);
1268
9
                ccv_array_push(tensor_ref->exec_registry, &idx);
1269
9
                back_exec->outputs[i] = ad;
1270
9
                found = 1;
1271
9
              }
1272
50
            }
1273
2.12k
            if (!found) /* Cannot find a tensor ref to insert, create one first */
1274
2.11k
            {
1275
2.11k
              tensor_sym.d = alias_ref - 1; /* Reference back to the non-alias. */
1276
2.11k
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1277
2.11k
              const ccv_nnc_tensor_ref_t tensor_ref = {
1278
2.11k
                .d = autograd_tensor_symbols->rnum - 1,
1279
2.11k
                .x = idx,
1280
2.11k
                .exec_registry = 0,
1281
2.11k
                .alias_registry = ccv_array_new(sizeof(int), 1, 0)
1282
2.11k
              };
1283
2.11k
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1284
2.11k
              tensor_sym.d = d; /* set back */
1285
2.11k
              tensor_sym.alias_ref = tensor_ref.d + 1;
1286
2.11k
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1287
2.11k
              const int ad = autograd_tensor_symbols->rnum - 1;
1288
2.11k
              ccv_array_push(tensor_ref.alias_registry, &ad);
1289
2.11k
              back_exec->outputs[i] = ad;
1290
2.11k
            }
1291
2.12k
          }
1292
26.1k
        }
1293
19.1k
      }
1294
19.1k
    }
1295
19.1k
  } ccv_nnc_graph_visit_endfor
1296
  // Find all relevant wrt symbols, generate sum for them if needed.
1297
16.3k
  
for (i = 0; 6.78k
i < wrt_symbol_size;
i++9.53k
)
1298
9.53k
  {
1299
9.53k
    const int d = wrt_symbols[i].d;
1300
9.53k
    if (d < 0)
1301
9
      continue;
1302
9.52k
    const int ref_d = (!tensor_symbol_info[d].alias_ref) ? 
d9.52k
:
tensor_symbol_info[d].alias_ref - 11
;
1303
9.52k
    ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + ref_d;
1304
9.52k
    if (!tensor_ver->ref_version)
1305
1
    {
1306
      // This wrt symbol is not available at all; in this case, we set its flag to init zero.
1307
1
      const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1308
1
        .d = ref_d
1309
1
      };
1310
1
      ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1311
1
      ccv_nnc_sum_or_set_graph_exec_symbol_t set_exec = {
1312
1
        .value = 0,
1313
1
        .output = autograd_tensor_symbols->rnum - 1,
1314
1
      };
1315
1
      ccv_array_push(sum_or_set_execs, &set_exec);
1316
      // Insert the one to be set to zero.
1317
1
      const ccv_nnc_tensor_ref_t tensor_ref = {
1318
1
        .d = autograd_tensor_symbols->rnum - 1,
1319
1
        .x = exec_symbol_info_size + sum_or_set_execs->rnum - 1,
1320
1
      };
1321
1
      tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1322
1
      ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1323
1
      continue;
1324
1
    }
1325
    // If it is a while loop, we need to insert an accumulator into the graph (this is expressed as an initialization tensor summed with existing results).
1326
    // First, insert the initialization tensor if this wrt result is not used directly in the next while loop (if it is used directly, it participates in the computation, therefore there is no need to accumulate).
1327
9.52k
    if (is_while && 
!tensor_symbol_info[ref_d].assign_ref2
&&
1328
9.52k
      
_ccv_nnc_tensor_ref_version_find_init(tensor_ver) < 01
) // If the initialization tensor is not inserted yet.
1329
1
    {
1330
1
      const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1331
1
        .d = ref_d
1332
1
      };
1333
1
      ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1334
      // Insert the one to be summed.
1335
1
      const ccv_nnc_tensor_ref_t tensor_ref = {
1336
1
        .d = autograd_tensor_symbols->rnum - 1,
1337
1
        .x = -1, // This denotes it is an initialization tensor.
1338
1
      };
1339
1
      ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1340
1
    }
1341
    // If there is more than one tensor in the list, it is possible to sum them up.
1342
9.52k
    if (tensor_ver->c < tensor_ver->ref_version->rnum - 1)
1343
30
      _ccv_nnc_graph_sum_autograd_tensor_versions(-1, ref_d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs);
1344
    // The tensor version should have ref_version, and only one now (after summing up).
1345
9.52k
    assert(tensor_ver->c == tensor_ver->ref_version->rnum - 1);
1346
9.52k
  }
1347
  // Adding additional fields to backward_prep now.
1348
6.78k
  backward_prep->autograd_execs = autograd_execs;
1349
6.78k
  backward_prep->autograd_tensor_versions = autograd_tensor_versions;
1350
6.78k
  backward_prep->autograd_tensor_symbols = autograd_tensor_symbols;
1351
6.78k
  backward_prep->sum_or_set_execs = sum_or_set_execs;
1352
6.78k
  ccv_array_t* sub_f_symbols = 0;
1353
6.78k
  ccv_array_t* sub_wrt_symbols = 0;
1354
19.1k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
1355
19.1k
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
1356
19.1k
    const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
1357
    /* Only interested in the ones on the f / wrt flow */
1358
19.1k
    if ((node->f_wrt & 0x3) == 0x3)
1359
19.1k
    {
1360
19.1k
      const int is_while = (forw_exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE);
1361
19.1k
      for (i = 0; i < forw_exec->graph_ref_size; 
i++4
)
1362
4
      {
1363
        // Now calling it recursively until we are sure no f_symbols can be removed.
1364
4
        const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[i] - 1;
1365
4
        ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref;
1366
4
        if (!sub_wrt_symbols)
1367
2
          sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1368
4
        if (!sub_f_symbols)
1369
2
          sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1370
4
        _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, node->input_bitmasks, node->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1371
4
        _ccv_nnc_symbolic_graph_backward_prep_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, is_while, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph));
1372
4
      }
1373
19.1k
    }
1374
19.1k
  } ccv_nnc_graph_visit_endfor
1375
6.78k
  if (sub_f_symbols)
1376
2
    ccv_array_free(sub_f_symbols);
1377
6.78k
  if (sub_wrt_symbols)
1378
2
    ccv_array_free(sub_wrt_symbols);
1379
6.78k
}
1380
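The generation pass above tracks each forward tensor's gradient as a list of versions: every backward op that produces a contribution pushes a new ref_version entry, and a consumer forces _ccv_nnc_graph_sum_autograd_tensor_versions to collapse the outstanding versions into one, restoring the invariant c == rnum - 1 asserted above. Below is a toy sketch of that versioning discipline, with floats standing in for tensor symbols; the types and helpers are hypothetical.

#include <stdio.h>

#define MAX_VERSIONS 8

/* Toy stand-in for ccv_nnc_autograd_tensor_version_t: c is the start
 * non-accumulated version, versions[] stands in for the ref_version array. */
typedef struct {
  int c;
  int rnum;
  float versions[MAX_VERSIONS];
} toy_tensor_version_t;

/* Each backward op that contributes a gradient pushes a new version. */
static void push_version(toy_tensor_version_t* const ver, const float v)
{
  ver->versions[ver->rnum++] = v;
}

/* Mirrors the role of _ccv_nnc_graph_sum_autograd_tensor_versions: if more
 * than one version is outstanding, sum everything from c onward into a fresh
 * version and advance c, restoring the invariant c == rnum - 1. */
static float read_gradient(toy_tensor_version_t* const ver)
{
  if (ver->c < ver->rnum - 1)
  {
    float sum = 0;
    int i;
    for (i = ver->c; i < ver->rnum; i++)
      sum += ver->versions[i];
    ver->c = ver->rnum; /* Consume the summed-over versions. */
    push_version(ver, sum);
  }
  return ver->versions[ver->rnum - 1];
}

int main(void)
{
  toy_tensor_version_t ver = { 0 };
  push_version(&ver, 1); /* contribution from one backward path */
  push_version(&ver, 2); /* contribution from another */
  printf("%g\n", read_gradient(&ver)); /* prints 3, and ver.c == ver.rnum - 1 */
  return 0;
}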
1381
static void _ccv_nnc_symbolic_graph_backward_prep_free(const ccv_nnc_symbolic_graph_backward_prep_t backward_prep)
1382
6.78k
{
1383
6.78k
  int i, j;
1384
6.78k
  const int exec_symbol_info_size = backward_prep.exec_symbol_info_size;
1385
6.78k
  const int tensor_symbol_info_size = backward_prep.tensor_symbol_info_size;
1386
6.78k
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep.autograd_execs;
1387
6.78k
  if (autograd_execs)
1388
6.78k
  {
1389
25.9k
    for (i = 0; i < exec_symbol_info_size; 
i++19.1k
)
1390
19.1k
    {
1391
19.1k
      if (autograd_execs[i].inputs)
1392
19.1k
        ccfree(autograd_execs[i].inputs);
1393
19.1k
      if (autograd_execs[i].outgoings)
1394
12.3k
        ccv_array_free(autograd_execs[i].outgoings);
1395
19.1k
    }
1396
6.78k
    ccfree(autograd_execs);
1397
6.78k
  }
1398
6.78k
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep.autograd_tensor_versions;
1399
6.78k
  if (autograd_tensor_versions)
1400
6.78k
  {
1401
46.4k
    for (i = 0; i < tensor_symbol_info_size; 
i++39.6k
)
1402
39.6k
    {
1403
39.6k
      if (autograd_tensor_versions[i].ref_version)
1404
28.6k
      {
1405
65.9k
        for (j = 0; j < autograd_tensor_versions[i].ref_version->rnum; 
j++37.2k
)
1406
37.2k
        {
1407
37.2k
          ccv_nnc_tensor_ref_t* ref_version = (ccv_nnc_tensor_ref_t*)ccv_array_get(autograd_tensor_versions[i].ref_version, j);
1408
37.2k
          if (ref_version->exec_registry)
1409
7
            ccv_array_free(ref_version->exec_registry);
1410
37.2k
          if (ref_version->alias_registry)
1411
2.12k
            ccv_array_free(ref_version->alias_registry);
1412
37.2k
        }
1413
28.6k
        ccv_array_free(autograd_tensor_versions[i].ref_version);
1414
28.6k
      }
1415
39.6k
    }
1416
6.78k
    ccfree(autograd_tensor_versions);
1417
6.78k
  }
1418
6.78k
  if (backward_prep.autograd_tensor_symbols)
1419
6.78k
    ccv_array_free(backward_prep.autograd_tensor_symbols);
1420
6.78k
  ccv_array_t* const sum_or_set_execs = backward_prep.sum_or_set_execs;
1421
6.78k
  if (sum_or_set_execs)
1422
6.78k
  {
1423
11.0k
    for (i = 0; i < sum_or_set_execs->rnum; 
i++4.28k
)
1424
4.28k
    {
1425
4.28k
      ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i);
1426
4.28k
      if (sum_or_set->inputs)
1427
4.28k
        ccfree(sum_or_set->inputs);
1428
4.28k
      if (sum_or_set->outgoings)
1429
4.25k
        ccv_array_free(sum_or_set->outgoings);
1430
4.28k
    }
1431
6.78k
    ccv_array_free(sum_or_set_execs);
1432
6.78k
  }
1433
  // From here onwards, these are mandatory.
1434
6.78k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep.backward_info;
1435
25.9k
  for (i = 0; i < exec_symbol_info_size; 
i++19.1k
)
1436
19.1k
  {
1437
19.1k
    if (backward_info[i].outgoings)
1438
12.3k
      ccv_array_free(backward_info[i].outgoings);
1439
19.1k
    if (backward_info[i].input_bitmasks)
1440
19.1k
      ccfree(backward_info[i].input_bitmasks);
1441
19.1k
  }
1442
6.78k
  ccfree(backward_info);
1443
6.78k
  ccv_nnc_graph_visit_free(backward_prep.backward_visit);
1444
6.78k
  ccv_nnc_graph_visit_free(backward_prep.forward_visit);
1445
6.78k
  ccfree(backward_prep.exec_symbol_info);
1446
6.78k
  ccfree(backward_prep.tensor_symbol_info);
1447
6.78k
  for (i = 0; i < backward_prep.sub_prep_size; 
i++4
)
1448
4
    _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep.sub_preps[i]);
1449
6.78k
  if (backward_prep.sub_preps)
1450
2
    ccfree(backward_prep.sub_preps);
1451
6.78k
}
1452
1453
static void _ccv_nnc_add_backward_breakpoint_for_symbol(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_graph_exec_symbol_t breakpoint, ccv_nnc_symbolic_graph_t* const graph, ccv_array_t* const sub_breakpoints)
1454
1
{
1455
1
  const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
1456
1
  ccv_array_push(sub_breakpoints, &noop);
1457
  // Now need to hook this up to the graph.
1458
1
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1459
1
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
1460
  // Now, for each one of these, find a reverse graph.
1461
1
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1462
1
  int i;
1463
  // Clean up the high bit.
1464
4
  for (i = 0; i < backward_prep->exec_symbol_info_size; 
i++3
)
1465
3
    backward_info[i].f_wrt &= ~0x4;
1466
1
  assert((backward_info[breakpoint.d].f_wrt & 0x3) != 0x3);
1467
1
  backward_info[breakpoint.d].f_wrt |= 0x4;
1468
1
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
1469
1
  const ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs;
1470
  // Going forward to find whether this breakpoint is a source node for some f_wrt nodes.
1471
3
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, forw_exec, idx) {
1472
3
    ccv_nnc_graph_backward_info_t* const node = backward_info + idx;
1473
    // If it is tagged on the breakpoint flow, but not as both f and wrt, flow through it.
1474
3
    if ((node->f_wrt & 0x4) && 
(node->f_wrt & 0x3) != 0x31
)
1475
1
      for (i = 0; forw_exec->outgoings && 
i < forw_exec->outgoings->rnum0
;
i++0
)
1476
0
      {
1477
0
        const int outgoing_idx = *(int*)ccv_array_get(forw_exec->outgoings, i);
1478
0
        ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx;
1479
        // If this is a f_wrt node. Concatenate.
1480
0
        if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3)
1481
0
            ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[outgoing_idx].symbol, noop);
1482
0
        outgoing_node->f_wrt |= 0x4;
1483
0
      }
1484
3
  } ccv_nnc_graph_visit_endfor
1485
  // Going backward to find whether this breakpoint is a destination node for some f_wrt nodes.
1486
3
  ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
1487
3
    if ((node->f_wrt & 0x4) && 
(node->f_wrt & 0x3) != 0x32
)
1488
2
      
for (i = 0; 1
node->outgoings && i < node->outgoings->rnum;
i++1
)
1489
1
      {
1490
1
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
1491
1
        ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx;
1492
        // If this is a f_wrt node. Concatenate.
1493
1
        if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3)
1494
1
            ccv_nnc_graph_exec_symbol_concat(graph, noop, autograd_execs[outgoing_idx].symbol);
1495
1
        outgoing_node->f_wrt |= 0x4;
1496
1
      }
1497
3
  } ccv_nnc_graph_visit_endfor
1498
1
}
1499
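A minimal sketch of the breakpoint anchor pattern above, assuming an in-tree build for the include paths (as in this file) and a hypothetical back_exec symbol to order against: the NOOP exec symbol acts as the backward-side breakpoint, and ccv_nnc_graph_exec_symbol_concat adds the ordering edge. The pass above also does the mirrored concat (noop first) when the breakpoint acts as a source rather than a destination.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

/* back_exec is a hypothetical backward exec symbol that must run before the
 * breakpoint anchor. */
ccv_nnc_graph_exec_symbol_t add_breakpoint_noop(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t back_exec)
{
  const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
  /* Order the backward exec before the noop (breakpoint as destination). */
  ccv_nnc_graph_exec_symbol_concat(graph, back_exec, noop);
  return noop;
}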
1500
static ccv_nnc_autograd_tensor_symbol_t* _ccv_nnc_autograd_tensor_symbol_from_tensor_version(ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_autograd_tensor_version_t* const tensor_ver)
1501
7
{
1502
7
  assert(tensor_ver->ref_version);
1503
7
  const ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1504
7
  return (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1505
7
}
1506
1507
static void _ccv_nnc_symbolic_graph_set_backward_carry_overs(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph)
1508
1
{
1509
1
  int i;
1510
5
  for (i = 0; i < backward_prep->graph->tensor_symbol_info->rnum; 
i++4
)
1511
4
  {
1512
4
    const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info + i;
1513
4
    if (tensor_symbol_info->assign_ref)
1514
1
    {
1515
1
      const int assign_ref = tensor_symbol_info->assign_ref - 1;
1516
1
      ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + assign_ref);
1517
1
      ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + i);
1518
1
      ccv_nnc_symbolic_graph_set_carry_overs(graph, (ccv_nnc_tensor_symbol_map_t []){
1519
1
        { .source = source_autograd_symbol->symbol, .destination = destination_autograd_symbol->symbol }
1520
1
      }, 1);
1521
1
    }
1522
4
  }
1523
3
  for (i = 0; i < wrt_symbol_size; 
i++2
)
1524
2
  {
1525
2
    const int d = wrt_symbols[i].d;
1526
2
    if (d < 0)
1527
0
      continue;
1528
2
    const int ref_d = (!backward_prep->tensor_symbol_info[d].alias_ref) ? d : 
backward_prep->tensor_symbol_info[d].alias_ref - 10
;
1529
2
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = backward_prep->autograd_tensor_versions + ref_d;
1530
2
    const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver);
1531
2
    if (init_ref_ver >= 0)
1532
1
    {
1533
1
      const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d;
1534
1
      ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(backward_prep->autograd_tensor_symbols, init_d);
1535
1
      ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + ref_d);
1536
1
      ccv_nnc_symbolic_graph_set_carry_overs(graph, (ccv_nnc_tensor_symbol_map_t []){
1537
1
        { .source = source_autograd_symbol->symbol, .destination = destination_autograd_symbol->symbol }
1538
1
      }, 1);
1539
1
    }
1540
2
  }
1541
1
}
1542
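The carry-over wiring above reduces to one ccv_nnc_symbolic_graph_set_carry_overs call per tensor pair, in the same form the source uses. A minimal sketch, assuming grad_now and grad_prev are hypothetical symbols already created inside the while-loop sub-graph (include path assumes an in-tree build, as in this file):

#include "ccv_nnc.h"

/* At the end of each loop iteration the value of grad_now is carried into
 * grad_prev for the next iteration, which is how a gradient accumulates
 * across iterations of the while loop. */
void set_gradient_carry_over(ccv_nnc_symbolic_graph_t* const while_graph, const ccv_nnc_tensor_symbol_t grad_now, const ccv_nnc_tensor_symbol_t grad_prev)
{
  ccv_nnc_symbolic_graph_set_carry_overs(while_graph, (ccv_nnc_tensor_symbol_map_t []){
    { .source = grad_now, .destination = grad_prev }
  }, 1);
}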
1543
static void _ccv_nnc_symbolic_graph_add_init_zeros(const ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, ccv_array_t* const symbols)
1544
1
{
1545
1
  int i;
1546
3
  for (i = 0; i < wrt_symbol_size; 
i++2
)
1547
2
  {
1548
2
    const int d = wrt_symbols[i].d;
1549
2
    if (d < 0)
1550
0
      continue;
1551
2
    const int ref_d = (!sub_prep->tensor_symbol_info[d].alias_ref) ? d : 
sub_prep->tensor_symbol_info[d].alias_ref - 10
;
1552
2
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = sub_prep->autograd_tensor_versions + ref_d;
1553
2
    const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver);
1554
2
    if (init_ref_ver >= 0)
1555
1
    {
1556
      // Need de-dup logic.
1557
1
      const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d;
1558
1
      ccv_nnc_autograd_tensor_symbol_t* const init_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(sub_prep->autograd_tensor_symbols, init_d);
1559
1
      const ccv_nnc_tensor_symbol_info_t* const sub_init_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, init_autograd_symbol->symbol.d);
1560
      // If it doesn't have a parent ref yet, create one.
1561
1
      if (!sub_init_symbol_info->p_ref)
1562
1
      {
1563
1
        ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, sub_prep->tensor_symbol_info[ref_d].info, 0);
1564
1
        ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
1565
1
        ccv_array_push(symbols, &new_symbol);
1566
1
        ccv_nnc_tensor_symbol_hookup(graph, sub_graph, new_symbol, init_autograd_symbol->symbol);
1567
1
      }
1568
1
    }
1569
2
  }
1570
1
}
1571
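The zero-initialization above is a three-step pattern: create a parent-graph symbol, flag it CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS so allocation zeroes it, and hook it up to the sub-graph accumulator. A minimal sketch, assuming info and sub_symbol are supplied by the caller (include path assumes an in-tree build):

#include "ccv_nnc.h"

/* info is the tensor parameters of the gradient; sub_symbol is the
 * accumulator inside the sub-graph that needs a zeroed backing symbol in
 * the parent graph. */
ccv_nnc_tensor_symbol_t hook_up_zero_init(ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, const ccv_nnc_tensor_param_t info, const ccv_nnc_tensor_symbol_t sub_symbol)
{
  const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0);
  ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
  ccv_nnc_tensor_symbol_hookup(graph, sub_graph, new_symbol, sub_symbol);
  return new_symbol;
}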
1572
static void _ccv_nnc_symbolic_graph_add_tape_vars(const ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep, ccv_nnc_symbolic_graph_t* const root, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, ccv_array_t* const symbols)
1573
4
{
1574
4
  int i;
1575
24
  for (i = 0; i < sub_graph->tensor_symbol_info->rnum; 
i++20
)
1576
20
  {
1577
20
    const ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i);
1578
20
    if ((symbol_info->flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && 
symbol_info->pair_ref7
)
1579
7
    {
1580
7
      const int pair_ref = symbol_info->pair_ref - 1;
1581
7
      const ccv_nnc_tensor_symbol_t root_symbol = ccv_nnc_tensor_symbol_resolve(root, (ccv_nnc_tensor_symbol_t){
1582
7
        .d = pair_ref,
1583
7
        .graph = sub_prep->graph,
1584
7
      });
1585
7
      if (root_symbol.d >= 0)
1586
3
      {
1587
3
        ccv_nnc_tensor_symbol_hookup(root, sub_graph, root_symbol, (ccv_nnc_tensor_symbol_t){
1588
3
          .d = i,
1589
3
          .graph = sub_graph,
1590
3
        });
1591
3
        if (symbols)
1592
2
        {
1593
2
          const ccv_nnc_tensor_symbol_t p_symbol = ccv_nnc_tensor_symbol_resolve(graph, (ccv_nnc_tensor_symbol_t){
1594
2
            .d = i,
1595
2
            .graph = sub_graph,
1596
2
          });
1597
2
          ccv_array_push(symbols, &p_symbol);
1598
2
        }
1599
3
      }
1600
7
    }
1601
20
  }
1602
4
}
1603
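The tape-var pairing above (and again in _ccv_nnc_symbolic_graph_backward_gen below) follows one pattern: create a twin symbol in the backward graph, pair it with the forward symbol, and tag both CCV_NNC_TENSOR_SYMBOL_TAPE_VAR so the forward value is recorded on tape for the backward pass. A minimal sketch, assuming forw_symbol lives in forward_graph and the caller supplies its tensor params (include path assumes an in-tree build):

#include "ccv_nnc.h"

/* graph is the separate backward graph; forward_graph owns forw_symbol. */
ccv_nnc_tensor_symbol_t pair_tape_var(ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const forward_graph, const ccv_nnc_tensor_symbol_t forw_symbol, const ccv_nnc_tensor_param_t info)
{
  const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0);
  ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, forw_symbol);
  const int flags = ccv_nnc_tensor_symbol_flags(forward_graph, forw_symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR;
  ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
  ccv_nnc_tensor_symbol_set_flags(forward_graph, forw_symbol, flags);
  return new_symbol;
}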
1604
static void _ccv_nnc_symbolic_graph_backward_gen(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const root)
1605
6.78k
{
1606
6.78k
  assert(graph == backward_prep->graph || graph->pair == backward_prep->graph);
1607
6.78k
  const int exec_symbol_info_size = backward_prep->exec_symbol_info_size;
1608
6.78k
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
1609
6.78k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1610
6.78k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
1611
6.78k
  int i, j, k, p;
1612
6.78k
  ccv_array_t* const autograd_tensor_symbols = backward_prep->autograd_tensor_symbols;
1613
  // Generate required symbols based on the information gathered above.
1614
46.1k
  for (i = 0; i < autograd_tensor_symbols->rnum; 
i++39.4k
)
1615
39.4k
  {
1616
39.4k
    ccv_nnc_autograd_tensor_symbol_t* symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, i);
1617
39.4k
    assert(symbol->d >= 0);
1618
39.4k
    assert(symbol->d < tensor_symbol_info_size);
1619
39.4k
    const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + symbol->d;
1620
39.4k
    if (!symbol->alias_ref)
1621
37.2k
    {
1622
37.2k
      assert(!forw_symbol->alias_ref);
1623
37.2k
      symbol->symbol = ccv_nnc_tensor_symbol_new(graph, forw_symbol->info, 0);
1624
37.2k
      ccv_nnc_tensor_symbol_set_flags(graph, symbol->symbol, symbol->flags);
1625
37.2k
    } else {
1626
2.14k
      assert(forw_symbol->alias_ref);
1627
2.14k
      assert(symbol->flags == 0); // We don't set flags on alias.
1628
      // Due to our generation order, this must be after the original symbol is created.
1629
2.14k
      ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, symbol->alias_ref - 1);
1630
2.14k
      symbol->symbol = ccv_nnc_tensor_symbol_alias_new(graph, ref->symbol, forw_symbol->ofs, forw_symbol->stride, forw_symbol->info, 0);
1631
2.14k
    }
1632
39.4k
  }
1633
6.78k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1634
6.78k
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs;
1635
6.78k
  ccv_array_t* symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1636
6.78k
  ccv_array_t* symbol_map = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_map_t), 0, 0);
1637
6.78k
  ccv_array_t* sub_f_symbols = 0;
1638
6.78k
  ccv_array_t* sub_wrt_symbols = 0;
1639
6.78k
  ccv_array_t* sub_execs = 0;
1640
25.9k
  for (i = 0; i < exec_symbol_info_size; 
i++19.1k
)
1641
19.1k
  {
1642
    // This is not going to be an interesting node. Skip.
1643
19.1k
    if ((backward_info[i].f_wrt & 0x3) != 0x3)
1644
86
      continue;
1645
19.1k
    ccv_nnc_graph_backward_info_t* const back_info = backward_info + i;
1646
19.1k
    ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
1647
19.1k
    if (back_exec->cmd.cmd == CCV_NNC_NOOP)
1648
1
    {
1649
1
      back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, 0, 0, 0, 0, 0);
1650
1
      continue;
1651
1
    }
1652
19.1k
    const ccv_nnc_graph_exec_symbol_info_t* const forw_exec = exec_symbol_info + i;
1653
19.1k
    if (forw_exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1654
1
    {
1655
1
      ccv_array_clear(symbols);
1656
1
      const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[0] - 1;
1657
1
      ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref;
1658
1
      ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new();
1659
1
      sub_graph->pair = sub_prep->graph;
1660
1
      if (!sub_wrt_symbols)
1661
1
        sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1662
      // I am done, need to redo above for sub_prep, and it has to be successful now.
1663
1
      if (!sub_f_symbols)
1664
1
        sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1665
1
      _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1666
1
      _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph, root);
1667
1
      back_exec->symbol = ccv_nnc_symbolic_graph_while(graph, back_exec->cmd.cmd, sub_graph, forw_exec->name);
1668
1
      if (!sub_execs)
1669
1
        sub_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0);
1670
1
      ccv_array_clear(sub_execs);
1671
      // Find the breakpoints in the forward graph, creating the reverse ones.
1672
2
      for (j = 0; j < sub_prep->graph->breakpoint_size; 
j++1
)
1673
1
      {
1674
1
        const int d = sub_prep->graph->breakpoints[j].d;
1675
1
        if (sub_prep->autograd_execs[d].symbol.graph)
1676
0
          ccv_array_push(sub_execs, &sub_prep->autograd_execs[d].symbol);
1677
1
        else
1678
1
          _ccv_nnc_add_backward_breakpoint_for_symbol(sub_prep, sub_prep->graph->breakpoints[j], sub_graph, sub_execs);
1679
1
      }
1680
1
      ccv_nnc_symbolic_graph_set_while_expr(sub_graph, NOOP_GRAPH_WHILE_EXPR, 0, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_execs, 0), sub_execs->rnum);
1681
1
      ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1682
1
      _ccv_nnc_symbolic_graph_set_backward_carry_overs(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph);
1683
2
      for (j = 0; j < back_exec->input_size; 
j++1
)
1684
1
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1685
1
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
1686
      // Find whether, among the wrt symbols, there is anything we need to init to zero; if there is, it needs to be an input here too.
1687
1
      _ccv_nnc_symbolic_graph_add_init_zeros(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, graph, sub_graph, symbols);
1688
1
      _ccv_nnc_symbolic_graph_add_tape_vars(sub_prep, root, graph, sub_graph, symbols);
1689
      // input_size at this point may be different from back_exec->input_size, because we may have added zeroing tensors as input tensors.
1690
1
      const int input_size = symbols->rnum;
1691
3
      for (j = 0; j < back_exec->output_size; 
j++2
)
1692
2
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1693
1
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
1694
1
      const int output_size = symbols->rnum - input_size;
1695
1
      const int p_idx = sub_prep->graph->p_idx - 1;
1696
1
      assert(back_exec->input_size == forw_exec->output_size);
1697
1
      k = 0;
1698
2
      for (j = 0; j < back_exec->input_size; 
j++1
)
1699
1
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1700
1
        {
1701
1
          const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->outputs[j];
1702
1
          const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1;
1703
1
          assert(s_idx >= 0);
1704
1
          const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx);
1705
1
          ccv_nnc_tensor_symbol_hookup(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol);
1706
1
          ++k;
1707
1
        }
1708
1
      k = input_size; // Reset k; the symbols in between were already set up by add_init_zeros.
1709
1
      assert(back_exec->output_size == forw_exec->input_size);
1710
3
      
for (j = 0; 1
j < back_exec->output_size;
j++2
)
1711
2
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1712
1
        {
1713
1
          const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->inputs[j];
1714
1
          const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1;
1715
1
          assert(s_idx >= 0);
1716
1
          const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx);
1717
1
          ccv_nnc_tensor_symbol_hookup(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol);
1718
1
          ++k;
1719
1
        }
1720
1
      ccv_nnc_graph_exec_symbol_set_io(graph, back_exec->symbol, ccv_array_get(symbols, 0), input_size, ccv_array_get(symbols, input_size), output_size);
1721
19.1k
    } else if (forw_exec->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
1722
1
      ccv_array_clear(symbol_map);
1723
2
      for (j = 0; j < back_exec->output_size; 
j++1
)
1724
1
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1725
1
        {
1726
1
          ccv_nnc_tensor_symbol_map_t symbol = {
1727
1
            .source = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol,
1728
1
            .destination = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol,
1729
1
          };
1730
1
          ccv_array_push(symbol_map, &symbol);
1731
1
        }
1732
1
      const int symbol_map_size = symbol_map->rnum;
1733
1
      back_exec->symbol = ccv_nnc_symbolic_graph_case_of_new(graph, back_exec->cmd.cmd, 0, 0, ccv_array_get(symbol_map, 0), symbol_map_size, forw_exec->name);
1734
1
      ccv_nnc_symbolic_graph_set_case_of_expr(graph, back_exec->symbol, NOOP_GRAPH_CASE_OF_EXPR, 0);
1735
4
      for (p = 0; p < forw_exec->graph_ref_size; 
p++3
)
1736
3
      {
1737
3
        const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[p] - 1;
1738
3
        ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref;
1739
3
        ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new();
1740
3
        sub_graph->pair = sub_prep->graph;
1741
3
        if (!sub_wrt_symbols)
1742
1
          sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1743
        // I am done, need to redo above for sub_prep, and it has to be successful now.
1744
3
        if (!sub_f_symbols)
1745
1
          sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1746
3
        _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1747
3
        _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph, root);
1748
3
        ccv_array_clear(symbol_map);
1749
3
        k = 0;
1750
6
        for (j = 0; j < back_exec->output_size; 
j++3
)
1751
3
          if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1752
3
          {
1753
3
            const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, k))->d;
1754
3
            if (d >= 0)
1755
1
            {
1756
1
              const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + d);
1757
1
              ccv_nnc_tensor_symbol_map_t symbol = {
1758
1
                .source = autograd_symbol->symbol,
1759
1
                .destination = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol,
1760
1
              };
1761
1
              ccv_array_push(symbol_map, &symbol);
1762
2
            } else {
1763
              // Create a new tensor in sub-graph and set it to be 0.
1764
2
              const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]);
1765
              // autograd_symbol->d points to the corresponding forward tensor.
1766
2
              ccv_nnc_tensor_symbol_t zero_symbol = ccv_nnc_tensor_symbol_new(sub_graph, tensor_symbol_info[autograd_symbol->d].info, 0);
1767
2
              ccv_nnc_graph_exec_symbol_new(sub_graph, CMD_SET_FORWARD(0), 0, 0, &zero_symbol, 1, 0);
1768
2
              ccv_nnc_tensor_symbol_map_t symbol = {
1769
2
                .source = zero_symbol,
1770
2
                .destination = autograd_symbol->symbol,
1771
2
              };
1772
2
              ccv_array_push(symbol_map, &symbol);
1773
2
            }
1774
3
            ++k;
1775
3
          }
1776
3
        ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1777
3
        const int symbol_map_size = symbol_map->rnum;
1778
3
        ccv_nnc_symbolic_graph_set_case_of(graph, back_exec->symbol, sub_graph, p, ccv_array_get(symbol_map, 0), symbol_map_size);
1779
        // Hookup input only after this becomes a sub graph of the graph.
1780
3
        k = 0;
1781
6
        for (j = 0; j < back_exec->input_size; 
j++3
)
1782
3
          if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1783
3
          {
1784
3
            const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, k))->d;
1785
3
            assert(d >= 0);
1786
            // No corresponding sub tensors allocated. Skip.
1787
3
            if (!sub_prep->autograd_tensor_versions[d].ref_version ||
1788
3
              
!sub_prep->autograd_tensor_versions[d].ref_version->rnum1
)
1789
2
              continue;
1790
1
            const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + d);
1791
1
            ccv_nnc_tensor_symbol_hookup(graph, sub_graph, ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol, autograd_symbol->symbol);
1792
1
            ++k;
1793
1
          }
1794
        // Need to make sure tape vars are hooked up.
1795
3
        _ccv_nnc_symbolic_graph_add_tape_vars(sub_prep, root, graph, sub_graph, 0);
1796
3
      }
1797
19.1k
    } else {
1798
19.1k
      ccv_array_clear(symbols);
1799
      // Gradient inputs.
1800
38.6k
      for (j = 0; j < back_exec->input_size; 
j++19.5k
)
1801
19.5k
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1802
19.1k
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
1803
424
        else
1804
424
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1805
      // Inputs from forward function.
1806
53.8k
      for (j = 0; j < forw_exec->input_size; j++)
1807
34.7k
        if (!(back_info->input_bitmasks[(j + back_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size))))
1808
14.4k
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1809
20.2k
        else {
1810
20.2k
          const ccv_nnc_tensor_symbol_t symbol = {
1811
20.2k
            .d = forw_exec->inputs[j],
1812
20.2k
            .graph = backward_prep->graph
1813
20.2k
          };
1814
20.2k
          if (graph == backward_prep->graph)
1815
20.2k
            ccv_array_push(symbols, &symbol);
1816
5
          else { // Otherwise, create a new symbol, and set its pair to the old symbol.
1817
5
            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->inputs[j]].info, tensor_symbol_info[forw_exec->inputs[j]].name);
1818
5
            ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, symbol);
1819
5
            const int flags = ccv_nnc_tensor_symbol_flags(backward_prep->graph, symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR;
1820
5
            ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
1821
5
            ccv_nnc_tensor_symbol_set_flags(backward_prep->graph, symbol, flags);
1822
5
            ccv_array_push(symbols, &new_symbol);
1823
5
          }
1824
20.2k
        }
1825
      // Outputs from forward function.
1826
38.6k
      for (j = 0; j < forw_exec->output_size; j++)
1827
19.5k
        if (!(back_info->input_bitmasks[(j + back_exec->input_size + forw_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size + forw_exec->input_size))))
1828
14.3k
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1829
5.14k
        else {
1830
5.14k
          const ccv_nnc_tensor_symbol_t symbol = {
1831
5.14k
            .d = forw_exec->outputs[j],
1832
5.14k
            .graph = backward_prep->graph
1833
5.14k
          };
1834
5.14k
          if (graph == backward_prep->graph)
1835
5.14k
            ccv_array_push(symbols, &symbol);
1836
2
          else { // Otherwise, create a new symbol, and set its pair to the old symbol.
1837
2
            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->outputs[j]].info, tensor_symbol_info[forw_exec->outputs[j]].name);
1838
2
            ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, symbol);
1839
2
            const int flags = ccv_nnc_tensor_symbol_flags(backward_prep->graph, symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR;
1840
2
            ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
1841
2
            ccv_nnc_tensor_symbol_set_flags(backward_prep->graph, symbol, flags);
1842
2
            ccv_array_push(symbols, &new_symbol);
1843
2
          }
1844
5.14k
        }
1845
53.8k
      for (j = 0; j < back_exec->output_size; j++)
1846
34.7k
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1847
26.1k
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
1848
8.59k
        else
1849
8.59k
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1850
19.1k
      back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, ccv_array_get(symbols, 0), back_exec->input_size + forw_exec->input_size + forw_exec->output_size, ccv_array_get(symbols, back_exec->input_size + forw_exec->input_size + forw_exec->output_size), back_exec->output_size, 0);
1851
19.1k
      ccv_nnc_graph_exec_symbol_set_hint(graph, back_exec->symbol, exec_symbol_info[i].hint);
1852
19.1k
      ccv_nnc_graph_exec_symbol_pair_with(graph, back_exec->symbol, (ccv_nnc_graph_exec_symbol_t){
1853
19.1k
        .d = i,
1854
19.1k
        .graph = backward_prep->graph,
1855
19.1k
      });
1856
19.1k
    }
1857
19.1k
  }
1858
6.78k
  if (sub_f_symbols)
1859
2
    ccv_array_free(sub_f_symbols);
1860
6.78k
  if (sub_wrt_symbols)
1861
2
    ccv_array_free(sub_wrt_symbols);
1862
6.78k
  if (sub_execs)
1863
1
    ccv_array_free(sub_execs);
1864
6.78k
  ccv_array_t* const sum_or_set_execs = backward_prep->sum_or_set_execs;
1865
11.0k
  for (i = 0; i < sum_or_set_execs->rnum; i++)
1866
4.28k
  {
1867
4.28k
    ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set_exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i);
1868
    // If it has inputs, it is a sum; set nodes don't have inputs.
1869
4.28k
    if (sum_or_set_exec->input_size)
1870
4.28k
    {
1871
4.28k
      ccv_array_clear(symbols);
1872
      // This is a sum.
1873
12.8k
      for (j = 0; j < sum_or_set_exec->input_size; j++)
1874
8.59k
        ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->inputs[j]))->symbol));
1875
4.28k
      ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0);
1876
4.28k
      sum_or_set_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, ccv_array_get(symbols, 0), sum_or_set_exec->input_size, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->output))->symbol), 1, 0);
1877
4.28k
    } else
1878
1
      sum_or_set_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, CMD_SET_FORWARD(sum_or_set_exec->value), 0, 0, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->output))->symbol), 1, 0);
1879
4.28k
  }
1880
6.78k
  ccv_array_free(symbol_map);
1881
6.78k
  ccv_array_free(symbols);
1882
25.9k
  for (i = 0; i < exec_symbol_info_size; i++)
1883
19.1k
  {
1884
    // This is not going to be an interesting node. Skip.
1885
19.1k
    if ((backward_info[i].f_wrt & 0x3) != 0x3)
1886
86
      continue;
1887
19.1k
    ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
1888
    // If on the same graph, we cannot decide whether it is before or after the forw_exec, so enforce that it comes after forw_exec.
1889
19.1k
    if (graph == backward_prep->graph)
1890
19.1k
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
1891
19.1k
        .d = i,
1892
19.1k
        .graph = graph
1893
19.1k
      }, back_exec->symbol);
1894
19.1k
    if (back_exec->outgoings)
1895
24.7k
      for (j = 0; j < back_exec->outgoings->rnum; j++)
1896
12.4k
      {
1897
12.4k
        int d = *(int*)ccv_array_get(back_exec->outgoings, j);
1898
12.4k
        if (d < exec_symbol_info_size)
1899
8.08k
          ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, autograd_execs[d].symbol);
1900
4.36k
        else
1901
4.36k
          ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, d - exec_symbol_info_size))->symbol);
1902
12.4k
      }
1903
19.1k
  }
1904
11.0k
  for (i = 0; i < sum_or_set_execs->rnum; i++)
1905
4.28k
  {
1906
4.28k
    ccv_nnc_sum_or_set_graph_exec_symbol_t* exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i);
1907
4.28k
    if (exec->outgoings)
1908
8.50k
      for (j = 0; j < exec->outgoings->rnum; j++)
1909
4.25k
      {
1910
4.25k
        int d = *(int*)ccv_array_get(exec->outgoings, j);
1911
4.25k
        if (d < exec_symbol_info_size)
1912
4.25k
          ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, autograd_execs[d].symbol);
1913
0
        else
1914
0
          ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, d - exec_symbol_info_size))->symbol);
1915
4.25k
      }
1916
4.28k
  }
1917
  // Now everything is done; set the metadata on the graph so that we can look up backward symbols later.
1918
6.78k
  if (graph->backward.tensor_symbol_idx)
1919
4.40k
    graph->backward.tensor_symbol_idx = (int*)ccrealloc(graph->backward.tensor_symbol_idx, sizeof(int) * (graph->tensor_symbol_info->rnum + tensor_symbol_info_size));
1920
2.37k
  else
1921
2.37k
    graph->backward.tensor_symbol_idx = (int*)ccmalloc(sizeof(int) * (graph->tensor_symbol_info->rnum + tensor_symbol_info_size));
1922
6.78k
  graph->backward.tensor_symbol_size = tensor_symbol_info_size;
1923
6.78k
  graph->backward.exec_symbol_idx = graph->backward.tensor_symbol_idx + tensor_symbol_info_size;
1924
6.78k
  graph->backward.exec_symbol_size = graph->tensor_symbol_info->rnum;
1925
46.4k
  for (i = 0; i < tensor_symbol_info_size; i++)
1926
39.6k
    graph->backward.tensor_symbol_idx[i] = -1;
1927
85.8k
  for (i = 0; i < graph->backward.exec_symbol_size; i++)
1928
79.1k
    graph->backward.exec_symbol_idx[i] = -1;
1929
6.78k
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep->autograd_tensor_versions;
1930
  // Assigning for wrt symbols.
1931
16.3k
  for (i = 0; i < wrt_symbol_size; i++)
1932
9.53k
  {
1933
9.53k
    const int d = wrt_symbols[i].d;
1934
9.53k
    if (d < 0)
1935
9
      continue;
1936
9.53k
    assert(d < tensor_symbol_info_size);
1937
9.52k
    const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + d;
1938
9.52k
    ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + ((!forw_symbol->alias_ref) ? d : forw_symbol->alias_ref - 1);
1939
9.52k
    assert(tensor_ver->ref_version);
1940
9.52k
    ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1941
9.52k
    ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1942
    // If this wrt symbol is an alias, create an extra alias for it.
1943
9.52k
    if (!forw_symbol->alias_ref)
1944
9.52k
      graph->backward.tensor_symbol_idx[d] = autograd_symbol->symbol.d;
1945
1
    else // We create a new alias, and this cannot be referenced from exec_symbol_idx because its size is limited to the previous tensor symbol size.
1946
1
      graph->backward.tensor_symbol_idx[d] = ccv_nnc_tensor_symbol_alias_new(graph, autograd_symbol->symbol, forw_symbol->ofs, forw_symbol->stride, forw_symbol->info, 0).d;
1947
9.52k
    const int dd = autograd_symbol->symbol.d;
1948
9.52k
    const int x = tensor_ref->x;
1949
9.52k
    if (tensor_ref->exec_registry && tensor_ref->exec_registry->rnum) // Create no-op node.
1950
2
    {
1951
2
      ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
1952
2
      if (x < exec_symbol_info_size)
1953
2
        ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
1954
0
      else
1955
0
        ccv_nnc_graph_exec_symbol_concat(graph, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size))->symbol, noop);
1956
6
      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
1957
4
      {
1958
4
        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
1959
4
        assert(x >= 0); /* Otherwise, this is an initialization tensor, which cannot participate in a sum. */
1960
4
        assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry; it simply cannot reference a sum operation.
1961
4
        ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
1962
4
      }
1963
2
      graph->backward.exec_symbol_idx[dd] = noop.d;
1964
9.52k
    } else {
1965
9.52k
      if (x < exec_symbol_info_size)
1966
9.49k
        graph->backward.exec_symbol_idx[dd] = autograd_execs[x].symbol.d;
1967
33
      else
1968
33
        graph->backward.exec_symbol_idx[dd] = ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size))->symbol.d;
1969
9.52k
    }
1970
9.52k
  }
1971
  // Assigning for f symbols.
1972
13.5k
  for (i = 0; i < f_symbol_size; i++)
1973
6.79k
  {
1974
6.79k
    const int d = f_symbols[i].d;
1975
6.79k
    assert(d >= 0);
1976
6.79k
    assert(d < tensor_symbol_info_size);
1977
6.79k
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + d;
1978
6.79k
    if (tensor_ver->ref_version)
1979
6.79k
    {
1980
      // We don't use _ccv_nnc_autograd_tensor_symbol_from_tensor_version because that selects the last version; here we need the first version.
1981
6.79k
      const ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, 0);
1982
6.79k
      const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1983
6.79k
      graph->backward.tensor_symbol_idx[d] = autograd_symbol->symbol.d;
1984
      // We cannot find the relevant backward exec symbols for f; there could be many.
1985
6.79k
    }
1986
6.79k
  }
1987
6.78k
}
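
Where several gradient contributions flow into the same tensor, the gen pass above folds them through a CCV_NNC_EWSUM_FORWARD node, and falls back to a CMD_SET_FORWARD node when there is nothing to sum. A minimal sketch of that emission pattern, assuming hypothetical symbols g0 and g1 (partial gradients) and dx (the accumulated gradient) already created on graph:

  // g0, g1, dx and graph are hypothetical; this mirrors the sum_or_set
  // emission in _ccv_nnc_symbolic_graph_backward_gen above.
  ccv_nnc_tensor_symbol_t inputs[] = { g0, g1 };
  const ccv_nnc_cmd_t sum_cmd = ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0);
  ccv_nnc_graph_exec_symbol_t accum = ccv_nnc_graph_exec_symbol_new(graph, sum_cmd, inputs, 2, &dx, 1, 0);
  // With no inputs to sum, the gradient is initialized by a set node instead:
  // ccv_nnc_graph_exec_symbol_new(graph, CMD_SET_FORWARD(0), 0, 0, &dx, 1, 0);
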
1988
1989
void ccv_nnc_symbolic_graph_backward(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
1990
6.77k
{
1991
6.77k
  int i;
1992
  // f symbols cannot be aliases.
1993
13.5k
  for (i = 0; i < f_symbol_size; i++)
1994
6.79k
    if (f_symbols[i].d >= 0)
1995
6.79k
    {
1996
6.79k
      assert(f_symbols[i].graph == graph); // f symbol has to be in the current graph.
1997
6.79k
      assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, f_symbols[i].d))->alias_ref);
1998
6.79k
    }
1999
16.3k
  for (i = 0; i < wrt_symbol_size; i++)
2000
9.53k
    if (wrt_symbols[i].d >= 0)
2001
9.52k
    {
2002
9.52k
      assert(wrt_symbols[i].graph == graph);
2003
      // This is not an alias, or what it refers to is not an alias.
2004
9.52k
      assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, wrt_symbols[i].d))->alias_ref || !((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, ((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, wrt_symbols[i].d))->alias_ref - 1))->alias_ref);
2005
9.52k
    }
2006
6.77k
  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
2007
6.77k
  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
2008
6.77k
  assert(exec_symbol_info_size > 0);
2009
6.77k
  assert(tensor_symbol_info_size > 0);
2010
6.77k
  ccv_nnc_symbolic_graph_backward_prep_t backward_prep = _ccv_nnc_symbolic_graph_backward_prep(graph, sources, source_size, destinations, destination_size);
2011
6.77k
  _ccv_nnc_symbolic_graph_backward_prep_prune_ops(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, sources, source_size, destinations, destination_size);
2012
6.77k
  _ccv_nnc_symbolic_graph_backward_prep_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, 0, sources, source_size, destinations, destination_size);
2013
6.77k
  _ccv_nnc_symbolic_graph_backward_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, graph, graph);
2014
6.77k
  _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep);
2015
6.77k
}
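
A minimal usage sketch for this entry point, assuming the list macros from ccv_nnc_easy.h (TENSOR_SYMBOL_LIST, SYMBOLIC_GRAPH_SOURCES, SYMBOLIC_GRAPH_DESTINATIONS); the shapes and the element-wise product op are illustrative only:

  ccv_nnc_symbolic_graph_t* const g = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(g, CPU_TENSOR_NHWC(32F, 1), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(g, CPU_TENSOR_NHWC(32F, 1), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(g, CPU_TENSOR_NHWC(32F, 1), "c");
  // c = a * b, element-wise.
  ccv_nnc_graph_exec_symbol_new(g, CMD_EWPROD_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "prod");
  ccv_nnc_graph_exec_symbol_autogen(g, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Append the backward pass computing d(c)/d(a) and d(c)/d(b).
  ccv_nnc_symbolic_graph_backward(g, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(a, b),
    SYMBOLIC_GRAPH_SOURCES(g), SYMBOLIC_GRAPH_DESTINATIONS(g));
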
2016
2017
ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
2018
27.6k
{
2019
27.6k
  assert(symbol.d >= 0);
2020
27.6k
  assert(symbol.d < graph->backward.tensor_symbol_size);
2021
27.6k
  if (graph->backward.tensor_symbol_idx[symbol.d] < 0)
2022
10
    return NO_TENSOR_SYMBOL;
2023
27.6k
  ccv_nnc_tensor_symbol_t tensor = {
2024
27.6k
    .d = graph->backward.tensor_symbol_idx[symbol.d],
2025
27.6k
    .graph = graph,
2026
27.6k
  };
2027
27.6k
  return tensor;
2028
27.6k
}
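
Continuing the sketch above, the gradient symbols are recovered through the backward metadata filled in at the end of _ccv_nnc_symbolic_graph_backward_gen; NO_TENSOR_SYMBOL signals that no gradient was generated for the queried symbol:

  const ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(g, c); // The seed gradient of c.
  const ccv_nnc_tensor_symbol_t da = ccv_nnc_tensor_symbol_for_backward(g, a); // d(c)/d(a).
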
2029
2030
ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
2031
17.7k
{
2032
17.7k
  assert(symbol.d >= 0);
2033
17.7k
  assert(symbol.d < graph->tensor_symbol_info->rnum);
2034
17.7k
  int dd = symbol.d;
2035
  // Check if this is an alias. Use the original if it is.
2036
17.7k
  ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, dd);
2037
17.7k
  if (symbol_info->alias_ref)
2038
2
    dd = symbol_info->alias_ref - 1;
2039
17.7k
  assert(dd >= 0);
2040
17.7k
  assert(dd < graph->backward.exec_symbol_size);
2041
17.7k
  if (graph->backward.exec_symbol_idx[dd] < 0)
2042
0
    return (ccv_nnc_graph_exec_symbol_t){
2043
0
      .graph = 0,
2044
0
      .d = CCV_NNC_NO_GRAPH_EXEC_SYMBOL
2045
0
    };
2046
17.7k
  ccv_nnc_graph_exec_symbol_t exec = {
2047
17.7k
    .d = graph->backward.exec_symbol_idx[dd],
2048
17.7k
    .graph = graph
2049
17.7k
  };
2050
17.7k
  return exec;
2051
17.7k
}
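
Similarly, the exec symbol that produces a given gradient can be looked up under the same illustrative assumptions, for example to serve as an execution destination so a run stops once that gradient is written:

  const ccv_nnc_graph_exec_symbol_t da_exec = ccv_nnc_graph_exec_symbol_for_backward(g, da);
  // da_exec is the node that computes da; pass it as a destination when
  // running the compiled graph to stop right after da is available.
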