Coverage Report

Created: 2021-04-05 01:08

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_backward.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_symbolic_graph.h"

// MARK - Level-3.5 API

typedef struct {
  int f_wrt; // Check if both f_symbols and wrt_symbols flow through this node.
  ccv_array_t* outgoings; // backward traverse nodes.
  uint64_t* input_bitmasks;
  int input_bitmask_size;
  uint64_t* output_bitmasks;
  int output_bitmask_size;
} ccv_nnc_graph_backward_info_t;

typedef struct {
  int input_size;
  int* inputs;
  int output;
  ccv_array_t* outgoings;
  float value;
  ccv_nnc_graph_exec_symbol_t symbol;
} ccv_nnc_sum_or_set_graph_exec_symbol_t;

typedef struct {
  int input_size;
  int output_size;
  int* inputs;
  int* outputs;
  ccv_array_t* outgoings;
  ccv_nnc_cmd_t cmd;
  ccv_nnc_graph_exec_symbol_t symbol;
} ccv_nnc_autograd_graph_exec_symbol_t;

typedef struct {
  int d; // The pointer to the forward level object.
  int alias_ref; // The alias ref to itself (autograd_tensor_symbols array).
  int flags; // Flags for this symbol.
  ccv_nnc_tensor_symbol_t symbol;
} ccv_nnc_autograd_tensor_symbol_t;

typedef struct {
  int d; // The tensor symbol ref.
  int x; // The exec symbol ref.
  ccv_array_t* exec_registry; // Additional exec symbol refs, similar to x, only useful for aliasing.
  ccv_array_t* alias_registry; // int points to all the aliases (if this is not an alias). The alias is the object in autograd_tensor_symbols, you need another level of indirection to get the actual forward level alias.
} ccv_nnc_tensor_ref_t;

typedef struct {
  int c; // The start non-accumulated version.
  ccv_array_t* ref_version; // tensor ref points to the reverse tensor symbol.
} ccv_nnc_autograd_tensor_version_t;

typedef struct {
  int d;
  int alias_ref;
} ccv_nnc_sum_variable_t;

// This method tries to figure out if a set of aliases can cover the whole tensor dim.
// This is not a precise implementation though. The requirement is to answer this question
// with a given memory constraint, therefore, it only allows up to 65536 different tensor locations.
// If you have more than that, it will assume that it doesn't have fully assigned aliases,
// and will return 0.

// Return 1 if inserted successfully.
static inline int _ccv_nnc_try_mix(int* const md, const int ins, const int c)
{
  if (!c)
  {
    md[0] = ins;
    return 1;
  }
  int ll = 0, uu = c - 1;
  int mm;
  do {
    mm = ll + ((uu - ll) >> 1);
    if (ins == md[mm])
      return 0;
    else if (ins < md[mm])
      uu = mm - 1;
    else if (ins > md[mm])
      ll = mm + 1;
  } while (ll <= uu);
  if (ll < c)
    memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll));
  md[ll] = ins;
  return 1;
}
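
// Illustrative usage (an added sketch, not part of the original file; the _example_* helper
// name is hypothetical): _ccv_nnc_try_mix keeps md sorted and deduplicated, and its return
// value (0 or 1) can be accumulated to track how many slots are occupied.
#if 0
static void _example_try_mix(void)
{
  int md[4];
  int len = 0;
  len += _ccv_nnc_try_mix(md, 3, len); // md = {3}, len = 1
  len += _ccv_nnc_try_mix(md, 1, len); // md = {1, 3}, len = 2
  len += _ccv_nnc_try_mix(md, 3, len); // duplicate, len stays 2
  assert(len == 2 && md[0] == 1 && md[1] == 3);
}
#endif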

static inline int _ccv_nnc_mix_idx(const int* const md, const int ins, const int c)
{
  if (c <= 1)
    return 0;
  int ll = 0, uu = c - 1;
  int mm;
  do {
    mm = ll + ((uu - ll) >> 1);
    if (ins == md[mm])
      return mm;
    else if (ins < md[mm])
      uu = mm - 1;
    else if (ins > md[mm])
      ll = mm + 1;
  } while (ll <= uu);
  assert(0 && "Shouldn't reach here");
  return -1;
}
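
// Illustrative usage (an added sketch, not part of the original file; the _example_* helper
// name is hypothetical): _ccv_nnc_mix_idx maps a known cut point back to its compact index
// in the sorted map built by _ccv_nnc_try_mix; a map with at most one entry short-circuits
// to index 0.
#if 0
static void _example_mix_idx(void)
{
  const int md[3] = { 2, 5, 7 };
  assert(_ccv_nnc_mix_idx(md, 5, 3) == 1);
  assert(_ccv_nnc_mix_idx(md, 7, 3) == 2);
  assert(_ccv_nnc_mix_idx(md, 2, 1) == 0); // c <= 1 short-circuits to 0
}
#endif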

static inline void _ccv_nnc_try_set_pix_0(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset)
{
  const int s = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
  const int d = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
  assert(s >= 0 && d > s);
  int i;
  for (i = s; i < d; i++)
    // Fill this pix. I can make this faster by looping through full ones (divided by 8), but too lazy.
    cube[(offset + i) >> 5] |= (1u << ((offset + i) & 0x1f));
}
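
// An added note (not part of the original file; the _example_* helper name is hypothetical):
// each cell of the coverage cube is one bit in a uint32_t array, so the word index and bit
// index come from the high and low five bits of the cell offset.
#if 0
static void _example_cube_bit(void)
{
  uint32_t cube[2] = { 0, 0 };
  const int offset = 33;
  cube[offset >> 5] |= (1u << (offset & 0x1f)); // sets bit 1 of word 1
  assert(cube[0] == 0u && cube[1] == 2u);
}
#endif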

static inline void _ccv_nnc_try_set_pix_1(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset)
{
  const int s0 = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
  const int d0 = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
  assert(s0 >= 0 && d0 > s0);
  const int s1 = (ofs[1] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[1], ofs[1], cube_dim[1]) + 1;
  const int d1 = ((ofs[1] + dim[1] == tensor_dim[1]) ? cube_dim[1] : _ccv_nnc_mix_idx(scmd[1], ofs[1] + ccv_max(1, dim[1]), cube_dim[1])) + 1;
  assert(s1 >= 0 && d1 > s1);
  int i, j;
  const int step1 = cube_step[1];
  if (step1 == d0 - s0)
  {
    // Faster one, we can simply loop through.
    for (i = s1 * step1; i < d1 * step1; i++)
      cube[(offset + i) >> 5] |= (1u << ((offset + i) & 0x1f));
  } else {
    offset += s1 * step1;
    // There are gaps, slow one.
    for (i = s1; i < d1; i++, offset += step1)
      for (j = s0; j < d0; j++)
        cube[(offset + j) >> 5] |= (1u << ((offset + j) & 0x1f));
  }
}

static inline void _ccv_nnc_try_set_pix(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset, const int dim_idx)
{
  switch (dim_idx)
  {
    case 1:
      _ccv_nnc_try_set_pix_1(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
      return;
    case 0:
      _ccv_nnc_try_set_pix_0(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
      return;
  }
  int i;
  const int s = (ofs[dim_idx] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx], cube_dim[dim_idx]) + 1;
  const int d = ((ofs[dim_idx] + dim[dim_idx] == tensor_dim[dim_idx]) ? cube_dim[dim_idx] : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx] + ccv_max(1, dim[dim_idx]), cube_dim[dim_idx])) + 1;
  assert(s >= 0 && d > s);
  for (i = s; i < d; i++)
    _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset + i * cube_step[dim_idx], dim_idx - 1);
}
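
// Illustrative usage (an added sketch, not part of the original file; the _example_* helper
// name is hypothetical): a 1-D tensor of 4 elements covered by two aliases maps to a 2-cell
// cube (one interior cut point at 2); filling both halves through the recursive entry point
// marks the whole cube.
#if 0
static void _example_try_set_pix(void)
{
  int scm[1] = { 2 }; // interior cut point of dimension 0
  int* scmd[1] = { scm };
  const int cube_dim[1] = { 1 }; // one cut point -> two cells
  const int cube_step[1] = { 1 };
  const int tensor_dim[1] = { 4 };
  uint32_t cube[1] = { 0 };
  const int ofs0[1] = { 0 }, dim0[1] = { 2 }; // covers [0, 2) -> cell 0
  const int ofs1[1] = { 2 }, dim1[1] = { 2 }; // covers [2, 4) -> cell 1
  _ccv_nnc_try_set_pix(ofs0, dim0, tensor_dim, scmd, cube_dim, cube_step, cube, 0, 0);
  _ccv_nnc_try_set_pix(ofs1, dim1, tensor_dim, scmd, cube_dim, cube_step, cube, 0, 0);
  assert(cube[0] == 0x3); // both cells filled -> fully covered
}
#endif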

static int _ccv_nnc_tensor_ref_fully_assigned_with_aliases(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info)
{
  // Only work with tensor_ref of aliases.
  assert(tensor_ref->alias_registry);
  const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
  assert(tensor_symbol_info[autograd->d].alias_ref == 0);
  const int* tensor_dim = tensor_symbol_info[autograd->d].info.dim;
  const int tensor_count = ccv_nnc_dimension_count(tensor_dim);
  int i, j;
  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  {
    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
    assert(d < autograd_tensor_symbols->rnum);
    const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
    assert(tensor_symbol_info[autograd->d].alias_ref);
    const int* inc = tensor_symbol_info[autograd->d].inc;
    // If this is just reshaped (i.e., the dimension is the same, and inc covers the whole), we have fully assigned.
    if (memcmp(inc, tensor_symbol_info[autograd->d].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
      ccv_nnc_dimension_count(inc) == tensor_count)
      return 1;
    // Otherwise if inc doesn't match the original dim, it is not covered.
    if (memcmp(inc, tensor_dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
      return 0;
  }
  /* We need a solid cube (potentially hyper dimensional) to compute if there are overlaps.
   * To make this cube as small as possible, we need to map the actual tensor dimension
   * (therefore, we don't actually allocate the whole tensor to compute overlaps) to a smaller
   * cube given the ofs and dim size of its aliases.
   *
   * The following code generates the dimension mapping (using scratch space) with binary search + insertion
   * and then we fill the cube with a given tensor alias's dimensional information (ofs, dim).
   * Afterwards, we simply need to check if the cube is totally filled up to know if this tensor
   * is fully assigned with its aliases (if that is the case, we can skip zeroing for this tensor).
   *
   * There are several restrictions though to make this faster: 1). I cannot handle any cube whose side
   * lengths combined are larger than 1023 (scm only has 1024 scratch space). 2). I cannot handle any cube
   * whose total volume is larger than 2048 * 8 (I only allocate 2K on stack for this).
   * */
  int scm[1024]; // Having 1024 int scratch space for mapping dimensions. (Or sparse coordinate mapping).
  int cube_dim[CCV_NNC_MAX_DIM_ALLOC] = {}; // Mapping dimension size.
  int cube_size = 1;
  int* scmptr = scm;
  for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
  {
    int head = 0, tail = 0; // Note that we touched both the head and tail (otherwise this dimension is not fully covered).
    int len = 0;
    for (j = 0; j < tensor_ref->alias_registry->rnum; j++)
    {
      const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, j);
      assert(d < autograd_tensor_symbols->rnum);
      const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
      assert(tensor_symbol_info[autograd->d].alias_ref);
      const int* ofs = tensor_symbol_info[autograd->d].ofs;
      const int* dim = tensor_symbol_info[autograd->d].info.dim;
      head = head || (ofs[i] == 0);
      tail = tail || (ofs[i] + ccv_max(1, dim[i]) == tensor_dim[i]);
      if (ofs[i] != 0)
        len += _ccv_nnc_try_mix(scmptr, ofs[i], len);
      if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
        return 0;
      if (ofs[i] + ccv_max(1, dim[i]) < tensor_dim[i])
        len += _ccv_nnc_try_mix(scmptr, ofs[i] + ccv_max(1, dim[i]), len);
      if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
        return 0;
    }
    if (!head || !tail)
      return 0;
    cube_size *= (len + 1);
    cube_dim[i] = len;
    scmptr += len; // Moving to the next level.
  }
  // The cube map is too large, cannot do the computation, assume it is not fully assigned.
  if (cube_size > 2048 * 8)
    return 0;
  // binary map to see if it fills up.
  uint32_t cube[(cube_size + 31) >> 5];
  memset(cube, 0, sizeof(uint32_t) * ((cube_size + 31) >> 5));
  int* scmd[CCV_NNC_MAX_DIM_ALLOC] = {}; // Sparse coordinate map at dimension x.
  int cube_step[CCV_NNC_MAX_DIM_ALLOC] = {};
  for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
  {
    cube_step[i] = (i > 0) ? cube_step[i - 1] * (cube_dim[i - 1] + 1) : 1;
    scmd[i] = (i > 0) ? scmd[i - 1] + cube_dim[i - 1] : scm;
  }
  const int max_dim = i;
  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  {
    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
    assert(d < autograd_tensor_symbols->rnum);
    const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
    assert(tensor_symbol_info[autograd->d].alias_ref);
    const int* ofs = tensor_symbol_info[autograd->d].ofs;
    const int* dim = tensor_symbol_info[autograd->d].info.dim;
    _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, 0, max_dim - 1);
  }
  // Compare to see now if the binary map filled up. If it filled up, we know it is fully assigned.
  for (i = 0; i < (cube_size >> 5); i++)
    if (cube[i] < 0xffffffff)
      return 0;
  if ((cube_size & 0x1f) > 0)
  {
    // Fetch the rest.
    uint32_t r = 0;
    for (i = 0; i < (cube_size & 0x1f); i++)
      r |= (1u << i);
    assert(cube[((cube_size + 31) >> 5) - 1] <= r);
    if (cube[((cube_size + 31) >> 5) - 1] < r)
      return 0;
  }
  return 1;
}
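
// A worked example (an added sketch, not part of the original file; the _example_* helper
// name is hypothetical): forward tensor 0 is a 4-element vector and tensors 1 and 2 alias
// its two halves, so the two aliases seam at offset 2, touch both ends, and fill the 2-cell
// cube completely.
#if 0
static void _example_fully_assigned(void)
{
  ccv_nnc_tensor_symbol_info_t tensor_symbol_info[3] = {};
  tensor_symbol_info[0].info.dim[0] = 4;
  tensor_symbol_info[1].alias_ref = 1; // 1-based reference to tensor 0
  tensor_symbol_info[1].inc[0] = 4;
  tensor_symbol_info[1].ofs[0] = 0;
  tensor_symbol_info[1].info.dim[0] = 2;
  tensor_symbol_info[2].alias_ref = 1;
  tensor_symbol_info[2].inc[0] = 4;
  tensor_symbol_info[2].ofs[0] = 2;
  tensor_symbol_info[2].info.dim[0] = 2;
  ccv_array_t* const autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), 3, 0);
  ccv_nnc_autograd_tensor_symbol_t sym = { .d = 0 };
  ccv_array_push(autograd_tensor_symbols, &sym); // autograd symbol 0 -> tensor 0
  sym.d = 1;
  ccv_array_push(autograd_tensor_symbols, &sym); // autograd symbol 1 -> alias 1
  sym.d = 2;
  ccv_array_push(autograd_tensor_symbols, &sym); // autograd symbol 2 -> alias 2
  ccv_nnc_tensor_ref_t tensor_ref = { .d = 0, .alias_registry = ccv_array_new(sizeof(int), 2, 0) };
  int ad = 1;
  ccv_array_push(tensor_ref.alias_registry, &ad);
  ad = 2;
  ccv_array_push(tensor_ref.alias_registry, &ad);
  assert(_ccv_nnc_tensor_ref_fully_assigned_with_aliases(&tensor_ref, autograd_tensor_symbols, tensor_symbol_info) == 1);
  ccv_array_free(tensor_ref.alias_registry);
  ccv_array_free(autograd_tensor_symbols);
}
#endif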

static int _ccv_nnc_tensor_ref_version_find_init(const ccv_nnc_autograd_tensor_version_t* const tensor_ver)
{
  int i;
  for (i = 0; i < tensor_ver->ref_version->rnum; i++)
    if (((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->x < 0)
      return i;
  return -1;
}
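
// Illustrative usage (an added sketch, not part of the original file; the _example_* helper
// name is hypothetical): a negative x marks a version that came from initialization rather
// than from an exec.
#if 0
static void _example_find_init(void)
{
  ccv_nnc_autograd_tensor_version_t tensor_ver = {
    .ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 2, 0)
  };
  ccv_nnc_tensor_ref_t ref = { .d = 0, .x = 5 }; // produced by exec 5
  ccv_array_push(tensor_ver.ref_version, &ref);
  ref.x = -1; // produced by initialization
  ccv_array_push(tensor_ver.ref_version, &ref);
  assert(_ccv_nnc_tensor_ref_version_find_init(&tensor_ver) == 1);
  ccv_array_free(tensor_ver.ref_version);
}
#endif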

static void _ccv_nnc_graph_sum_autograd_tensor_versions(const int idx, const int d, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_or_set_execs)
{
  int i, j;
  assert(tensor_ver->c < tensor_ver->ref_version->rnum);
  const int input_size = tensor_ver->ref_version->rnum - tensor_ver->c;
  int* inputs = (int*)ccmalloc(sizeof(int) * input_size);
  for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
    inputs[i] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->d;
  const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
    .d = d
  };
  ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  ccv_nnc_sum_or_set_graph_exec_symbol_t sum_exec = {
    .input_size = input_size,
    .inputs = inputs,
    .output = autograd_tensor_symbols->rnum - 1
  };
  if (idx >= 0)
  {
    sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
    ccv_array_push(sum_exec.outgoings, &idx);
  }
  ccv_array_push(sum_or_set_execs, &sum_exec);
  const int outgoing = exec_symbol_info_size + sum_or_set_execs->rnum - 1;
  for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
  {
    const ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
    const int x = tensor_ref->x;
    if (x < 0) /* This is an initialization tensor, it has to occur before the execution anyway. */
    {
      // No alias.
      assert(!tensor_ref->alias_registry);
      // No associated additional execs.
      assert(!tensor_ref->exec_registry);
      continue;
    }
    if (x < exec_symbol_info_size)
    {
      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
      if (!back_exec->outgoings)
        back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
      ccv_array_replace_unique_int(back_exec->outgoings, idx, outgoing);
    } else {
      // This tensor_ref is generated by the sum operation.
      ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size);
      ccv_array_replace_unique_int(sum_or_set->outgoings, idx, outgoing);
    }
    // If this tensor has an associated alias, we need to init it to zeros when it is allocated (we only need to set a flag here);
    // it is handled at compilation phase.
    if (tensor_ref->alias_registry &&
      // Loop over to see if this tensor is fully occupied to avoid the extra zero step.
      !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
    {
      ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
      // By having alias_registry, what this symbol represents must not be an alias.
      assert(tensor_sym->alias_ref == 0);
      tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
    }
    if (tensor_ref->exec_registry)
      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
      {
        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
        assert(x >= 0);
        // The exec_registry can only be generated by the alias registry, therefore, it cannot reference a sum operation.
        assert(x < exec_symbol_info_size);
        ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
        if (!back_exec->outgoings)
          back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
        ccv_array_replace_unique_int(back_exec->outgoings, idx, outgoing);
      }
  }
  const ccv_nnc_tensor_ref_t tensor_ref = {
    .d = autograd_tensor_symbols->rnum - 1,
    .x = outgoing
  };
  ccv_array_push(tensor_ver->ref_version, &tensor_ref);
  /* Move the c pointer up to the latest summed result. */
  tensor_ver->c = tensor_ver->ref_version->rnum - 1;
}
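
// An added illustration (not part of the original file) of the version bookkeeping above.
// Assumed state: tensor_ver->c == 0 and ref_version holds three versions v0, v1, v2.
//   Before: ref_version = { v0, v1, v2 }, c = 0 (all three pending).
//   After:  a sum symbol s = v0 + v1 + v2 is appended, the sum exec gets index
//           exec_symbol_info_size + sum_or_set_execs->rnum - 1 (sum/set execs live past the
//           end of the forward exec index space), the producers of v0..v2 flow into it, and
//           c == 3 so later consumers only see the summed version s.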

static int _ccv_nnc_tensor_ref_version_involve_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
{
  assert(alias->alias_ref > 0);
  // No alias_registry, must conflict (owns the whole band).
  if (!tensor_ref->alias_registry)
    return 1;
  int i;
  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  {
    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
    assert(d < autograd_tensor_symbols->rnum);
    ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
    if (ccv_nnc_over_tensor_symbol_aliases(tensor_symbol_info + autograd->d, alias))
      return 1;
  }
  // None of the aliases referenced by this ref_version overlaps with the provided one, thus, there is no conflict at all.
  return 0;
}

static int _ccv_nnc_tensor_ref_version_find_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
{
  assert(alias->alias_ref > 0);
  // No alias_registry, thus, cannot find the exact matched alias.
  if (!tensor_ref->alias_registry)
    return -1;
  int i;
  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  {
    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
    assert(d < autograd_tensor_symbols->rnum);
    ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
    // This must reference an alias.
    assert(tensor_symbol_info[autograd->d].alias_ref);
    const int* inc = tensor_symbol_info[autograd->d].inc;
    const int* ofs = tensor_symbol_info[autograd->d].ofs;
    const int* dim = tensor_symbol_info[autograd->d].info.dim;
    // If everything matches, this is the required alias.
    if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
      memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
      memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
      return d;
  }
  return -1;
}
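
// Illustrative usage (an added sketch, not part of the original file; the _example_* helper
// name is hypothetical): an alias in the registry whose inc / ofs / dim all match the
// requested alias is returned by index.
#if 0
static void _example_find_alias(void)
{
  ccv_nnc_tensor_symbol_info_t tensor_symbol_info[2] = {};
  tensor_symbol_info[0].info.dim[0] = 4;
  tensor_symbol_info[1].alias_ref = 1; // aliases tensor 0
  tensor_symbol_info[1].inc[0] = 4;
  tensor_symbol_info[1].ofs[0] = 2;
  tensor_symbol_info[1].info.dim[0] = 2;
  ccv_array_t* const autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), 2, 0);
  ccv_nnc_autograd_tensor_symbol_t sym = { .d = 0 };
  ccv_array_push(autograd_tensor_symbols, &sym);
  sym.d = 1; // autograd symbol 1 references the alias
  ccv_array_push(autograd_tensor_symbols, &sym);
  ccv_nnc_tensor_ref_t tensor_ref = { .d = 0, .alias_registry = ccv_array_new(sizeof(int), 1, 0) };
  int ad = 1;
  ccv_array_push(tensor_ref.alias_registry, &ad);
  assert(_ccv_nnc_tensor_ref_version_find_alias(&tensor_ref, autograd_tensor_symbols, tensor_symbol_info, &tensor_symbol_info[1]) == 1);
  ccv_array_free(tensor_ref.alias_registry);
  ccv_array_free(autograd_tensor_symbols);
}
#endif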

static int _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
{
  assert(alias->alias_ref > 0);
  // No alias_registry, thus, cannot find the exact matched alias.
  if (!tensor_ref->alias_registry)
    return 0;
  int i;
  for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
  {
    const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
    assert(d < autograd_tensor_symbols->rnum);
    ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
    // This must reference an alias.
    assert(tensor_symbol_info[autograd->d].alias_ref);
    const int* inc = tensor_symbol_info[autograd->d].inc;
    const int* ofs = tensor_symbol_info[autograd->d].ofs;
    const int* dim = tensor_symbol_info[autograd->d].info.dim;
    if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
      memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
      memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
      return 0;
  }
  // If everything matches for every alias in the registry, we can use any of the aliases directly.
  return 1;
}

static int _ccv_nnc_graph_sum_autograd_tensor_versions_alias(const int idx, const int d, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const alias, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_or_set_execs)
{
  assert(tensor_ver->c < tensor_ver->ref_version->rnum);
  int i, j = 0;
  struct {
    int k;
    int i;
  } kd[tensor_ver->ref_version->rnum - tensor_ver->c];
  for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
  {
    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
    const int k = _ccv_nnc_tensor_ref_version_find_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias);
    if (k >= 0)
      kd[j++] = (typeof(kd[0])){
        .k = k, .i = i
      };
    else if (_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
      kd[j++] = (typeof(kd[0])) {
        .k = -1, .i = i // It has a dependency to the original tensor (non-alias) now, label this with the highest bit.
      };
  }
  // Can only find one. This is the easy case, we can simply return that symbol (or its alias).
  if (j == 1)
  {
    if (kd[0].k >= 0)
      return kd[0].k; // Only can find one alias, that is the one.
    // Otherwise, need to create a new alias.
    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[0].i);
    ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
    // Since we create a new alias, we need to set the referenced one to be allocated with 0s.
    if (ref->alias_ref) // If this is an alias, it has to be zero initialized.
    {
      ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, ref->alias_ref - 1);
      assert(ref->alias_ref == 0); // This is the original.
      ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
    } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
        // Loop over to see if this tensor is fully occupied to avoid the extra zero step.
        !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
      ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
    }
    ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
      .d = d,
      .alias_ref = tensor_ref->d + 1
    };
    ccv_array_push(autograd_tensor_symbols, &tensor_sym);
    const int ad = autograd_tensor_symbols->rnum - 1;
    if (tensor_ref->alias_registry) // Only push this when it has an alias registry (otherwise it already conflicts with everyone).
      ccv_array_push(tensor_ref->alias_registry, &ad);
    // The newly inserted tensor symbol.
    return ad;
  }
  // Otherwise, we need to create the sum operation out of these.
  const int input_size = j;
  int has_this_alias_exclusively = 1;
  int* inputs = input_size > 0 ? (int*)ccmalloc(sizeof(int) * input_size) : 0;
  for (i = 0; i < input_size; i++)
  {
    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
    // Can take a fast path if every ref involved has the same alias, our sum operation can be faster (using the alias directly).
    if (has_this_alias_exclusively && kd[i].k >= 0 && _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
      inputs[i] = *(int*)ccv_array_get(tensor_ref->alias_registry, 0); // Assigning the alias.
    else {
      if (has_this_alias_exclusively)
      {
        has_this_alias_exclusively = 0;
        for (j = 0; j < i; j++)
          inputs[j] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[j].i))->d;
      }
      inputs[i] = tensor_ref->d;
    }
  }
  ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
    .d = alias->alias_ref - 1
  };
  ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  const int tensor_ref_d = autograd_tensor_symbols->rnum - 1;
  tensor_sym.d = d;
  tensor_sym.alias_ref = tensor_ref_d + 1;
  ccv_array_push(autograd_tensor_symbols, &tensor_sym);
  const int ad = autograd_tensor_symbols->rnum - 1;
  ccv_nnc_sum_or_set_graph_exec_symbol_t sum_exec = {
    .input_size = input_size,
    .inputs = inputs,
    .output = has_this_alias_exclusively ? ad : tensor_ref_d /* If it has this alias exclusively, the output should be the alias as well. Otherwise the output is the real tensor. */
  };
  if (idx >= 0)
  {
    sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
    ccv_array_push(sum_exec.outgoings, &idx);
  }
  ccv_array_push(sum_or_set_execs, &sum_exec);
  const int outgoing = exec_symbol_info_size + sum_or_set_execs->rnum - 1;
  int no_alias_registry = 0;
  for (i = 0; i < input_size; i++)
  {
    ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
    if (!has_this_alias_exclusively)
    {
      // If the sum operation is not operating on one alias, I need to zero this tensor out when it is first
      // allocated (see discussions around the flags I use).
      ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
      if (tensor_sym->alias_ref)
      {
        // Find the original tensor_sym and set its flags (I prefer to set flags on the original).
        ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_sym->alias_ref - 1);
        assert(ref->alias_ref == 0); // This is the original.
        ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
      } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
          // Loop over to see if this tensor is fully occupied to avoid the extra zero step.
          !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
        tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
      }
    }
    // Check to see if any of these tensors doesn't have an alias.
    no_alias_registry |= (!tensor_ref->alias_registry);
    const int x = tensor_ref->x;
    assert(x >= 0); /* Otherwise, this is an initialization tensor, which is impossible to be summed up. */
    if (x < exec_symbol_info_size)
    {
      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
      if (!back_exec->outgoings)
        back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
      ccv_array_push(back_exec->outgoings, &outgoing);
    } else {
      ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size);
      ccv_array_push(sum_or_set->outgoings, &outgoing);
    }
    if (tensor_ref->exec_registry)
      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
      {
        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
        assert(x >= 0); /* Otherwise, this is an initialization tensor, which is impossible to be summed up. */
        assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference a sum operation.
        ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
        if (!back_exec->outgoings)
          back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
        ccv_array_push(back_exec->outgoings, &outgoing);
      }
  }
  const ccv_nnc_tensor_ref_t tensor_ref = {
    .d = tensor_ref_d,
    .x = outgoing,
    .exec_registry = 0, // I don't need to take execution dependencies because this tensor is generated by sum, therefore, we already take that dependency.
    .alias_registry = !no_alias_registry || has_this_alias_exclusively ? ccv_array_new(sizeof(int), 1, 0) : 0
  };
  // If there is no alias registry, then we take the whole tensor ref as one.
  if (!no_alias_registry || has_this_alias_exclusively)
  {
    // If this tensor ref contains multiple different types of aliases, we have to add them together (otherwise
    // the computation of whether there is an empty slot in this tensor ref is not correct without all the
    // occupancy availability information).
    if (!has_this_alias_exclusively)
      for (i = 0; i < input_size; i++)
      {
        ccv_nnc_tensor_ref_t* ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
        assert(ref->alias_registry);
        // It may get duplicates. But whatever, it won't affect the computation.
        for (j = 0; j < ref->alias_registry->rnum; j++)
          ccv_array_push(tensor_ref.alias_registry, ccv_array_get(ref->alias_registry, j));
      }
    ccv_array_push(tensor_ref.alias_registry, &ad);
  }
  assert(input_size <= tensor_ver->ref_version->rnum - tensor_ver->c);
  ccv_nnc_tensor_ref_t x;
  for (i = 0; i < input_size; i++)
    // If the current one (i + tensor_ver->c) is smaller than the one referenced to, exchange.
    if (kd[i].i > i + tensor_ver->c)
      CCV_SWAP(*(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i + tensor_ver->c), *(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i), x);
  ccv_array_push(tensor_ver->ref_version, &tensor_ref);
  // We've consumed input_size tensor refs, now move c up to the pointer of non-consumed tensors.
  tensor_ver->c += input_size;
  return ad;
}
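
// An added illustration (not part of the original file): the kd[] scan above partitions the
// pending versions of the tensor into exact alias matches (k >= 0) and versions that merely
// overlap the requested alias (k == -1). Assumed state: two pending versions, one exact
// match and one overlap. With j == 2, the function emits a sum exec over both, registers a
// fresh non-alias symbol plus an alias symbol over it, and compacts ref_version so the
// summed result is the only visible version; with j == 1, it would instead return the
// existing alias (or a newly created one) directly.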

typedef struct ccv_nnc_symbolic_graph_backward_prep_s {
  int exec_symbol_info_size; // Number of graph exec symbols before adding any new symbols related to automatic differentiation.
  int tensor_symbol_info_size; // Number of tensor symbols before adding anything new.
  int sub_prep_size;
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
  ccv_nnc_graph_backward_info_t* backward_info; // Corresponding to forward graph exec symbol info, it is exactly in reverse.
  ccv_nnc_graph_visit_t* forward_visit; // The visitor structure (top sorted index) when doing traversal.
  ccv_nnc_graph_visit_t* backward_visit; // The visitor structure (top sorted index) when doing reverse traversal.
  ccv_nnc_autograd_graph_exec_symbol_t* autograd_execs; // The graph exec symbols we need for automatic differentiation. This is a 1:1 mapping for forward graph exec symbols, however, unlike backward_info, its outgoings may be more complex (may contain outgoing flows to sum nodes).
  ccv_nnc_autograd_tensor_version_t* autograd_tensor_versions; // Corresponding to forward tensor symbols, each may contain multiple versions (due to multi-write).
  ccv_array_t* autograd_tensor_symbols; // The tensor symbols we need for automatic differentiation (it may not be a 1:1 mapping).
  ccv_array_t* sum_or_set_execs; // The sum nodes, because in reverse mode, a tensor could have multiple versions, we need to sum them up before use.
  struct ccv_nnc_symbolic_graph_backward_prep_s* sub_preps; // The preps of its sub-graphs.
  // Pointers not managed by this struct
  ccv_nnc_symbolic_graph_t* graph;
} ccv_nnc_symbolic_graph_backward_prep_t;

static ccv_nnc_symbolic_graph_backward_prep_t _ccv_nnc_symbolic_graph_backward_prep(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
  assert(exec_symbol_info_size > 0);
  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
  assert(tensor_symbol_info_size > 0);
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * exec_symbol_info_size);
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * tensor_symbol_info_size);
  ccv_nnc_graph_visit_t* forward_visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), exec_symbol_info_size, sources, source_size, destinations, destination_size, 0);
  ccv_nnc_symbolic_graph_symbol_infer(graph, forward_visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
  int i;
  // Now, for each one of these, find a reverse graph.
  ccv_nnc_graph_backward_info_t* backward_info = (ccv_nnc_graph_backward_info_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_graph_backward_info_t));
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
    assert(ccv_nnc_cmd_is_forward(node->cmd) || node->cmd.cmd == CCV_NNC_NOOP);
    if (node->outgoings)
      for (i = 0; i < node->outgoings->rnum; i++)
      {
        int d = *(int*)ccv_array_get(node->outgoings, i);
        if (backward_info[d].outgoings == 0)
          backward_info[d].outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
        ccv_array_push(backward_info[d].outgoings, &idx);
      }
  } ccv_nnc_graph_visit_endfor
  // Also mark only the output bits that we use.
  for (i = 0; i < exec_symbol_info_size; i++)
  {
    backward_info[i].input_bitmask_size = ((exec_symbol_info[i].output_size * 2 + exec_symbol_info[i].input_size + 63) >> 6);
    backward_info[i].output_bitmask_size = ((exec_symbol_info[i].input_size + 63) >> 6);
    // Allocate input / output bitmasks
    if (backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size > 0)
    {
      backward_info[i].input_bitmasks = (uint64_t*)cccalloc(backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size, sizeof(uint64_t));
      if (backward_info[i].output_bitmask_size)
        backward_info[i].output_bitmasks = backward_info[i].input_bitmasks + backward_info[i].input_bitmask_size;
    }
  }
  ccv_nnc_graph_visit_t* backward_visit = ccv_nnc_graph_visit_new(graph, backward_info, exec_symbol_info_size, destinations, destination_size, sources, source_size, 0);
  const int sub_prep_size = graph->sub_graphs ? graph->sub_graphs->rnum : 0;
  ccv_nnc_symbolic_graph_backward_prep_t* sub_preps = sub_prep_size > 0 ? (ccv_nnc_symbolic_graph_backward_prep_t*)cccalloc(sub_prep_size, sizeof(ccv_nnc_symbolic_graph_backward_prep_t)) : 0;
  for (i = 0; i < sub_prep_size; i++)
  {
    const ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(graph->sub_graphs, i);
    sub_preps[i] = _ccv_nnc_symbolic_graph_backward_prep(sub_graph, ccv_nnc_symbolic_graph_sources(sub_graph), ccv_nnc_symbolic_graph_source_size(sub_graph), ccv_nnc_symbolic_graph_destinations(sub_graph), ccv_nnc_symbolic_graph_destination_size(sub_graph));
  }
  return (ccv_nnc_symbolic_graph_backward_prep_t){
    .exec_symbol_info_size = exec_symbol_info_size,
    .tensor_symbol_info_size = tensor_symbol_info_size,
    .sub_prep_size = sub_prep_size,
    .exec_symbol_info = exec_symbol_info,
    .tensor_symbol_info = tensor_symbol_info,
    .backward_info = backward_info,
    .forward_visit = forward_visit,
    .backward_visit = backward_visit,
    .sub_preps = sub_preps,
    .graph = (ccv_nnc_symbolic_graph_t*)graph,
  };
}
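
// An added note (not part of the original file; the _example_* helper name is hypothetical):
// a backward node considers the forward node's output gradients, inputs, and outputs as
// candidate inputs, and one gradient per forward input as candidate outputs, hence the
// bitmask sizing above.
#if 0
static void _example_bitmask_size(void)
{
  const int input_size = 2, output_size = 1; // a forward op like C = A * B
  const int back_input_count = output_size * 2 + input_size; // dC, A, B, C -> 4
  const int back_output_count = input_size; // dA, dB -> 2
  assert(((back_input_count + 63) >> 6) == 1); // fits in one uint64_t mask
  assert(((back_output_count + 63) >> 6) == 1);
}
#endif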

static void _ccv_nnc_symbolic_graph_backward_exec_io(const ccv_nnc_graph_exec_symbol_info_t* const node, int** const back_input_map, int** const back_output_map, int* const back_input_size, int* const back_output_size)
{
  int i;
  if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
  {
    *back_input_map = node->outputs;
    *back_input_size = node->output_size;
    for (i = 0; i < node->case_of.argument.offset; i++)
      (*back_output_map)[i] = node->inputs[i];
    const int argument_offset = node->case_of.argument.offset;
    const int argument_size = node->case_of.argument.size;
    // Skip the argument range.
    for (i = argument_offset + argument_size; i < node->input_size; i++)
      (*back_output_map)[i - argument_size] = node->inputs[i];
    *back_output_size = node->input_size - node->case_of.argument.size;
  } else { // if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
    *back_input_map = node->outputs;
    *back_input_size = node->output_size;
    *back_output_map = node->inputs;
    *back_output_size = node->input_size;
  }
}
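
// Illustrative usage (an added sketch, not part of the original file; the _example_* helper
// name and the concrete symbol indices are hypothetical): for a CASE_OF node, the argument
// range of its inputs is skipped when mapping backward outputs.
#if 0
static void _example_backward_exec_io(void)
{
  int inputs[3] = { 10, 11, 12 }; // one condition input plus a 2-wide argument range
  int outputs[1] = { 20 };
  ccv_nnc_graph_exec_symbol_info_t node = {};
  node.flags = CCV_NNC_GRAPH_EXEC_CASE_OF;
  node.inputs = inputs;
  node.input_size = 3;
  node.outputs = outputs;
  node.output_size = 1;
  node.case_of.argument.offset = 1;
  node.case_of.argument.size = 2;
  int map[3];
  int* back_input_map = 0;
  int* back_output_map = map;
  int back_input_size, back_output_size;
  _ccv_nnc_symbolic_graph_backward_exec_io(&node, &back_input_map, &back_output_map, &back_input_size, &back_output_size);
  assert(back_input_size == 1 && back_input_map[0] == 20);
  assert(back_output_size == 1 && map[0] == 10); // the argument range is skipped
}
#endif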

static void _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(const ccv_nnc_graph_exec_symbol_info_t* const forw_exec, const ccv_nnc_symbolic_graph_t* const sub_graph, const int graph_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const uint64_t* const input_bitmasks, const uint64_t* const output_bitmasks, ccv_array_t* const sub_f_symbols, ccv_array_t* const sub_wrt_symbols)
{
  int i, j;
  ccv_array_clear(sub_wrt_symbols);
  int forw_outputs[ccv_max(1, forw_exec->output_size)];
  int forw_inputs[ccv_max(1, forw_exec->input_size)];
  int* back_input_map = forw_outputs;
  int* back_output_map = forw_inputs;
  int back_input_size, back_output_size;
  _ccv_nnc_symbolic_graph_backward_exec_io(forw_exec, &back_input_map, &back_output_map, &back_input_size, &back_output_size);
  for (i = 0; i < back_output_size; i++)
    if (output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
    {
      const int d = back_output_map[i];
      const ccv_array_t* const s_refs = tensor_symbol_info[d].s_ref;
      const int s_ref = s_refs && s_refs->rnum > graph_ref ? *(int*)ccv_array_get(s_refs, graph_ref) - 1 : -1;
      if (s_ref >= 0)
      {
        ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
          .d = s_ref,
          .graph = sub_graph,
        };
        ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
      } else
        ccv_array_push(sub_wrt_symbols, &NO_TENSOR_SYMBOL);
    }
  ccv_array_clear(sub_f_symbols);
  for (i = 0; i < back_input_size; i++)
    if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
    {
      const int d = back_input_map[i];
      ccv_nnc_tensor_symbol_t sub_f_symbol = {
        .d = *(int*)ccv_array_get(tensor_symbol_info[d].s_ref, graph_ref) - 1,
        .graph = sub_graph,
      };
      ccv_array_push(sub_f_symbols, &sub_f_symbol);
    }
  // Go through all its assignments (parameterized loop), making them either wrt or f.
  // The reason is these must flow through the graph, otherwise we cannot form a full
  // enclosed loop. Also because they are the additional f / wrt symbols, there is
  // no case that we cannot find their corresponding gradients in the backward sub graphs
  // (these gradients have to be parameterized to form an enclosed loop as well).
  for (i = 0; i < sub_graph->tensor_symbol_info->rnum; i++)
  {
    const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i);
    if (tensor_symbol_info->assign_ref)
    {
      const int assign_ref = tensor_symbol_info->assign_ref - 1;
      // i is the wrt, assign_ref is the f.
      int flag = 0;
      for (j = 0; !flag && j < sub_wrt_symbols->rnum; j++)
        flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, j))->d == i);
      if (!flag)
      {
        ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
          .d = i,
          .graph = sub_graph,
        };
        ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
      }
      flag = 0;
      for (j = 0; !flag && j < sub_f_symbols->rnum; j++)
        flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, j))->d == assign_ref);
      if (!flag)
      {
        ccv_nnc_tensor_symbol_t sub_f_symbol = {
          .d = assign_ref,
          .graph = sub_graph,
        };
        ccv_array_push(sub_f_symbols, &sub_f_symbol);
      }
    }
  }
}
785
786
// Check whether for a given f_symbol, we can compute wrt_symbols at all, if we can, tag the minimal io and ops (some ops can be replaced with noop) required to do so.
787
static int _ccv_nnc_symbolic_graph_backward_prep_prune_ops(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
788
6.71k
{
789
6.71k
  int i, j, p;
790
6.71k
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
791
6.71k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
792
6.71k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info =backward_prep->tensor_symbol_info;
793
6.71k
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
794
6.71k
  // Now, for each one of these, find a reverse graph.
795
6.71k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
796
6.71k
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
797
6.71k
  // Find the f_symbols, and tag its flows.
798
18.0k
  ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
799
18.0k
    int f = node->f_wrt & 0x1;
800
25.0k
    for (i = 0; i < exec_symbol_info[idx].output_size && 
!f18.2k
;
i++6.98k
)
801
6.98k
    {
802
6.98k
      int d = exec_symbol_info[idx].outputs[i];
803
6.98k
      if (d < 0)
804
204
        continue;
805
6.77k
      
while (6.77k
tensor_symbol_info[d].alias_ref)
806
3
        d = tensor_symbol_info[d].alias_ref - 1;
807
13.5k
      for (j = 0; j < f_symbol_size && 
!f6.80k
;
j++6.79k
)
808
6.79k
        if (d == f_symbols[j].d)
809
6.73k
          f = 1;
810
6.77k
    }
811
18.0k
    if (f)
812
18.0k
    {
813
18.0k
      node->f_wrt |= f;
814
18.0k
      if (node->outgoings)
815
22.5k
        
for (i = 0; 11.2k
i < node->outgoings->rnum;
i++11.3k
)
816
11.3k
        {
817
11.3k
          int d = *(int*)ccv_array_get(node->outgoings, i);
818
11.3k
          backward_info[d].f_wrt |= f;
819
11.3k
        }
820
18.0k
    }
821
18.0k
  } ccv_nnc_graph_visit_endfor
822
6.71k
  // Find the wrt_symbols, and tag its flows.
823
18.0k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
824
18.0k
    int wrt = backward_info[idx].f_wrt & 0x2;
825
28.9k
    for (i = 0; i < node->input_size && 
!wrt26.4k
;
i++10.8k
)
826
10.8k
    {
827
10.8k
      int d = node->inputs[i];
828
10.8k
      if (d < 0)
829
0
        continue;
830
10.8k
      
while (10.8k
tensor_symbol_info[d].alias_ref)
831
5
        d = tensor_symbol_info[d].alias_ref - 1;
832
24.4k
      for (j = 0; j < wrt_symbol_size && 
!wrt13.6k
;
j++13.6k
)
833
13.6k
        if (d == wrt_symbols[j].d)
834
6.78k
          wrt = 0x2;
835
10.8k
    }
836
18.0k
    if (wrt)
837
18.0k
    {
838
18.0k
      backward_info[idx].f_wrt |= wrt;
839
18.0k
      if (node->outgoings)
840
22.6k
        
for (i = 0; 11.3k
i < node->outgoings->rnum;
i++11.3k
)
841
11.3k
        {
842
11.3k
          int d = *(int*)ccv_array_get(node->outgoings, i);
843
11.3k
          backward_info[d].f_wrt |= wrt;
844
11.3k
        }
845
18.0k
    }
846
18.0k
  } ccv_nnc_graph_visit_endfor
847
6.71k
  enum {
848
6.71k
    WRT_SYMBOL_USE = 1,
849
6.71k
    F_SYMBOL_USE = 2
850
6.71k
  };
851
6.71k
  uint8_t* used_grad = (uint8_t*)cccalloc(tensor_symbol_info_size, sizeof(uint8_t));
852
6.71k
  // First, all f_symbols and wrt_symbols are used.
853
13.4k
  for (i = 0; i < f_symbol_size; 
i++6.73k
)
854
6.73k
    if (f_symbols[i].d >= 0)
855
6.73k
      used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? 
tensor_symbol_info[f_symbols[i].d].alias_ref - 10
: f_symbols[i].d] |= F_SYMBOL_USE;
856
16.1k
  for (i = 0; i < wrt_symbol_size; 
i++9.45k
)
857
9.45k
    if (wrt_symbols[i].d >= 0)
858
9.45k
      used_grad[tensor_symbol_info[wrt_symbols[i].d].alias_ref ? 
tensor_symbol_info[wrt_symbols[i].d].alias_ref - 10
: wrt_symbols[i].d] |= WRT_SYMBOL_USE;
859
6.71k
  // Do optimistic assumption, and then compute used_grad
860
18.0k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
861
18.0k
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
862
18.0k
    /* Only interested in the ones on the f / wrt flow */
863
18.0k
    if ((node->f_wrt & 0x3) == 0x3)
864
17.9k
    {
865
17.9k
      const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
866
17.9k
      ccv_nnc_cmd_t cmd = forw_exec->cmd;
867
17.9k
      if (cmd.cmd != CCV_NNC_NOOP)
868
17.9k
        cmd.cmd += 1; /* Backward command is the one after forward command. */
869
17.9k
      assert(ccv_nnc_cmd_is_backward(cmd) || cmd.cmd == CCV_NNC_NOOP);
870
88.3k
      
for (i = 0; 17.9k
i < forw_exec->output_size * 2 + forw_exec->input_size;
i++70.3k
)
871
70.3k
        if (!(i >= forw_exec->output_size && 
i < forw_exec->output_size + forw_exec->input_size51.9k
&&
872
70.3k
          
forw_exec->inputs[i - forw_exec->output_size] < 033.6k
) && // If the input is empty, no need.
873
70.3k
          
!(70.3k
i >= forw_exec->output_size + forw_exec->input_size70.3k
&&
i < forw_exec->output_size * 2 + forw_exec->input_size18.3k
&&
874
70.3k
          
forw_exec->outputs[i - forw_exec->output_size - forw_exec->input_size] < 018.3k
) && // If the output is empty, no need.
875
70.3k
          
!(70.1k
i < forw_exec->output_size70.1k
&&
forw_exec->outputs[i] < 018.3k
)) // If the output is empty for gradient, no need.
876
69.9k
          node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
877
51.6k
      for (i = 0; i < forw_exec->input_size; 
i++33.6k
)
878
33.6k
        if (!(forw_exec->inputs[i] < 0)) // If the inputs is empty, no need.
879
33.6k
          node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
880
17.9k
      int maybe_noop = 1;
881
22.0k
      for (i = 0; i < forw_exec->input_size; 
i++4.03k
)
882
22.0k
        /* See if it is used as wrt, if not, no need to run this node at all. */
883
22.0k
        if (forw_exec->inputs[i] >= 0 && 
used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref 22.0k
?
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 11.11k
:
forw_exec->inputs[i]20.9k
] & WRT_SYMBOL_USE)
884
17.9k
        {
885
17.9k
          maybe_noop = 0;
886
17.9k
          break;
887
17.9k
        }
888
17.9k
      if (maybe_noop)
889
0
      {
890
0
        for (i = 0; i < node->input_bitmask_size; i++)
891
0
          node->input_bitmasks[i] = 0;
892
0
        for (i = 0; i < node->output_bitmask_size; i++)
893
0
          node->output_bitmasks[i] = 0;
894
0
        node->output_bitmask_size = 0;
895
17.9k
      } else if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) {
896
2
        // Clear out all potential outputs if we think it is not a wrt symbols.
897
6
        for (i = 0; i < forw_exec->input_size; 
i++4
)
898
4
          if ((node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
899
4
            !(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 10
: forw_exec->inputs[i]] & WRT_SYMBOL_USE))
900
1
            node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
901
2
        // But for now, assuming we need all input gradients.
902
2
        // Clear out all inputs / outputs from forward op.
903
8
        for (i = forw_exec->output_size; i < forw_exec->output_size * 2 + forw_exec->input_size; 
i++6
)
904
6
          node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
905
17.9k
      } else if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)) {
906
15.5k
        int flag; /* Only continue if it changed */
907
28.9k
        do {
908
28.9k
          flag = 0;
909
28.9k
          /* Check if the output first */
910
86.8k
          for (i = 0; i < forw_exec->input_size; 
i++57.9k
)
911
57.9k
            /* Only try to eliminate the one that is not used. */
912
57.9k
            if ((node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
913
57.9k
              
!(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref 51.3k
?
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 1309
:
forw_exec->inputs[i]51.0k
] & WRT_SYMBOL_USE))
914
10.6k
            {
915
10.6k
              node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
916
10.6k
              /* If it worked, mark it as flagged. */
917
10.6k
              if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
918
6.53k
                flag = 1;
919
4.06k
              else /* Refit this with the bit back again. */
920
4.06k
                node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
921
10.6k
            }
922
146k
          for (i = 0; i < forw_exec->output_size * 2 + forw_exec->input_size; 
i++117k
)
923
117k
            if ((node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
924
117k
              
(91.8k
i >= forw_exec->output_size91.8k
||
925
91.8k
               
!(used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref 29.1k
?
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 137
:
forw_exec->outputs[i]29.1k
] & F_SYMBOL_USE)))
926
78.5k
            { /* Try to eliminate one of the input. */
927
78.5k
              node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
928
78.5k
              /* If it worked, mark it as flagged. */
929
78.5k
              if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
930
24.6k
                flag = 1;
931
53.8k
              else /* Refit this with the bit back again. */
932
53.8k
                node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
933
78.5k
            }
934
28.9k
        } while (flag);
935
15.5k
      }
936
36.3k
      for (i = 0; i < forw_exec->output_size; 
i++18.3k
)
937
18.3k
        if (node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
938
18.0k
          /* Mark it is used as wrt. */
939
18.0k
          used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 119
:
forw_exec->outputs[i]17.9k
] |= WRT_SYMBOL_USE;
940
51.6k
      for (i = 0; i < forw_exec->input_size; 
i++33.6k
)
941
33.6k
          /* Mark it is used as f. */
942
33.6k
        if (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
943
27.0k
          used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 11.15k
:
forw_exec->inputs[i]25.9k
] |= F_SYMBOL_USE;
944
17.9k
    }
945
18.0k
  } ccv_nnc_graph_visit_endfor
946
6.71k
  ccv_array_t* sub_f_symbols = 0;
947
6.71k
  ccv_array_t* sub_wrt_symbols = 0;
948
18.0k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
949
18.0k
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
950
18.0k
    const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
951
18.0k
    /* Only interested in the ones on the f / wrt flow */
952
18.0k
    if ((node->f_wrt & 0x3) == 0x3 && forw_exec->graph_ref_size > 0)
953
2
    {
954
2
      uint64_t stack_input_bitmasks1[node->input_bitmask_size];
955
2
      uint64_t stack_input_bitmasks2[node->input_bitmask_size];
956
2
      uint64_t* const input_bitmasks = forw_exec->graph_ref_size > 1 ? stack_input_bitmasks1 : node->input_bitmasks;
957
2
      // We collect input masks into this location.
958
2
      if (forw_exec->graph_ref_size > 1)
959
1
        memset(stack_input_bitmasks2, 0, sizeof(uint64_t) * node->input_bitmask_size);
960
6
      for (p = 0; p < forw_exec->graph_ref_size; p++)
961
4
      {
962
4
        // Reset the stack input bitmasks.
963
4
        if (forw_exec->graph_ref_size > 1)
964
3
          memcpy(stack_input_bitmasks1, node->input_bitmasks, sizeof(uint64_t) * node->input_bitmask_size);
965
4
        // Now calling it recursively until we are sure no f_symbols can be removed.
966
4
        const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[p] - 1;
967
4
        ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref;
968
4
        if (!sub_wrt_symbols)
969
2
          sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
970
2
        else
971
2
          ccv_array_clear(sub_wrt_symbols);
972
12
        for (i = 0; i < forw_exec->input_size; i++)
973
8
          if (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
974
7
          {
975
7
            const ccv_array_t* const s_refs = tensor_symbol_info[forw_exec->inputs[i]].s_ref;
976
7
            const int s_ref = s_refs && s_refs->rnum > graph_ref ? *(int*)ccv_array_get(s_refs, graph_ref) - 1 : -1;
977
7
            if (s_ref >= 0)
978
3
            {
979
3
              ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
980
3
                .d = s_ref,
981
3
                .graph = sub_prep->graph,
982
3
              };
983
3
              ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
984
3
            }
985
7
          }
986
4
        int flag; // Only continue if it changed
987
4
        do {
988
4
          flag = 0;
989
8
          for (i = 0; i < forw_exec->output_size; i++)
990
4
            // Try to reduce number of inputs for the backward graph. If it is not tagged as F_SYMBOL_USE, we can reduce it.
991
4
            // It is reducible because this sub graph may have multiple computation paths, therefore, some of these may not
992
4
            // involve our wrt symbols at all.
993
4
            if (!(used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 1 : forw_exec->outputs[i]] & F_SYMBOL_USE) &&
994
4
              input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
995
0
            { /* Try to eliminate one of the inputs. */
996
0
              input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
997
0
              if (!sub_f_symbols)
998
0
                sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
999
0
              else
1000
0
                ccv_array_clear(sub_f_symbols);
1001
0
              for (j = 0; j < forw_exec->output_size; j++)
1002
0
                if (node->input_bitmasks[j >> 6] & ((uint64_t)1 << (j & 63)))
1003
0
                {
1004
0
                  const int s_ref = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[j]].s_ref, graph_ref) - 1;
1005
0
                  assert(s_ref >= 0);
1006
0
                  ccv_nnc_tensor_symbol_t sub_f_symbol = {
1007
0
                    .d = s_ref,
1008
0
                    .graph = sub_prep->graph,
1009
0
                  };
1010
0
                  ccv_array_push(sub_f_symbols, &sub_f_symbol);
1011
0
                }
1012
0
              if (_ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph)))
1013
0
                flag = 1;
1014
0
              else /* Refit this with the bit back again. */
1015
0
                input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
1016
0
            }
1017
4
        } while (flag);
1018
4
        // I am done, need to redo above for sub_prep, and it has to be successful now.
1019
4
        if (!sub_f_symbols)
1020
2
          sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1021
2
        else
1022
2
          ccv_array_clear(sub_f_symbols);
1023
8
        for (i = 0; i < forw_exec->output_size; i++)
1024
4
          if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
1025
4
          {
1026
4
            const int s_ref = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[i]].s_ref, graph_ref) - 1;
1027
4
            assert(s_ref >= 0);
1028
4
            ccv_nnc_tensor_symbol_t sub_f_symbol = {
1029
4
              .d = s_ref,
1030
4
              .graph = sub_prep->graph,
1031
4
            };
1032
4
            ccv_array_push(sub_f_symbols, &sub_f_symbol);
1033
4
          }
1034
4
        _ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph));
1035
4
        if (forw_exec->graph_ref_size > 1)
1036
6
          for (i = 0; i < node->input_bitmask_size; i++)
1037
3
            stack_input_bitmasks2[i] |= input_bitmasks[i];
1038
4
      }
1039
2
      if (forw_exec->graph_ref_size > 1)
1040
1
        memcpy(node->input_bitmasks, stack_input_bitmasks2, sizeof(uint64_t) * node->input_bitmask_size);
1041
2
    }
1042
18.0k
  } ccv_nnc_graph_visit_endfor
1043
6.71k
  if (sub_f_symbols)
1044
2
    ccv_array_free(sub_f_symbols);
1045
6.71k
  if (sub_wrt_symbols)
1046
2
    ccv_array_free(sub_wrt_symbols);
1047
6.71k
  int flag = 1;
1048
13.4k
  for (i = 0; i < f_symbol_size && flag; i++)
1049
6.73k
    flag = (used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? tensor_symbol_info[f_symbols[i].d].alias_ref - 1 : f_symbols[i].d] & WRT_SYMBOL_USE);
1050
6.71k
  ccfree(used_grad);
1051
6.71k
  return flag;
1052
6.71k
}
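
The pruning pass above leans on one packed-bitmask idiom throughout: flag i of an input/output set lives in uint64_t word i >> 6 at bit i & 63, and each elimination attempt clears a bit, re-validates the command with ccv_nnc_cmd_bitmask, and restores the bit if the reduced i/o set is rejected, looping until a full pass changes nothing. A minimal standalone sketch of the packing idiom in plain C (the helper names are illustrative, not part of the library):

#include <stdint.h>

/* Flag i lives in 64-bit word (i >> 6), at bit position (i & 63). */
static inline void bitmask_set(uint64_t* const masks, const int i)
{
  masks[i >> 6] |= ((uint64_t)1 << (i & 63));
}

static inline void bitmask_clear(uint64_t* const masks, const int i)
{
  masks[i >> 6] &= ~((uint64_t)1 << (i & 63));
}

static inline int bitmask_test(const uint64_t* const masks, const int i)
{
  return !!(masks[i >> 6] & ((uint64_t)1 << (i & 63)));
}

A mask of n flags occupies (n + 63) / 64 words, which is what the input_bitmask_size / output_bitmask_size fields count (hence the sizeof(uint64_t) * node->input_bitmask_size copies above).
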
1053
1054
static void _ccv_nnc_symbolic_graph_backward_prep_gen(ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const int is_while, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
1055
6.71k
{
1056
6.71k
  const int exec_symbol_info_size = backward_prep->exec_symbol_info_size;
1057
6.71k
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
1058
6.71k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1059
6.71k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
1060
6.71k
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
1061
6.71k
  // Now, for each one of these, find a reverse graph.
1062
6.71k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1063
6.71k
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
1064
6.71k
  int i, j;
1065
6.71k
  // Now, only the flow from f_symbols back to wrt_symbols is of interest to us.
1066
6.71k
  // Visit the graph in reverse order, build the AD nodes.
1067
6.71k
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = (ccv_nnc_autograd_graph_exec_symbol_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_autograd_graph_exec_symbol_t));
1068
6.71k
  int max_forw_input_size = 0, max_forw_output_size = 0;
1069
24.7k
  for (i = 0; i < exec_symbol_info_size; i++)
1070
18.0k
    if ((backward_info[i].f_wrt & 0x3) == 0x3)
1071
17.9k
    {
1072
17.9k
      max_forw_input_size = ccv_max(max_forw_input_size, exec_symbol_info[i].input_size);
1073
17.9k
      max_forw_output_size = ccv_max(max_forw_output_size, exec_symbol_info[i].output_size);
1074
17.9k
      if (backward_info[i].outgoings)
1075
11.2k
      {
1076
11.2k
        // Copy over the outgoing bits.
1077
11.2k
        autograd_execs[i].outgoings = ccv_array_new(sizeof(int), backward_info[i].outgoings->rnum, 0);
1078
22.5k
        for (j = 0; j < backward_info[i].outgoings->rnum; j++)
1079
11.3k
        {
1080
11.3k
          const int d = *(int*)ccv_array_get(backward_info[i].outgoings, j);
1081
11.3k
          // Only push the outgoing node if it is in the f_wrt path.
1082
11.3k
          if ((backward_info[d].f_wrt & 0x3) == 0x3)
1083
11.3k
            ccv_array_push(autograd_execs[i].outgoings, &d);
1084
11.3k
        }
1085
11.2k
      }
1086
17.9k
    }
1087
6.71k
  int max_forw_inputs[ccv_max(1, max_forw_input_size)];
1088
6.71k
  int max_forw_outputs[ccv_max(1, max_forw_output_size)];
1089
6.71k
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = (ccv_nnc_autograd_tensor_version_t*)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_autograd_tensor_version_t));
1090
6.71k
  ccv_array_t* autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), tensor_symbol_info_size, 0);
1091
6.71k
  ccv_array_t* sum_or_set_execs = ccv_array_new(sizeof(ccv_nnc_sum_or_set_graph_exec_symbol_t), 0, 0);
1092
18.0k
  ccv_nnc_graph_visit_for(backward_visit, backward_info, back_info_node, idx) {
1093
18.0k
    /* This is required by both f flow and wrt flow, therefore, of interest to us */
1094
18.0k
    if ((back_info_node->f_wrt & 0x3) == 0x3)
1095
17.9k
    {
1096
17.9k
      const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
1097
17.9k
      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + idx;
1098
17.9k
      back_exec->cmd = forw_exec->cmd;
1099
17.9k
      if (back_exec->cmd.cmd != CCV_NNC_NOOP)
1100
17.9k
        back_exec->cmd.cmd += 1; /* Backward command is the one after forward command. */
1101
17.9k
      assert(ccv_nnc_cmd_is_backward(back_exec->cmd) || back_exec->cmd.cmd == CCV_NNC_NOOP);
1102
17.9k
      if (!back_info_node->output_bitmask_size) /* This has no output, can be a noop. */
1103
0
        back_exec->cmd.cmd = CCV_NNC_NOOP;
1104
17.9k
      else {
1105
17.9k
        int* back_input_map = max_forw_outputs;
1106
17.9k
        int* back_output_map = max_forw_inputs;
1107
17.9k
        _ccv_nnc_symbolic_graph_backward_exec_io(forw_exec, &back_input_map, &back_output_map, &back_exec->input_size, &back_exec->output_size);
1108
17.9k
        back_exec->inputs = ccmalloc(sizeof(int) * (back_exec->input_size + back_exec->output_size));
1109
17.9k
        back_exec->outputs = back_exec->inputs + back_exec->input_size;
1110
17.9k
        /* Need to compute input before we compute output */
1111
36.3k
        for (i = 0; i < back_exec->input_size; i++)
1112
18.3k
        {
1113
18.3k
          /* If we can skip this input, do that. */
1114
18.3k
          if (!(back_info_node->input_bitmasks[i >> 6] & ((uint64_t)1 << i)))
1115
383
            continue;
1116
18.0k
          const int d = back_input_map[i];
1117
18.0k
          const int alias_ref = tensor_symbol_info[d].alias_ref;
1118
18.0k
          ccv_nnc_autograd_tensor_version_t* tensor_ver = alias_ref ? autograd_tensor_versions + (alias_ref - 1) : autograd_tensor_versions + d;
1119
18.0k
          /* Initialization tensor, should correspond to f symbols */
1120
18.0k
          if (!tensor_ver->ref_version)
1121
6.72k
          {
1122
6.72k
            ccv_nnc_autograd_tensor_symbol_t tensor_sym = {};
1123
6.72k
            if (!alias_ref)
1124
6.72k
            {
1125
6.72k
              tensor_sym.d = d;
1126
6.72k
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1127
6.72k
              const ccv_nnc_tensor_ref_t tensor_ref = {
1128
6.72k
                .d = autograd_tensor_symbols->rnum - 1,
1129
6.72k
                .x = idx,
1130
6.72k
                .alias_registry = 0
1131
6.72k
              };
1132
6.72k
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1133
6.72k
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1134
6.72k
            } else {
1135
2
              tensor_sym.d = alias_ref - 1;
1136
2
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1137
2
              const ccv_nnc_tensor_ref_t tensor_ref = {
1138
2
                .d = autograd_tensor_symbols->rnum - 1,
1139
2
                .x = idx,
1140
2
                .alias_registry = ccv_array_new(sizeof(int), 1, 0)
1141
2
              };
1142
2
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1143
2
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1144
2
              tensor_sym.d = d; /* set back */
1145
2
              tensor_sym.alias_ref = tensor_ref.d + 1;
1146
2
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1147
2
              const int ad = autograd_tensor_symbols->rnum - 1;
1148
2
              ccv_array_push(tensor_ref.alias_registry, &ad);
1149
2
            }
1150
6.72k
          }
1151
18.0k
          /* The simplest case (most common), it is not an alias. */
1152
18.0k
          if (!alias_ref)
1153
17.9k
          {
1154
17.9k
            /* Even simpler, this only has one reference tensor, thus, pass this as input. */
1155
17.9k
            if (tensor_ver->c == tensor_ver->ref_version->rnum - 1)
1156
13.7k
            {
1157
13.7k
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1158
13.7k
              /* There are alias associated with this tensor ref, zero it out when this tensor is allocated. */
1159
13.7k
              /* This is required. Consider the case that we have an alias of this tensor used somewhere */
1160
13.7k
              /* on forward pass, when we compute backward, we have that alias computed first, however, its */
1161
13.7k
              /* underlying tensor is not zero initialized, and we will end up with garbage values here. */
1162
13.7k
              if (tensor_ref->alias_registry &&
1163
13.7k
                /* Loop over to see if this tensor is fully occupied to avoid extra zero step. */
1164
13.7k
                !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
1165
1
              {
1166
1
                ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1167
1
                assert(tensor_sym->alias_ref == 0);
1168
1
                tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS;
1169
1
              }
1170
13.7k
              back_exec->inputs[i] = tensor_ref->d;
1171
13.7k
            } else {
1172
4.24k
              /* Otherwise, we need to sum them up, and then pass the summed result to the computation. */
1173
4.24k
              _ccv_nnc_graph_sum_autograd_tensor_versions(idx, d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs);
1174
4.24k
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1175
4.24k
              back_exec->inputs[i] = tensor_ref->d;
1176
4.24k
            }
1177
17.9k
          } else
1178
19
            /* If this is an alias, go through all available tensor ref versions */
1179
19
            back_exec->inputs[i] = _ccv_nnc_graph_sum_autograd_tensor_versions_alias(idx, d, tensor_symbol_info, exec_symbol_info_size, tensor_symbol_info + d, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs);
1180
18.0k
        }
1181
51.6k
        for (i = 0; i < back_exec->output_size; i++)
1182
33.6k
        {
1183
33.6k
          /* If we can skip this output, do that. */
1184
33.6k
          if (!(back_info_node->output_bitmasks[i >> 6] & ((uint64_t)1 << i)))
1185
6.54k
            continue;
1186
27.0k
          const int d = back_output_map[i];
1187
27.0k
          const int alias_ref = tensor_symbol_info[d].alias_ref;
1188
27.0k
          ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1189
27.0k
            .d = d
1190
27.0k
          };
1191
27.0k
          /* The simplest case (most common), it is not an alias. */
1192
27.0k
          if (!alias_ref)
1193
25.9k
          {
1194
25.9k
            ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1195
25.9k
            const ccv_nnc_tensor_ref_t tensor_ref = {
1196
25.9k
              .d = autograd_tensor_symbols->rnum - 1,
1197
25.9k
              .x = idx,
1198
25.9k
              .exec_registry = 0,
1199
25.9k
              .alias_registry = 0
1200
25.9k
            };
1201
25.9k
            ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d;
1202
25.9k
            if (!tensor_ver->ref_version)
1203
21.6k
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1204
25.9k
            ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1205
25.9k
            back_exec->outputs[i] = tensor_ref.d;
1206
25.9k
          } else {
1207
1.15k
            /* Otherwise, in case that this is an alias, we try to find an existing one (in tensor_ver)
1208
1.15k
             * and see if it can meet the need (thus, for the tensor info / ofs, it fits). */
1209
1.15k
            ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + (alias_ref - 1);
1210
1.15k
            if (!tensor_ver->ref_version)
1211
1.11k
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1212
1.15k
            /* If a ref version already exists, check if any of these not-yet-sealed tensors have free space. */
1213
1.15k
            int found = 0;
1214
1.21k
            for (j = tensor_ver->c; !found && j < tensor_ver->ref_version->rnum; j++)
1215
64
            {
1216
64
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, j);
1217
64
              if (!_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, tensor_symbol_info + d))
1218
9
              {
1219
9
                tensor_sym.alias_ref = tensor_ref->d + 1;
1220
9
                ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1221
9
                const int ad = autograd_tensor_symbols->rnum - 1;
1222
9
                ccv_array_push(tensor_ref->alias_registry, &ad);
1223
9
                if (!tensor_ref->exec_registry)
1224
7
                  tensor_ref->exec_registry = ccv_array_new(sizeof(int), 1, 0);
1225
9
                ccv_array_push(tensor_ref->exec_registry, &idx);
1226
9
                back_exec->outputs[i] = ad;
1227
9
                found = 1;
1228
9
              }
1229
64
            }
1230
1.15k
            if (!found) /* Cannot find a tensor ref to insert, create one first */
1231
1.14k
            {
1232
1.14k
              tensor_sym.d = alias_ref - 1; /* Reference back to the non-alias. */
1233
1.14k
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1234
1.14k
              const ccv_nnc_tensor_ref_t tensor_ref = {
1235
1.14k
                .d = autograd_tensor_symbols->rnum - 1,
1236
1.14k
                .x = idx,
1237
1.14k
                .exec_registry = 0,
1238
1.14k
                .alias_registry = ccv_array_new(sizeof(int), 1, 0)
1239
1.14k
              };
1240
1.14k
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1241
1.14k
              tensor_sym.d = d; /* set back */
1242
1.14k
              tensor_sym.alias_ref = tensor_ref.d + 1;
1243
1.14k
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1244
1.14k
              const int ad = autograd_tensor_symbols->rnum - 1;
1245
1.14k
              ccv_array_push(tensor_ref.alias_registry, &ad);
1246
1.14k
              back_exec->outputs[i] = ad;
1247
1.14k
            }
1248
1.15k
          }
1249
27.0k
        }
1250
17.9k
      }
1251
17.9k
    }
1252
18.0k
  } ccv_nnc_graph_visit_endfor
1253
6.71k
  // Find all relevant wrt symbols, generate sum for them if needed.
1254
16.1k
  for (i = 0; i < wrt_symbol_size; i++)
1255
9.45k
  {
1256
9.45k
    const int d = wrt_symbols[i].d;
1257
9.45k
    if (d < 0)
1258
2
      continue;
1259
9.45k
    const int ref_d = (!tensor_symbol_info[d].alias_ref) ? d : tensor_symbol_info[d].alias_ref - 1;
1260
9.45k
    ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + ref_d;
1261
9.45k
    if (!tensor_ver->ref_version)
1262
1
    {
1263
1
      // This wrt symbol is not available at all; for this case, we set its flag to init zero.
1264
1
      const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1265
1
        .d = ref_d
1266
1
      };
1267
1
      ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1268
1
      ccv_nnc_sum_or_set_graph_exec_symbol_t set_exec = {
1269
1
        .value = 0,
1270
1
        .output = autograd_tensor_symbols->rnum - 1,
1271
1
      };
1272
1
      ccv_array_push(sum_or_set_execs, &set_exec);
1273
1
      // Insert the one to be set to zero.
1274
1
      const ccv_nnc_tensor_ref_t tensor_ref = {
1275
1
        .d = autograd_tensor_symbols->rnum - 1,
1276
1
        .x = exec_symbol_info_size + sum_or_set_execs->rnum - 1,
1277
1
      };
1278
1
      tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1279
1
      ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1280
1
      continue;
1281
1
    }
1282
9.45k
    // If it is a while loop, we need to insert an accumulator to the graph (this is expressed as an initialization tensor summed with existing results).
1283
9.45k
    // First, insert the initialization tensor if this wrt result is not used directly in the next while loop (thus, it participates in the computation, therefore, no need to accumulate).
1284
9.45k
    if (is_while && !tensor_symbol_info[ref_d].assign_ref &&
1285
9.45k
      _ccv_nnc_tensor_ref_version_find_init(tensor_ver) < 0) // If the initialization tensor is not inserted yet.
1286
1
    {
1287
1
      const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1288
1
        .d = ref_d
1289
1
      };
1290
1
      ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1291
1
      // Insert the one to be summed.
1292
1
      const ccv_nnc_tensor_ref_t tensor_ref = {
1293
1
        .d = autograd_tensor_symbols->rnum - 1,
1294
1
        .x = -1, // This denotes it is an initialization vector.
1295
1
      };
1296
1
      ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1297
1
    }
1298
9.45k
    // If there is more than one tensor in the list, it is possible to sum them up.
1299
9.45k
    if (tensor_ver->c < tensor_ver->ref_version->rnum - 1)
1300
23
      _ccv_nnc_graph_sum_autograd_tensor_versions(-1, ref_d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs);
1301
9.45k
    // The tensor version should have ref_version, and only one now (after summing up).
1302
9.45k
    assert(tensor_ver->c == tensor_ver->ref_version->rnum - 1);
1303
9.45k
  }
1304
6.71k
  // Adding additional fields to backward_prep now.
1305
6.71k
  backward_prep->autograd_execs = autograd_execs;
1306
6.71k
  backward_prep->autograd_tensor_versions = autograd_tensor_versions;
1307
6.71k
  backward_prep->autograd_tensor_symbols = autograd_tensor_symbols;
1308
6.71k
  backward_prep->sum_or_set_execs = sum_or_set_execs;
1309
6.71k
  ccv_array_t* sub_f_symbols = 0;
1310
6.71k
  ccv_array_t* sub_wrt_symbols = 0;
1311
18.0k
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
1312
18.0k
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
1313
18.0k
    const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
1314
18.0k
    /* Only interested in the ones on the f / wrt flow */
1315
18.0k
    if ((node->f_wrt & 0x3) == 0x3)
1316
17.9k
    {
1317
17.9k
      const int is_while = (forw_exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE);
1318
18.0k
      for (i = 0; i < forw_exec->graph_ref_size; i++)
1319
4
      {
1320
4
        // Now calling it recursively until we are sure no f_symbols can be removed.
1321
4
        const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[i] - 1;
1322
4
        ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref;
1323
4
        if (!sub_wrt_symbols)
1324
2
          sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1325
4
        if (!sub_f_symbols)
1326
2
          sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1327
4
        _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, node->input_bitmasks, node->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1328
4
        _ccv_nnc_symbolic_graph_backward_prep_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, is_while, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph));
1329
4
      }
1330
17.9k
    }
1331
18.0k
  } ccv_nnc_graph_visit_endfor
1332
6.71k
  if (sub_f_symbols)
1333
2
    ccv_array_free(sub_f_symbols);
1334
6.71k
  if (sub_wrt_symbols)
1335
2
    ccv_array_free(sub_wrt_symbols);
1336
6.71k
}
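
Everywhere in this generation pass, (f_wrt & 0x3) == 0x3 selects the nodes that both receive flow from an f symbol and pass flow on to a wrt symbol, while 0x4 serves as a scratch bit that the breakpoint walk further below clears and re-tags per pass. A small sketch of that flag layout; the masks match the tests used here, but the enum names and the per-bit reading are an assumption for illustration:

/* Assumed reading of the f_wrt bits, named here for illustration only. */
enum {
  F_FLOW   = 0x1, /* on a path from one of the f symbols */
  WRT_FLOW = 0x2, /* on a path to one of the wrt symbols */
  TAG_BIT  = 0x4, /* scratch bit, cleared before each breakpoint pass */
};

static inline int is_on_gradient_path(const int f_wrt)
{
  /* Both flow conditions must hold: equivalent to (f_wrt & 0x3) == 0x3. */
  return (f_wrt & (F_FLOW | WRT_FLOW)) == (F_FLOW | WRT_FLOW);
}
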
1337
1338
static void _ccv_nnc_symbolic_graph_backward_prep_free(const ccv_nnc_symbolic_graph_backward_prep_t backward_prep)
1339
6.71k
{
1340
6.71k
  int i, j;
1341
6.71k
  const int exec_symbol_info_size = backward_prep.exec_symbol_info_size;
1342
6.71k
  const int tensor_symbol_info_size = backward_prep.tensor_symbol_info_size;
1343
6.71k
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep.autograd_execs;
1344
6.71k
  if (autograd_execs)
1345
6.71k
  {
1346
24.7k
    for (i = 0; i < exec_symbol_info_size; i++)
1347
18.0k
    {
1348
18.0k
      if (autograd_execs[i].inputs)
1349
18.0k
        ccfree(autograd_execs[i].inputs);
1350
18.0k
      if (autograd_execs[i].outgoings)
1351
11.2k
        ccv_array_free(autograd_execs[i].outgoings);
1352
18.0k
    }
1353
6.71k
    ccfree(autograd_execs);
1354
6.71k
  }
1355
6.71k
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep.autograd_tensor_versions;
1356
6.71k
  if (autograd_tensor_versions)
1357
6.71k
  {
1358
45.1k
    for (i = 0; i < tensor_symbol_info_size; i++)
1359
38.4k
    {
1360
38.4k
      if (autograd_tensor_versions[i].ref_version)
1361
29.4k
      {
1362
67.5k
        for (j = 0; j < autograd_tensor_versions[i].ref_version->rnum; j++)
1363
38.0k
        {
1364
38.0k
          ccv_nnc_tensor_ref_t* ref_version = (ccv_nnc_tensor_ref_t*)ccv_array_get(autograd_tensor_versions[i].ref_version, j);
1365
38.0k
          if (ref_version->exec_registry)
1366
7
            ccv_array_free(ref_version->exec_registry);
1367
38.0k
          if (ref_version->alias_registry)
1368
1.15k
            ccv_array_free(ref_version->alias_registry);
1369
38.0k
        }
1370
29.4k
        ccv_array_free(autograd_tensor_versions[i].ref_version);
1371
29.4k
      }
1372
38.4k
    }
1373
6.71k
    ccfree(autograd_tensor_versions);
1374
6.71k
  }
1375
6.71k
  if (backward_prep.autograd_tensor_symbols)
1376
6.71k
    ccv_array_free(backward_prep.autograd_tensor_symbols);
1377
6.71k
  ccv_array_t* const sum_or_set_execs = backward_prep.sum_or_set_execs;
1378
6.71k
  if (sum_or_set_execs)
1379
6.71k
  {
1380
10.9k
    for (i = 0; i < sum_or_set_execs->rnum; i++)
1381
4.27k
    {
1382
4.27k
      ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i);
1383
4.27k
      if (sum_or_set->inputs)
1384
4.27k
        ccfree(sum_or_set->inputs);
1385
4.27k
      if (sum_or_set->outgoings)
1386
4.24k
        ccv_array_free(sum_or_set->outgoings);
1387
4.27k
    }
1388
6.71k
    ccv_array_free(sum_or_set_execs);
1389
6.71k
  }
1390
6.71k
  // Now afterwards, these are mandatory.
1391
6.71k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep.backward_info;
1392
24.7k
  for (i = 0; i < exec_symbol_info_size; i++)
1393
18.0k
  {
1394
18.0k
    if (backward_info[i].outgoings)
1395
11.2k
      ccv_array_free(backward_info[i].outgoings);
1396
18.0k
    if (backward_info[i].input_bitmasks)
1397
18.0k
      ccfree(backward_info[i].input_bitmasks);
1398
18.0k
  }
1399
6.71k
  ccfree(backward_info);
1400
6.71k
  ccv_nnc_graph_visit_free(backward_prep.backward_visit);
1401
6.71k
  ccv_nnc_graph_visit_free(backward_prep.forward_visit);
1402
6.71k
  ccfree(backward_prep.exec_symbol_info);
1403
6.71k
  ccfree(backward_prep.tensor_symbol_info);
1404
6.72k
  for (i = 0; i < backward_prep.sub_prep_size; i++)
1405
4
    _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep.sub_preps[i]);
1406
6.71k
  if (backward_prep.sub_preps)
1407
6.71k
    ccfree(backward_prep.sub_preps);
1408
6.71k
}
1409
1410
static void _ccv_nnc_add_backward_breakpoint_for_symbol(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_graph_exec_symbol_t breakpoint, ccv_nnc_symbolic_graph_t* const graph, ccv_array_t* const sub_breakpoints)
1411
1
{
1412
1
  const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
1413
1
  ccv_array_push(sub_breakpoints, &noop);
1414
1
  // Now need to hook this up to the graph.
1415
1
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1416
1
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
1417
1
  // Now, for each one of these, find a reverse graph.
1418
1
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1419
1
  int i;
1420
1
  // Clean up the high bit.
1421
4
  for (i = 0; i < backward_prep->exec_symbol_info_size; i++)
1422
3
    backward_info[i].f_wrt &= ~0x4;
1423
1
  assert((backward_info[breakpoint.d].f_wrt & 0x3) != 0x3);
1424
1
  backward_info[breakpoint.d].f_wrt |= 0x4;
1425
1
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
1426
1
  const ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs;
1427
1
  // Going forward to find whether this breakpoint is a source node to some f_wrt nodes.
1428
3
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, forw_exec, idx) {
1429
3
    ccv_nnc_graph_backward_info_t* const node = backward_info + idx;
1430
3
    // If it is tagged on breakpoint flow, but not as both f or wrt, flow through it.
1431
3
    if ((node->f_wrt & 0x4) && (node->f_wrt & 0x3) != 0x3)
1432
1
      for (i = 0; forw_exec->outgoings && i < forw_exec->outgoings->rnum; i++)
1433
0
      {
1434
0
        const int outgoing_idx = *(int*)ccv_array_get(forw_exec->outgoings, i);
1435
0
        ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx;
1436
0
        // If this is an f_wrt node, concatenate.
1437
0
        if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3)
1438
0
            ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[outgoing_idx].symbol, noop);
1439
0
        outgoing_node->f_wrt |= 0x4;
1440
0
      }
1441
3
  } ccv_nnc_graph_visit_endfor
1442
1
  // Going backward to find whether this breakpoint is a destination node for some f_wrt nodes.
1443
3
  ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
1444
3
    if ((node->f_wrt & 0x4) && (node->f_wrt & 0x3) != 0x3)
1445
2
      for (i = 0; node->outgoings && i < node->outgoings->rnum; i++)
1446
1
      {
1447
1
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
1448
1
        ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx;
1449
1
        // If this is an f_wrt node, concatenate.
1450
1
        if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3)
1451
1
            ccv_nnc_graph_exec_symbol_concat(graph, noop, autograd_execs[outgoing_idx].symbol);
1452
1
        outgoing_node->f_wrt |= 0x4;
1453
1
      }
1454
3
  } ccv_nnc_graph_visit_endfor
1455
1
}
1456
1457
static ccv_nnc_autograd_tensor_symbol_t* _ccv_nnc_autograd_tensor_symbol_from_tensor_version(ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_autograd_tensor_version_t* const tensor_ver)
1458
7
{
1459
7
  assert(tensor_ver->ref_version);
1460
7
  const ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1461
7
  return (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1462
7
}
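
This accessor can always read the c-th entry because the passes above maintain the invariant asserted earlier: once pending gradient versions are summed, c points at the single current version (tensor_ver->c == tensor_ver->ref_version->rnum - 1). A minimal sketch of that accumulate-then-advance pattern, with hypothetical fixed-size types standing in for the ccv_array_t-based version list (the real code folds versions via _ccv_nnc_graph_sum_autograd_tensor_versions):

/* Hypothetical, simplified version list: d[] holds symbol refs, c is the
 * index of the first non-accumulated version. */
typedef struct {
  int c;
  int rnum;
  int d[16]; /* fixed capacity for brevity; the real list grows dynamically */
} version_list_t;

/* fold stands in for the sum step over versions [begin, end); it returns
 * the symbol ref of the freshly created summed version. */
static int current_version(version_list_t* const ver, int (*fold)(const int* d, int begin, int end))
{
  if (ver->c < ver->rnum - 1) /* more than one pending version: sum them */
  {
    const int summed = fold(ver->d, ver->c, ver->rnum);
    ver->d[ver->rnum] = summed; /* the sum is appended as a new version */
    ver->rnum += 1;
    ver->c = ver->rnum - 1; /* only the summed version is current now */
  }
  return ver->d[ver->c]; /* invariant: c == rnum - 1 */
}
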
1463
1464
static void _ccv_nnc_symbolic_graph_set_backward_carry_overs(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph)
1465
1
{
1466
1
  int i;
1467
5
  for (i = 0; i < backward_prep->graph->tensor_symbol_info->rnum; i++)
1468
4
  {
1469
4
    const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info + i;
1470
4
    if (tensor_symbol_info->assign_ref)
1471
1
    {
1472
1
      const int assign_ref = tensor_symbol_info->assign_ref - 1;
1473
1
      ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + assign_ref);
1474
1
      ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + i);
1475
1
      ccv_nnc_symbolic_graph_set_carry_overs(graph, (ccv_nnc_tensor_symbol_map_t []){
1476
1
        { .source = source_autograd_symbol->symbol, .destination = destination_autograd_symbol->symbol }
1477
1
      }, 1);
1478
1
    }
1479
4
  }
1480
3
  for (i = 0; i < wrt_symbol_size; i++)
1481
2
  {
1482
2
    const int d = wrt_symbols[i].d;
1483
2
    if (d < 0)
1484
0
      continue;
1485
2
    const int ref_d = (!backward_prep->tensor_symbol_info[d].alias_ref) ? d : backward_prep->tensor_symbol_info[d].alias_ref - 1;
1486
2
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = backward_prep->autograd_tensor_versions + ref_d;
1487
2
    const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver);
1488
2
    if (init_ref_ver >= 0)
1489
1
    {
1490
1
      const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d;
1491
1
      ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(backward_prep->autograd_tensor_symbols, init_d);
1492
1
      ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + ref_d);
1493
1
      ccv_nnc_symbolic_graph_set_carry_overs(graph, (ccv_nnc_tensor_symbol_map_t []){
1494
1
        { .source = source_autograd_symbol->symbol, .destination = destination_autograd_symbol->symbol }
1495
1
      }, 1);
1496
1
    }
1497
2
  }
1498
1
}
1499
1500
static void _ccv_nnc_symbolic_graph_add_init_zeros(const ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, ccv_array_t* const symbols)
1501
1
{
1502
1
  int i;
1503
3
  for (i = 0; i < wrt_symbol_size; i++)
1504
2
  {
1505
2
    const int d = wrt_symbols[i].d;
1506
2
    if (d < 0)
1507
0
      continue;
1508
2
    const int ref_d = (!sub_prep->tensor_symbol_info[d].alias_ref) ? d : sub_prep->tensor_symbol_info[d].alias_ref - 1;
1509
2
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = sub_prep->autograd_tensor_versions + ref_d;
1510
2
    const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver);
1511
2
    if (init_ref_ver >= 0)
1512
1
    {
1513
1
      // Need de-dup logic.
1514
1
      const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d;
1515
1
      ccv_nnc_autograd_tensor_symbol_t* const init_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(sub_prep->autograd_tensor_symbols, init_d);
1516
1
      const ccv_nnc_tensor_symbol_info_t* const sub_init_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, init_autograd_symbol->symbol.d);
1517
1
      // If it doesn't have a parent ref yet, create one.
1518
1
      if (!sub_init_symbol_info->p_ref)
1519
1
      {
1520
1
        ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, sub_prep->tensor_symbol_info[ref_d].info, 0);
1521
1
        ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
1522
1
        ccv_array_push(symbols, &new_symbol);
1523
1
        ccv_nnc_tensor_symbol_hookup(graph, sub_graph, new_symbol, init_autograd_symbol->symbol);
1524
1
      }
1525
1
    }
1526
2
  }
1527
1
}
1528
1529
static void _ccv_nnc_symbolic_graph_add_tape_vars(const ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep, ccv_nnc_symbolic_graph_t* const root, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, ccv_array_t* const symbols)
1530
4
{
1531
4
  int i;
1532
24
  for (i = 0; i < sub_graph->tensor_symbol_info->rnum; i++)
1533
20
  {
1534
20
    const ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i);
1535
20
    if ((symbol_info->flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && symbol_info->pair_ref)
1536
7
    {
1537
7
      const int pair_ref = symbol_info->pair_ref - 1;
1538
7
      const ccv_nnc_tensor_symbol_t root_symbol = ccv_nnc_tensor_symbol_resolve(root, (ccv_nnc_tensor_symbol_t){
1539
7
        .d = pair_ref,
1540
7
        .graph = sub_prep->graph,
1541
7
      });
1542
7
      if (root_symbol.d >= 0)
1543
3
      {
1544
3
        ccv_nnc_tensor_symbol_hookup(root, sub_graph, root_symbol, (ccv_nnc_tensor_symbol_t){
1545
3
          .d = i,
1546
3
          .graph = sub_graph,
1547
3
        });
1548
3
        if (symbols)
1549
2
        {
1550
2
          const ccv_nnc_tensor_symbol_t p_symbol = ccv_nnc_tensor_symbol_resolve(graph, (ccv_nnc_tensor_symbol_t){
1551
2
            .d = i,
1552
2
            .graph = sub_graph,
1553
2
          });
1554
2
          ccv_array_push(symbols, &p_symbol);
1555
2
        }
1556
3
      }
1557
7
    }
1558
20
  }
1559
4
}
1560
1561
static void _ccv_nnc_symbolic_graph_backward_gen(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const root)
1562
6.71k
{
1563
6.71k
  assert(graph == backward_prep->graph || graph->pair == backward_prep->graph);
1564
6.71k
  const int exec_symbol_info_size = backward_prep->exec_symbol_info_size;
1565
6.71k
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
1566
6.71k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1567
6.71k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
1568
6.71k
  int i, j, k, p;
1569
6.71k
  ccv_array_t* const autograd_tensor_symbols = backward_prep->autograd_tensor_symbols;
1570
6.71k
  // Generate required symbols based on the information gathered above.
1571
45.9k
  for (i = 0; i < autograd_tensor_symbols->rnum; i++)
1572
39.2k
  {
1573
39.2k
    ccv_nnc_autograd_tensor_symbol_t* symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, i);
1574
39.2k
    assert(symbol->d >= 0);
1575
39.2k
    assert(symbol->d < tensor_symbol_info_size);
1576
39.2k
    const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + symbol->d;
1577
39.2k
    if (!symbol->alias_ref)
1578
38.0k
    {
1579
38.0k
      assert(!forw_symbol->alias_ref);
1580
38.0k
      symbol->symbol = ccv_nnc_tensor_symbol_new(graph, forw_symbol->info, 0);
1581
38.0k
      ccv_nnc_tensor_symbol_set_flags(graph, symbol->symbol, symbol->flags);
1582
38.0k
    } else {
1583
1.17k
      assert(forw_symbol->alias_ref);
1584
1.17k
      assert(symbol->flags == 0); // We don't set flags on alias.
1585
1.17k
      // Due to our generation order, this must be after the original symbol is created.
1586
1.17k
      ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, symbol->alias_ref - 1);
1587
1.17k
      symbol->symbol = ccv_nnc_tensor_symbol_alias_new(graph, ref->symbol, forw_symbol->ofs, forw_symbol->inc, forw_symbol->info, 0);
1588
1.17k
    }
1589
39.2k
  }
1590
6.71k
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1591
6.71k
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs;
1592
6.71k
  ccv_array_t* symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1593
6.71k
  ccv_array_t* symbol_map = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_map_t), 0, 0);
1594
6.71k
  ccv_array_t* sub_f_symbols = 0;
1595
6.71k
  ccv_array_t* sub_wrt_symbols = 0;
1596
6.71k
  ccv_array_t* sub_execs = 0;
1597
24.7k
  for (i = 0; i < exec_symbol_info_size; i++)
1598
18.0k
  {
1599
18.0k
    // This is not going to be an interesting node. Skip.
1600
18.0k
    if ((backward_info[i].f_wrt & 0x3) != 0x3)
1601
76
      continue;
1602
17.9k
    ccv_nnc_graph_backward_info_t* const back_info = backward_info + i;
1603
17.9k
    ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
1604
17.9k
    if (back_exec->cmd.cmd == CCV_NNC_NOOP)
1605
1
    {
1606
1
      back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, 0, 0, 0, 0, 0);
1607
1
      continue;
1608
1
    }
1609
17.9k
    const ccv_nnc_graph_exec_symbol_info_t* const forw_exec = exec_symbol_info + i;
1610
17.9k
    if (forw_exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1611
1
    {
1612
1
      ccv_array_clear(symbols);
1613
1
      const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[0] - 1;
1614
1
      ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref;
1615
1
      ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new();
1616
1
      sub_graph->pair = sub_prep->graph;
1617
1
      if (!sub_wrt_symbols)
1618
1
        sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1619
1
      // I am done, need to redo above for sub_prep, and it has to be successful now.
1620
1
      if (!sub_f_symbols)
1621
1
        sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1622
1
      _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1623
1
      _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph, root);
1624
1
      back_exec->symbol = ccv_nnc_symbolic_graph_while(graph, back_exec->cmd.cmd, sub_graph, forw_exec->name);
1625
1
      if (!sub_execs)
1626
1
        sub_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0);
1627
1
      ccv_array_clear(sub_execs);
1628
1
      // Find the breakpoints in forward graph, creating the reverse one.
1629
2
      for (j = 0; j < sub_prep->graph->breakpoint_size; j++)
1630
1
      {
1631
1
        const int d = sub_prep->graph->breakpoints[j].d;
1632
1
        if (sub_prep->autograd_execs[d].symbol.graph)
1633
0
          ccv_array_push(sub_execs, &sub_prep->autograd_execs[d].symbol);
1634
1
        else
1635
1
          _ccv_nnc_add_backward_breakpoint_for_symbol(sub_prep, sub_prep->graph->breakpoints[j], sub_graph, sub_execs);
1636
1
      }
1637
1
      ccv_nnc_symbolic_graph_set_while_expr(sub_graph, NOOP_GRAPH_WHILE_EXPR, 0, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_execs, 0), sub_execs->rnum);
1638
1
      ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1639
1
      _ccv_nnc_symbolic_graph_set_backward_carry_overs(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph);
1640
2
      for (j = 0; j < back_exec->input_size; j++)
1641
1
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1642
1
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
1643
1
      // Find whether, among the wrt symbols, anything needs to be initialized to zero; if so, these need to be inputs here too.
1644
1
      _ccv_nnc_symbolic_graph_add_init_zeros(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, graph, sub_graph, symbols);
1645
1
      _ccv_nnc_symbolic_graph_add_tape_vars(sub_prep, root, graph, sub_graph, symbols);
1646
1
      // input_size at this point may be different from back_exec->input_size, because we may have added zeroing tensors as input tensors.
1647
1
      const int input_size = symbols->rnum;
1648
3
      for (j = 0; j < back_exec->output_size; j++)
1649
2
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1650
1
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
1651
1
      const int output_size = symbols->rnum - input_size;
1652
1
      const int p_idx = sub_prep->graph->p_idx - 1;
1653
1
      assert(back_exec->input_size == forw_exec->output_size);
1654
1
      k = 0;
1655
2
      for (j = 0; j < back_exec->input_size; j++)
1656
1
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1657
1
        {
1658
1
          const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->outputs[j];
1659
1
          const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1;
1660
1
          assert(s_idx >= 0);
1661
1
          const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx);
1662
1
          ccv_nnc_tensor_symbol_hookup(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol);
1663
1
          ++k;
1664
1
        }
1665
1
      k = input_size; // Reset k, the symbol pass already set up by add_init_zeros.
1666
1
      assert(back_exec->output_size == forw_exec->input_size);
1667
3
      for (j = 0; j < back_exec->output_size; j++)
1668
2
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1669
1
        {
1670
1
          const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->inputs[j];
1671
1
          const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1;
1672
1
          assert(s_idx >= 0);
1673
1
          const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx);
1674
1
          ccv_nnc_tensor_symbol_hookup(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol);
1675
1
          ++k;
1676
1
        }
1677
1
      ccv_nnc_graph_exec_symbol_set_io(graph, back_exec->symbol, ccv_array_get(symbols, 0), input_size, ccv_array_get(symbols, input_size), output_size);
1678
17.9k
    } else if (forw_exec->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
1679
1
      ccv_array_clear(symbol_map);
1680
2
      for (j = 0; j < back_exec->output_size; j++)
1681
1
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1682
1
        {
1683
1
          ccv_nnc_tensor_symbol_map_t symbol = {
1684
1
            .source = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol,
1685
1
            .destination = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol,
1686
1
          };
1687
1
          ccv_array_push(symbol_map, &symbol);
1688
1
        }
1689
1
      const int symbol_map_size = symbol_map->rnum;
1690
1
      back_exec->symbol = ccv_nnc_symbolic_graph_case_of_new(graph, back_exec->cmd.cmd, 0, 0, ccv_array_get(symbol_map, 0), symbol_map_size, forw_exec->name);
1691
1
      ccv_nnc_symbolic_graph_set_case_of_expr(graph, back_exec->symbol, NOOP_GRAPH_CASE_OF_EXPR, 0);
1692
4
      for (p = 0; p < forw_exec->graph_ref_size; p++)
1693
3
      {
1694
3
        const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[p] - 1;
1695
3
        ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref;
1696
3
        ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new();
1697
3
        sub_graph->pair = sub_prep->graph;
1698
3
        if (!sub_wrt_symbols)
1699
1
          sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1700
3
        // I am done, need to redo above for sub_prep, and it has to be successful now.
1701
3
        if (!sub_f_symbols)
1702
1
          sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1703
3
        _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1704
3
        _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph, root);
1705
3
        ccv_array_clear(symbol_map);
1706
3
        k = 0;
1707
6
        for (j = 0; j < back_exec->output_size; j++)
1708
3
          if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1709
3
          {
1710
3
            const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, k))->d;
1711
3
            if (d >= 0)
1712
1
            {
1713
1
              const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + d);
1714
1
              ccv_nnc_tensor_symbol_map_t symbol = {
1715
1
                .source = autograd_symbol->symbol,
1716
1
                .destination = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol,
1717
1
              };
1718
1
              ccv_array_push(symbol_map, &symbol);
1719
2
            } else {
1720
2
              // Create a new tensor in sub-graph and set it to be 0.
1721
2
              const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]);
1722
2
              // autograd_symbol->d points to the corresponding forward tensor.
1723
2
              ccv_nnc_tensor_symbol_t zero_symbol = ccv_nnc_tensor_symbol_new(sub_graph, tensor_symbol_info[autograd_symbol->d].info, 0);
1724
2
              ccv_nnc_graph_exec_symbol_new(sub_graph, CMD_SET_FORWARD(0), 0, 0, &zero_symbol, 1, 0);
1725
2
              ccv_nnc_tensor_symbol_map_t symbol = {
1726
2
                .source = zero_symbol,
1727
2
                .destination = autograd_symbol->symbol,
1728
2
              };
1729
2
              ccv_array_push(symbol_map, &symbol);
1730
2
            }
1731
3
            ++k;
1732
3
          }
1733
3
        ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1734
3
        const int symbol_map_size = symbol_map->rnum;
1735
3
        ccv_nnc_symbolic_graph_set_case_of(graph, back_exec->symbol, sub_graph, p, ccv_array_get(symbol_map, 0), symbol_map_size);
1736
3
        // Hookup input only after this becomes a sub graph of the graph.
1737
3
        k = 0;
1738
6
        for (j = 0; j < back_exec->input_size; j++)
1739
3
          if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1740
3
          {
1741
3
            const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, k))->d;
1742
3
            assert(d >= 0);
1743
3
            // No corresponding sub tensors allocated. Skip.
1744
3
            if (!sub_prep->autograd_tensor_versions[d].ref_version ||
1745
3
              !sub_prep->autograd_tensor_versions[d].ref_version->rnum)
1746
2
              continue;
1747
1
            const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + d);
1748
1
            ccv_nnc_tensor_symbol_hookup(graph, sub_graph, ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol, autograd_symbol->symbol);
1749
1
            ++k;
1750
1
          }
1751
3
        // Need to make sure tape vars are hooked up.
1752
3
        _ccv_nnc_symbolic_graph_add_tape_vars(sub_prep, root, graph, sub_graph, 0);
1753
3
      }
1754
17.9k
    } else {
1755
17.9k
      ccv_array_clear(symbols);
1756
17.9k
      // Gradient inputs.
1757
36.3k
      for (j = 0; j < back_exec->input_size; j++)
1758
18.3k
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1759
18.0k
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
1760
383
        else
1761
383
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1762
17.9k
      // Inputs from forward function.
1763
51.5k
      for (j = 0; j < forw_exec->input_size; j++)
1764
33.6k
        if (!(back_info->input_bitmasks[(j + back_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size))))
1765
11.4k
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1766
22.1k
        else {
1767
22.1k
          const ccv_nnc_tensor_symbol_t symbol = {
1768
22.1k
            .d = forw_exec->inputs[j],
1769
22.1k
            .graph = backward_prep->graph
1770
22.1k
          };
1771
22.1k
          if (graph == backward_prep->graph)
1772
22.1k
            ccv_array_push(symbols, &symbol);
1773
5
          else { // Otherwise, create a new symbol, and set its pair to the old symbol.
1774
5
            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->inputs[j]].info, tensor_symbol_info[forw_exec->inputs[j]].name);
1775
5
            ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, symbol);
1776
5
            const int flags = ccv_nnc_tensor_symbol_flags(backward_prep->graph, symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR;
1777
5
            ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
1778
5
            ccv_nnc_tensor_symbol_set_flags(backward_prep->graph, symbol, flags);
1779
5
            ccv_array_push(symbols, &new_symbol);
1780
5
          }
1781
22.1k
        }
1782
17.9k
      // Outputs from forward function.
1783
36.3k
      for (j = 0; j < forw_exec->output_size; j++)
1784
18.3k
        if (!(back_info->input_bitmasks[(j + back_exec->input_size + forw_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size + forw_exec->input_size))))
1785
13.3k
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1786
5.07k
        else {
1787
5.07k
          const ccv_nnc_tensor_symbol_t symbol = {
1788
5.07k
            .d = forw_exec->outputs[j],
1789
5.07k
            .graph = backward_prep->graph
1790
5.07k
          };
1791
5.07k
          if (graph == backward_prep->graph)
1792
5.07k
            ccv_array_push(symbols, &symbol);
1793
2
          else { // Otherwise, create a new symbol, and set its pair to the old symbol.
1794
2
            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->outputs[j]].info, tensor_symbol_info[forw_exec->outputs[j]].name);
1795
2
            ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, symbol);
1796
2
            const int flags = ccv_nnc_tensor_symbol_flags(backward_prep->graph, symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR;
1797
2
            ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
1798
2
            ccv_nnc_tensor_symbol_set_flags(backward_prep->graph, symbol, flags);
1799
2
            ccv_array_push(symbols, &new_symbol);
1800
2
          }
1801
5.07k
        }
1802
51.5k
      for (j = 0; j < back_exec->output_size; j++)
1803
33.6k
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1804
27.0k
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
1805
6.54k
        else
1806
6.54k
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1807
17.9k
      back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, ccv_array_get(symbols, 0), back_exec->input_size + forw_exec->input_size + forw_exec->output_size, ccv_array_get(symbols, back_exec->input_size + forw_exec->input_size + forw_exec->output_size), back_exec->output_size, 0);
1808
17.9k
      ccv_nnc_graph_exec_symbol_set_hint(graph, back_exec->symbol, exec_symbol_info[i].hint);
1809
17.9k
      ccv_nnc_graph_exec_symbol_pair_with(graph, back_exec->symbol, (ccv_nnc_graph_exec_symbol_t){
1810
17.9k
        .d = i,
1811
17.9k
        .graph = backward_prep->graph,
1812
17.9k
      });
1813
17.9k
    }
1814
17.9k
  }
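  /* An illustration with a hypothetical op (not taken from this file): the symbols
   * array assembled above follows a fixed positional layout. For a forward op
   * y = f(a, b), the backward exec is wired as
   *   inputs:  [dy | a, b | y]   (gradient inputs, forward inputs, forward outputs)
   *   outputs: [da, db]
   * and every slot the bitmasks mark as unused is filled with NO_TENSOR_SYMBOL, so
   * ccv_nnc_graph_exec_symbol_new always receives input_size + forw input_size +
   * forw output_size input slots in that order. */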
1815
6.71k
  if (sub_f_symbols)
1816
2
    ccv_array_free(sub_f_symbols);
1817
6.71k
  if (sub_wrt_symbols)
1818
2
    ccv_array_free(sub_wrt_symbols);
1819
6.71k
  if (sub_execs)
1820
1
    ccv_array_free(sub_execs);
1821
6.71k
  ccv_array_t* const sum_or_set_execs = backward_prep->sum_or_set_execs;
1822
10.9k
  for (i = 0; i < sum_or_set_execs->rnum; i++)
1823
4.27k
  {
1824
4.27k
    ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set_exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i);
1825
4.27k
    // It is a sum; a set doesn't have inputs.
1826
4.27k
    if (sum_or_set_exec->input_size)
1827
4.27k
    {
1828
4.27k
      ccv_array_clear(symbols);
1829
4.27k
      // These are the inputs to sum.
1830
12.8k
      for (j = 0; j < sum_or_set_exec->input_size; j++)
1831
8.56k
        ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->inputs[j]))->symbol));
1832
4.27k
      ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0);
1833
4.27k
      sum_or_set_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, ccv_array_get(symbols, 0), sum_or_set_exec->input_size, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->output))->symbol), 1, 0);
1834
4.27k
    } else
1835
1
      sum_or_set_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, CMD_SET_FORWARD(sum_or_set_exec->value), 0, 0, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->output))->symbol), 1, 0);
1836
4.27k
  }
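  /* An illustration with hypothetical gradients: if three partial gradients g0, g1
   * and g2 flow into the same tensor, the loop above emits one
   * CCV_NNC_EWSUM_FORWARD exec computing g = g0 + g1 + g2; the input-less variant
   * instead initializes the gradient to a constant via CMD_SET_FORWARD(value). */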
1837
6.71k
  ccv_array_free(symbol_map);
1838
6.71k
  ccv_array_free(symbols);
1839
24.7k
  for (i = 0; i < exec_symbol_info_size; i++)
1840
18.0k
  {
1841
18.0k
    // This is not going to be an interesting node. Skip.
1842
18.0k
    if ((backward_info[i].f_wrt & 0x3) != 0x3)
1843
76
      continue;
1844
17.9k
    ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
1845
17.9k
    // If on the same graph, we cannot decide whether it comes before or after the forw_exec, so enforce that it comes after the forw_exec.
1846
17.9k
    if (graph == backward_prep->graph)
1847
17.9k
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
1848
17.9k
        .d = i,
1849
17.9k
        .graph = graph
1850
17.9k
      }, back_exec->symbol);
1851
17.9k
    if (back_exec->outgoings)
1852
22.6k
      
      for (j = 0; j < back_exec->outgoings->rnum; j++)
1853
11.3k
      {
1854
11.3k
        int d = *(int*)ccv_array_get(back_exec->outgoings, j);
1855
11.3k
        if (d < exec_symbol_info_size)
1856
7.04k
          ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, autograd_execs[d].symbol);
1857
4.34k
        else
1858
4.34k
          ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, d - exec_symbol_info_size))->symbol);
1859
11.3k
      }
1860
17.9k
  }
1861
10.9k
  for (i = 0; i < sum_or_set_execs->rnum; i++)
1862
4.27k
  {
1863
4.27k
    ccv_nnc_sum_or_set_graph_exec_symbol_t* exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i);
1864
4.27k
    if (exec->outgoings)
1865
8.49k
      
      for (j = 0; j < exec->outgoings->rnum; j++)
1866
4.24k
      {
1867
4.24k
        int d = *(int*)ccv_array_get(exec->outgoings, j);
1868
4.24k
        if (d < exec_symbol_info_size)
1869
4.24k
          ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, autograd_execs[d].symbol);
1870
0
        else
1871
0
          ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, d - exec_symbol_info_size))->symbol);
1872
4.24k
      }
1873
4.27k
  }
1874
6.71k
  // Now that everything is done, set the metadata on the graph so that we can look up backward symbols later.
1875
6.71k
  if (graph->backward.tensor_symbol_idx)
1876
4.40k
    graph->backward.tensor_symbol_idx = (int*)ccrealloc(graph->backward.tensor_symbol_idx, sizeof(int) * (graph->tensor_symbol_info->rnum + tensor_symbol_info_size));
1877
2.31k
  else
1878
2.31k
    graph->backward.tensor_symbol_idx = (int*)ccmalloc(sizeof(int) * (graph->tensor_symbol_info->rnum + tensor_symbol_info_size));
1879
6.71k
  graph->backward.tensor_symbol_size = tensor_symbol_info_size;
1880
6.71k
  graph->backward.exec_symbol_idx = graph->backward.tensor_symbol_idx + tensor_symbol_info_size;
1881
6.71k
  graph->backward.exec_symbol_size = graph->tensor_symbol_info->rnum;
1882
45.1k
  for (i = 0; i < tensor_symbol_info_size; i++)
1883
38.4k
    graph->backward.tensor_symbol_idx[i] = -1;
1884
84.3k
  for (i = 0; i < graph->backward.exec_symbol_size; i++)
1885
77.6k
    graph->backward.exec_symbol_idx[i] = -1;
1886
6.71k
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep->autograd_tensor_versions;
1887
6.71k
  // Assign gradient symbols for the wrt symbols.
1888
16.1k
  for (i = 0; i < wrt_symbol_size; i++)
1889
9.45k
  {
1890
9.45k
    const int d = wrt_symbols[i].d;
1891
9.45k
    if (d < 0)
1892
2
      continue;
1893
9.45k
    assert(d < tensor_symbol_info_size);
1894
9.45k
    // If this wrt symbol is an alias, create an extra alias for it.
1895
9.45k
    ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + d;
1896
9.45k
    assert(tensor_ver->ref_version);
1897
9.45k
    ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1898
9.45k
    ccv_nnc_autograd_tensor_symbol_t* autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1899
9.45k
    graph->backward.tensor_symbol_idx[d] = autograd_symbol->symbol.d;
1900
9.45k
    const int dd = autograd_symbol->symbol.d;
1901
9.45k
    const int x = tensor_ref->x;
1902
9.45k
    if (tensor_ref->exec_registry && tensor_ref->exec_registry->rnum) // Create no-op node.
1903
2
    {
1904
2
      ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
1905
2
      if (x < exec_symbol_info_size)
1906
2
        ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
1907
0
      else
1908
0
        ccv_nnc_graph_exec_symbol_concat(graph, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size))->symbol, noop);
1909
6
      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
1910
4
      {
1911
4
        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
1912
4
        assert(x >= 0); /* Otherwise, this is an initialization tensor, which cannot be an input to a sum. */
1913
4
        assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry; it simply cannot reference a sum operation.
1914
4
        ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
1915
4
      }
1916
2
      graph->backward.exec_symbol_idx[dd] = noop.d;
1917
9.45k
    } else {
1918
9.45k
      if (x < exec_symbol_info_size)
1919
9.42k
        graph->backward.exec_symbol_idx[dd] = autograd_execs[x].symbol.d;
1920
26
      else
1921
26
        graph->backward.exec_symbol_idx[dd] = ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size))->symbol.d;
1922
9.45k
    }
1923
9.45k
  }
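  /* Note on the no-op above: when an aliased wrt gradient is completed by several
   * execs (tracked in tensor_ref->exec_registry), no single exec can stand for
   * "this gradient is ready". The NOOP acts as a join point: every contributing
   * exec concats into it, and exec_symbol_idx records the noop, so
   * ccv_nnc_graph_exec_symbol_for_backward can return one node to depend on. */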
1924
6.71k
  // Assign gradient symbols for the f symbols.
1925
13.4k
  
  for (i = 0; i < f_symbol_size; i++)
1926
6.73k
  {
1927
6.73k
    const int d = f_symbols[i].d;
1928
6.73k
    assert(d >= 0);
1929
6.73k
    assert(d < tensor_symbol_info_size);
1930
6.73k
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + d;
1931
6.73k
    if (tensor_ver->ref_version)
1932
6.72k
    {
1933
6.72k
      // We don't use _ccv_nnc_autograd_tensor_symbol_from_tensor_version because that selects the last version; here we need the first version.
1934
6.72k
      const ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, 0);
1935
6.72k
      const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1936
6.72k
      graph->backward.tensor_symbol_idx[d] = autograd_symbol->symbol.d;
1937
6.72k
      // We cannot record a single relevant backward exec symbol for f; there could be many.
1938
6.72k
    }
1939
6.73k
  }
1940
6.71k
}
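
The metadata written above packs two lookup tables into one allocation: tensor_symbol_idx, indexed by the forward tensor symbol, immediately followed by exec_symbol_idx, indexed by the backward tensor symbol in the destination graph, with -1 meaning "no backward symbol". A minimal standalone sketch of that layout (names and sizes are illustrative, not the library's internals):

#include <stdlib.h>

typedef struct {
  int* tensor_symbol_idx; /* one block backs both tables */
  int tensor_symbol_size;
  int* exec_symbol_idx; /* points into the same block, right after the first table */
  int exec_symbol_size;
} backward_meta_t;

static void backward_meta_init(backward_meta_t* const meta, const int tensor_size, const int exec_size)
{
  int i;
  meta->tensor_symbol_idx = (int*)malloc(sizeof(int) * (tensor_size + exec_size));
  meta->tensor_symbol_size = tensor_size;
  meta->exec_symbol_idx = meta->tensor_symbol_idx + tensor_size;
  meta->exec_symbol_size = exec_size;
  for (i = 0; i < tensor_size + exec_size; i++)
    meta->tensor_symbol_idx[i] = -1; /* -1: no backward symbol assigned yet */
}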
1941
1942
void ccv_nnc_symbolic_graph_backward(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
1943
6.71k
{
1944
6.71k
  int i;
1945
6.71k
  // TODO: f symbols cannot be aliases yet.
1946
13.4k
  for (i = 0; i < f_symbol_size; i++)
1947
6.72k
  {
1948
6.72k
    assert(f_symbols[i].graph == graph); // f symbol has to be in the current graph.
1949
6.72k
    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, f_symbols[i].d))->alias_ref);
1950
6.72k
  }
1951
6.71k
  // TODO: wrt symbols cannot be aliases yet.
1952
16.1k
  
  for (i = 0; i < wrt_symbol_size; i++)
1953
9.45k
  {
1954
9.45k
    assert(wrt_symbols[i].graph == graph);
1955
9.45k
    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, wrt_symbols[i].d))->alias_ref);
1956
9.45k
  }
1957
6.71k
  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
1958
6.71k
  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
1959
6.71k
  assert(exec_symbol_info_size > 0);
1960
6.71k
  assert(tensor_symbol_info_size > 0);
1961
6.71k
  ccv_nnc_symbolic_graph_backward_prep_t backward_prep = _ccv_nnc_symbolic_graph_backward_prep(graph, sources, source_size, destinations, destination_size);
1962
6.71k
  _ccv_nnc_symbolic_graph_backward_prep_prune_ops(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, sources, source_size, destinations, destination_size);
1963
6.71k
  _ccv_nnc_symbolic_graph_backward_prep_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, 0, sources, source_size, destinations, destination_size);
1964
6.71k
  _ccv_nnc_symbolic_graph_backward_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, graph, graph);
1965
6.71k
  _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep);
1966
6.71k
}
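
A usage sketch for this entry point. The helpers ccv_nnc_symbolic_graph_new, CPU_TENSOR_NHWC, TENSOR_SYMBOL_LIST and ccv_nnc_symbolic_graph_free are assumed from the library's public headers; the command construction mirrors the EWSUM node built earlier in this file:

ccv_nnc_symbolic_graph_t* const graph = ccv_nnc_symbolic_graph_new();
const ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(graph, CPU_TENSOR_NHWC(32F, 1), "a");
const ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(graph, CPU_TENSOR_NHWC(32F, 1), "b");
const ccv_nnc_tensor_symbol_t f = ccv_nnc_tensor_symbol_new(graph, CPU_TENSOR_NHWC(32F, 1), "f");
/* Forward: f = a + b. */
const ccv_nnc_graph_exec_symbol_t sum = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(f), "sum");
/* Augment the same graph in-place with df/da and df/db. */
const ccv_nnc_tensor_symbol_t wrts[] = {a, b};
ccv_nnc_symbolic_graph_backward(graph, &f, 1, wrts, 2, &sum, 1, &sum, 1);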
1967
1968
ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
1969
27.4k
{
1970
27.4k
  assert(symbol.d >= 0);
1971
27.4k
  assert(symbol.d < graph->backward.tensor_symbol_size);
1972
27.4k
  if (graph->backward.tensor_symbol_idx[symbol.d] < 0)
1973
1
    return NO_TENSOR_SYMBOL;
1974
27.4k
  ccv_nnc_tensor_symbol_t tensor = {
1975
27.4k
    .d = graph->backward.tensor_symbol_idx[symbol.d],
1976
27.4k
    .graph = graph,
1977
27.4k
  };
1978
27.4k
  return tensor;
1979
27.4k
}
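
Continuing the sketch above (graph and a as before), the lookup degrades gracefully when no gradient reached a symbol:

const ccv_nnc_tensor_symbol_t da = ccv_nnc_tensor_symbol_for_backward(graph, a);
/* A negative d is the NO_TENSOR_SYMBOL sentinel: nothing in the backward pass produced a gradient for a. */
const int has_grad = (da.d >= 0);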
1980
1981
ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
1982
17.7k
{
1983
17.7k
  assert(symbol.d >= 0);
1984
17.7k
  assert(symbol.d < graph->backward.exec_symbol_size);
1985
17.7k
  const int dd = symbol.d;
1986
17.7k
  assert(graph->backward.exec_symbol_idx[dd] >= 0);
1987
17.7k
  ccv_nnc_graph_exec_symbol_t exec = {
1988
17.7k
    .d = graph->backward.exec_symbol_idx[dd],
1989
17.7k
    .graph = graph
1990
17.7k
  };
1991
17.7k
  return exec;
1992
17.7k
}
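
To order extra work after a gradient is materialized, the paired exec lookup gives the node to depend on; a final hedged continuation of the same sketch, where update_exec is a hypothetical follow-up exec:

/* The exec that finalizes da (a real backward exec, or the no-op join created above). */
const ccv_nnc_graph_exec_symbol_t da_exec = ccv_nnc_graph_exec_symbol_for_backward(graph, da);
/* Sequence the hypothetical update after the gradient exec. */
ccv_nnc_graph_exec_symbol_concat(graph, da_exec, update_exec);
ccv_nnc_symbolic_graph_free(graph);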