Coverage Report

Created: 2017-11-12 13:27

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_backward.c
Line | Count | Source
   1 |      | #include "ccv_nnc.h"
   2 |      | #include "ccv_nnc_easy.h"
   3 |      | #include "ccv_nnc_internal.h"
   4 |      | #include "ccv_internal.h"
   5 |      | #ifdef HAVE_CUDA
   6 |      | #include "gpu/ccv_nnc_compat.h"
   7 |      | #endif
   8 |      | #include "_ccv_nnc_symbolic_graph.h"
   9 |      |
  10 |      | /**
  11 |      |  * Level-4 API
  12 |      |  */
  13 |      |
  14 |      | typedef struct {
  15 |      |   int f_wrt; // Check if both f_symbols and wrt_symbols flow through this node.
  16 |      |   ccv_array_t* outgoings; // backward traverse nodes.
  17 |      |   uint64_t* input_bitmasks;
  18 |      |   int input_bitmask_size;
  19 |      |   uint64_t* output_bitmasks;
  20 |      |   int output_bitmask_size;
  21 |      | } ccv_nnc_graph_backward_info_t;
  22 |      |
  23 |      | typedef struct {
  24 |      |   int input_size;
  25 |      |   int* inputs;
  26 |      |   int output;
  27 |      |   ccv_array_t* outgoings;
  28 |      |   float value;
  29 |      |   ccv_nnc_graph_exec_symbol_t symbol;
  30 |      | } ccv_nnc_sum_graph_exec_symbol_t;
  31 |      |
  32 |      | typedef struct {
  33 |      |   int input_size;
  34 |      |   int output_size;
  35 |      |   int* inputs;
  36 |      |   int* outputs;
  37 |      |   ccv_array_t* outgoings;
  38 |      |   ccv_nnc_cmd_t cmd;
  39 |      |   ccv_nnc_graph_exec_symbol_t symbol;
  40 |      | } ccv_nnc_autograd_graph_exec_symbol_t;
  41 |      |
  42 |      | typedef struct {
  43 |      |   int d; // The pointer to the forward level object.
  44 |      |   int alias_ref; // The alias ref to itself (autograd_tensor_symbols array).
  45 |      |   int flags; // Flags for this symbol.
  46 |      |   ccv_nnc_tensor_symbol_t symbol;
  47 |      | } ccv_nnc_autograd_tensor_symbol_t;
  48 |      |
  49 |      | typedef struct {
  50 |      |   int d; // The tensor symbol ref.
  51 |      |   int x; // The exec symbol ref.
  52 |      |   ccv_array_t* exec_registry; // Additional exec symbol refs, similar to x, only useful for aliasing.
  53 |      |   ccv_array_t* alias_registry; // int point to all the alias (if this is not an alias). The alias is the object in autograd_tensor_symbols, you need another level of indirection to get the actual forward level alias.
  54 |      | } ccv_nnc_tensor_ref_t;
  55 |      |
  56 |      | typedef struct {
  57 |      |   int c; // The start non-accumulated version.
  58 |      |   ccv_array_t* ref_version; // tensor ref point to the reverse tensor symbol.
  59 |      | } ccv_nnc_autograd_tensor_version_t;
  60 |      |
  61 |      | typedef struct {
  62 |      |   int d;
  63 |      |   int alias_ref;
  64 |      | } ccv_nnc_sum_variable_t;
  65 |      |
  66 |      | // This method tries to figure out if a set of aliases can cover the whole tensor dim.
  67 |      | // This is not a precise implementation though. The requirement is to answer this question
  68 |      | // with a given memory constraint, therefore, only allow up to 65536 different tensor locations.
  69 |      | // If you have more than that, it will assume that it doesn't have fully assigned aliases,
  70 |      | // and will return 0.
  71 |      |
  72 |      | // Return 1 if inserted successfully.
  73 |      | static inline int _ccv_nnc_try_mix(int* const md, const int ins, const int c)
  74 |   36 | {
  75 |   36 |   if (!c)
  76 |   22 |   {
  77 |   22 |     md[0] = ins;
  78 |   22 |     return 1;
  79 |   22 |   }
  80 |   14 |   int ll = 0, uu = c - 1;
  81 |   14 |   int mm;
  82 |   15 |   do {
  83 |   15 |     mm = ll + ((uu - ll) >> 1);
  84 |   15 |     if (ins == md[mm])
  85 |   13 |       return 0;
  86 |    2 |     else if (ins < md[mm])
  87 |    1 |       uu = mm - 1;
  88 |    1 |     else if (ins > md[mm])
  89 |    1 |       ll = mm + 1;
  90 |    2 |   } while (ll <= uu);
  91 |    1 |   if (ll < c)
  92 |    1 |     memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll));
  93 |    1 |   md[ll] = ins;
  94 |    1 |   return 1;
  95 |   14 | }
  96 |      |
  97 |      | static inline int _ccv_nnc_mix_idx(const int* const md, const int ins, const int c)
  98 |   20 | {
  99 |   20 |   if (c <= 1)
 100 |   16 |     return 0;
 101 |    4 |   int ll = 0, uu = c - 1;
 102 |    4 |   int mm;
 103 |    7 |   do {
 104 |    7 |     mm = ll + ((uu - ll) >> 1);
 105 |    7 |     if (ins == md[mm])
 106 |    4 |       return mm;
 107 |    3 |     else if (ins < md[mm])
 108 |    0 |       uu = mm - 1;
 109 |    3 |     else if (ins > md[mm])
 110 |    3 |       ll = mm + 1;
 111 |    3 |   } while (ll <= uu);
 112 |    0 |   assert(0 && "Shouldn't reach here");
 113 |    0 |   return -1;
 114 |    4 | }
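
The two helpers above build and query a sorted, duplicate-free list of cut points with binary search (insertion shifts the tail with memmove; lookup returns the rank). Below is a minimal, standalone sketch of the same scheme; the names (try_insert, main) and the sample values are illustrative, not part of this file.

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Same contract as _ccv_nnc_try_mix: keep md[0..c) sorted and unique,
 * return 1 if ins was inserted, 0 if it was already present. */
static int try_insert(int* const md, const int ins, const int c)
{
  if (!c) {
    md[0] = ins;
    return 1;
  }
  int ll = 0, uu = c - 1, mm;
  do {
    mm = ll + ((uu - ll) >> 1);
    if (ins == md[mm])
      return 0;
    else if (ins < md[mm])
      uu = mm - 1;
    else
      ll = mm + 1;
  } while (ll <= uu);
  if (ll < c) /* Shift the tail up one slot, exactly like the memmove above. */
    memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll));
  md[ll] = ins;
  return 1;
}

int main(void)
{
  int md[8], c = 0;
  const int cuts[] = { 4, 2, 4, 7 }; /* The duplicate 4 is dropped. */
  for (int i = 0; i < 4; i++)
    c += try_insert(md, cuts[i], c);
  assert(c == 3); /* md is now { 2, 4, 7 }. */
  for (int i = 0; i < c; i++)
    printf("%d ", md[i]); /* Prints: 2 4 7 */
  printf("\n");
  return 0;
}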
 115 |      |
 116 |      | static inline void _ccv_nnc_try_set_pix_0(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint8_t* const cube, int offset)
 117 |    6 | {
 118 |    3 |   const int s = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
 119 |    3 |   const int d = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
 120 |    6 |   assert(s >= 0 && d > s);
 121 |    6 |   int i;
 122 |   12 |   for (i = s; i < d; i++)
 123 |    6 |     // Fill this pix. I can make this faster by loop through full ones (divided by 8), but too lazy.
 124 |    6 |     cube[(offset + i) >> 3] |= (1 << ((offset + i) & 0x7));
 125 |    6 | }
 126 |      |
 127 |      | static inline void _ccv_nnc_try_set_pix_1(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint8_t* const cube, int offset)
 128 |    9 | {
 129 |    7 |   const int s0 = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
 130 |    7 |   const int d0 = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
 131 |    9 |   assert(s0 >= 0 && d0 > s0);
 132 |    6 |   const int s1 = (ofs[1] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[1], ofs[1], cube_dim[1]) + 1;
 133 |    6 |   const int d1 = ((ofs[1] + dim[1] == tensor_dim[1]) ? cube_dim[1] : _ccv_nnc_mix_idx(scmd[1], ofs[1] + ccv_max(1, dim[1]), cube_dim[1])) + 1;
 134 |    9 |   assert(s1 >= 0 && d1 > s1);
 135 |    9 |   int i, j;
 136 |    9 |   const int step1 = cube_step[1];
 137 |    9 |   if (step1 == d0 - s0)
 138 |    5 |   {
 139 |    5 |     // Faster one, we can simply loop through.
 140 |   11 |     for (i = s1 * step1; i < d1 * step1; i++)
 141 |    6 |       cube[(offset + i) >> 3] |= (1 << ((offset + i) & 0x7));
 142 |    4 |   } else {
 143 |    4 |     offset += s1 * step1;
 144 |    4 |     // There are gaps, slow one.
 145 |    8 |     for (i = s1; i < d1; i++, offset += step1)
 146 |    8 |       for (j = s0; j < d0; j++)
 147 |    4 |         cube[(offset + j) >> 3] |= (1 << ((offset + j) & 0x7));
 148 |    4 |   }
 149 |    9 | }
 150 |      |
 151 |      | static inline void _ccv_nnc_try_set_pix(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint8_t* const cube, int offset, const int dim_idx)
 152 |   19 | {
 153 |   19 |   switch (dim_idx)
 154 |   19 |   {
 155 |    9 |     case 1:
 156 |    9 |       _ccv_nnc_try_set_pix_1(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
 157 |    9 |       return;
 158 |    6 |     case 0:
 159 |    6 |       _ccv_nnc_try_set_pix_0(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
 160 |    6 |       return;
 161 |   19 |   }
 162 |    4 |   int i;
 163 |    2 |   const int s = (ofs[dim_idx] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx], cube_dim[dim_idx]) + 1;
 164 |    2 |   const int d = ((ofs[dim_idx] + dim[dim_idx] == tensor_dim[dim_idx]) ? cube_dim[dim_idx] : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx] + ccv_max(1, dim[dim_idx]), cube_dim[dim_idx])) + 1;
 165 |    4 |   assert(s >= 0 && d > s);
 166 |    8 |   for (i = s; i < d; i++)
 167 |    4 |     _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset + i * cube_step[dim_idx], dim_idx - 1);
 168 |    4 | }
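
The three pix-fill helpers above all mark cells of the flattened cube in a byte-addressed bitmap. A standalone sketch of that addressing, with illustrative sizes:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
  uint8_t cube[(20 + 7) >> 3]; /* 20 cells round up to 3 bytes. */
  memset(cube, 0, sizeof(cube));
  int i;
  for (i = 4; i < 9; i++) /* Fill the half-open cell range [4, 9). */
    cube[i >> 3] |= (1 << (i & 0x7)); /* Cell i: byte i >> 3, bit i & 0x7. */
  assert(cube[0] == 0xf0); /* Bits 4..7 of byte 0. */
  assert(cube[1] == 0x01); /* Cell 8 is bit 0 of byte 1. */
  return 0;
}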
 169 |      |
 170 |      | static int _ccv_nnc_tensor_ref_fully_assigned_with_aliases(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info)
 171 |   21 | {
 172 |   21 |   // Only work with tensor_ref of aliases.
 173 |   21 |   assert(tensor_ref->alias_registry);
 174 |   21 |   const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
 175 |   21 |   assert(tensor_symbol_info[autograd->d].alias_ref == 0);
 176 |   21 |   const int* tensor_dim = tensor_symbol_info[autograd->d].info.dim;
 177 |   21 |   int i, j;
 178 |   52 |   for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
 179 |   31 |   {
 180 |   31 |     const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
 181 |   31 |     assert(d < autograd_tensor_symbols->rnum);
 182 |   31 |     const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
 183 |   31 |     assert(tensor_symbol_info[autograd->d].alias_ref);
 184 |   31 |     const int* inc = tensor_symbol_info[autograd->d].inc;
 185 |   31 |     if (memcmp(inc, tensor_dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
 186 |    0 |       return 0;
 187 |   31 |   }
 188 |   21 |   /* We need a solid cube (potentially hyper dimensional) to compute if there are overlaps.
 189 |   21 |    * To make this cube as small as possible, we need to map the actual tensor dimension
 190 |   21 |    * (therefore, we don't actually allocate the whole tensor to compute overlaps) to a smaller
 191 |   21 |    * cube given the ofs and dim size of its aliases.
 192 |   21 |    *
 193 |   21 |    * The following code generated the dimension mapping (using scratch space) with binary search + insertion
 194 |   21 |    * and then we fill the cube with a given tensor alias's dimensional information (ofs, dim).
 195 |   21 |    * Afterwards, we simply need to check if the cube is totally filled up to know if this tensor
 196 |   21 |    * is fully assigned with its aliases (if that is the case, we can skip zeroing for this tensor).
 197 |   21 |    *
 198 |   21 |    * There are several restrictions though to make this faster: 1). I cannot handle any cube that all side
 199 |   21 |    * lengths combined larger than 1023 (scm only have 1024 scratch space). 2). I cannot handle any cube
 200 |   21 |    * that the total volume is larger than 2048 * 8 (I only allocate 2K on stack for this).
 201 |   21 |    * */
 202 |   21 |   int scm[1024]; // Having 1024 int scratch space for mapping dimensions. (Or sparse coordinate mapping).
 203 |   21 |   int cube_dim[CCV_NNC_MAX_DIM_ALLOC] = {}; // Mapping dimension size.
 204 |   21 |   int cube_size = 1;
 205 |   21 |   int* scmptr = scm;
 206 |   32 |   for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
 207 |   26 |   {
 208 |   26 |     int head = 0, tail = 0; // Note that we touched both the head and tail (otherwise this dimension is not fully covered).
 209 |   26 |     int len = 0;
 210 |   71 |     for (j = 0; j < tensor_ref->alias_registry->rnum; j++)
 211 |   45 |     {
 212 |   45 |       const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, j);
 213 |   45 |       assert(d < autograd_tensor_symbols->rnum);
 214 |   45 |       const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
 215 |   45 |       assert(tensor_symbol_info[autograd->d].alias_ref);
 216 |   45 |       const int* ofs = tensor_symbol_info[autograd->d].ofs;
 217 |   45 |       const int* dim = tensor_symbol_info[autograd->d].info.dim;
 218 |   29 |       head = head || (ofs[i] == 0);
 219 |   32 |       tail = tail || (ofs[i] + ccv_max(1, dim[i]) == tensor_dim[i]);
 220 |   45 |       if (ofs[i] != 0)
 221 |   11 |         len += _ccv_nnc_try_mix(scmptr, ofs[i], len);
 222 |   45 |       if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
 223 |    0 |         return 0;
 224 |   45 |       if (ofs[i] + ccv_max(1, dim[i]) < tensor_dim[i])
 225 |   25 |         len += _ccv_nnc_try_mix(scmptr, ofs[i] + ccv_max(1, dim[i]), len);
 226 |   45 |       if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
 227 |    0 |         return 0;
 228 |   45 |     }
 229 |   26 |     if (!head || !tail)
 230 |   15 |       return 0;
 231 |   11 |     cube_size *= (len + 1);
 232 |   11 |     cube_dim[i] = len;
 233 |   11 |     scmptr += len; // Moving to next level.
 234 |   11 |   }
 235 |   21 |   // The cube map is too large, cannot do the computation, assume it is not fully assigned.
 236 |    6 |   if (cube_size > 2048 * 8)
 237 |    0 |     return 0;
 238 |    6 |   // binary map to see if it fills up.
 239 |    6 |   uint8_t* cube = (uint8_t*)alloca(sizeof(uint8_t) * ((cube_size + 7) >> 3));
 240 |    6 |   memset(cube, 0, sizeof(uint8_t) * ((cube_size + 7) >> 3));
 241 |    6 |   int* scmd[CCV_NNC_MAX_DIM_ALLOC] = {}; // Sparse coordinate map at dimension x.
 242 |    6 |   int cube_step[CCV_NNC_MAX_DIM_ALLOC] = {};
 243 |   16 |   for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
 244 |   10 |   {
 245 |    6 |     cube_step[i] = (i > 0) ? cube_step[i - 1] * (cube_dim[i - 1] + 1) : 1;
 246 |    6 |     scmd[i] = (i > 0) ? scmd[i - 1] + cube_dim[i - 1] : scm;
 247 |   10 |   }
 248 |    6 |   const int max_dim = i;
 249 |   21 |   for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
 250 |   15 |   {
 251 |   15 |     const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
 252 |   15 |     assert(d < autograd_tensor_symbols->rnum);
 253 |   15 |     const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
 254 |   15 |     assert(tensor_symbol_info[autograd->d].alias_ref);
 255 |   15 |     const int* ofs = tensor_symbol_info[autograd->d].ofs;
 256 |   15 |     const int* dim = tensor_symbol_info[autograd->d].info.dim;
 257 |   15 |     _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, 0, max_dim - 1);
 258 |   15 |   }
 259 |    6 |   // Compare to see now if the binary map filled up. If it filled up, we know it is fully assigned.
 260 |    6 |   for (i = 0; i < (cube_size >> 3); i++)
 261 |    0 |     if (cube[i] < 0xff)
 262 |    0 |       return 0;
 263 |    6 |   if ((cube_size & 0x7) > 0)
 264 |    6 |   {
 265 |    6 |     // Fetch the rest.
 266 |    6 |     uint8_t r = 0;
 267 |   21 |     for (i = 0; i < (cube_size & 0x7); i++)
 268 |   15 |       r |= (1 << i);
 269 |    6 |     assert(cube[((cube_size + 7) >> 3) - 1] <= r);
 270 |    6 |     if (cube[((cube_size + 7) >> 3) - 1] < r)
 271 |    0 |       return 0;
 272 |    6 |   }
 273 |    6 |   return 1;
 274 |    6 | }
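
_ccv_nnc_tensor_ref_fully_assigned_with_aliases answers "do these aliases tile the whole tensor?" by compressing each axis to its distinct cut points, stamping every alias into the resulting cube, and then requiring every bit to be set: whole bytes must read 0xff and the final partial byte is compared against a mask with one bit per leftover cell. A 1-D standalone sketch of that final check (names and values are illustrative):

#include <stdint.h>
#include <stdio.h>

static int fully_filled(const uint8_t* const cube, const int cube_size)
{
  int i;
  for (i = 0; i < (cube_size >> 3); i++) /* Whole bytes must be all ones. */
    if (cube[i] < 0xff)
      return 0;
  if (cube_size & 0x7) { /* Remaining cells in the last partial byte. */
    uint8_t r = 0;
    for (i = 0; i < (cube_size & 0x7); i++)
      r |= (1 << i);
    if (cube[((cube_size + 7) >> 3) - 1] < r)
      return 0;
  }
  return 1;
}

int main(void)
{
  uint8_t cube[2] = { 0xff, 0x03 }; /* 10 cells: 8 + 2, all filled. */
  printf("%d\n", fully_filled(cube, 10)); /* 1 */
  cube[1] = 0x01; /* Cell 9 left unfilled. */
  printf("%d\n", fully_filled(cube, 10)); /* 0 */
  return 0;
}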
 275 |      |
 276 |      | static int _ccv_nnc_tensor_ref_version_find_init(const ccv_nnc_autograd_tensor_version_t* const tensor_ver)
 277 |    3 | {
 278 |    3 |   int i;
 279 |    6 |   for (i = 0; i < tensor_ver->ref_version->rnum; i++)
 280 |    4 |     if (((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->x < 0)
 281 |    1 |       return i;
 282 |    2 |   return -1;
 283 |    3 | }
 284 |      |
 285 |      | static void _ccv_nnc_graph_sum_autograd_tensor_versions(const int idx, const int d, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_execs)
 286 |    8 | {
 287 |    8 |   int i, j;
 288 |    8 |   assert(tensor_ver->c < tensor_ver->ref_version->rnum);
 289 |    8 |   const int input_size = tensor_ver->ref_version->rnum - tensor_ver->c;
 290 |    8 |   int* inputs = (int*)ccmalloc(sizeof(int) * input_size);
 291 |   26 |   for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
 292 |   18 |     inputs[i] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->d;
 293 |    8 |   const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
 294 |    8 |     .d = d
 295 |    8 |   };
 296 |    8 |   ccv_array_push(autograd_tensor_symbols, &tensor_sym);
 297 |    8 |   ccv_nnc_sum_graph_exec_symbol_t sum_exec = {
 298 |    8 |     .input_size = input_size,
 299 |    8 |     .inputs = inputs,
 300 |    8 |     .output = autograd_tensor_symbols->rnum - 1
 301 |    8 |   };
 302 |    8 |   if (idx >= 0)
 303 |    3 |   {
 304 |    3 |     sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
 305 |    3 |     ccv_array_push(sum_exec.outgoings, &idx);
 306 |    3 |   }
 307 |    8 |   ccv_array_push(sum_execs, &sum_exec);
 308 |    8 |   const int outgoing = exec_symbol_info_size + sum_execs->rnum - 1;
 309 |   26 |   for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
 310 |   18 |   {
 311 |   18 |     const ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
 312 |   18 |     const int x = tensor_ref->x;
 313 |   18 |     if (x < 0) /* This is initialization tensor, it has to be occurred before the execution anyway. */
 314 |    1 |     {
 315 |    1 |       // No alias.
 316 |    1 |       assert(!tensor_ref->alias_registry);
 317 |    1 |       // No associated additional execs.
 318 |    1 |       assert(!tensor_ref->exec_registry);
 319 |    1 |       continue;
 320 |    1 |     }
 321 |   17 |     if (x < exec_symbol_info_size)
 322 |   17 |     {
 323 |   17 |       ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
 324 |   17 |       if (!back_exec->outgoings)
 325 |    6 |         back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
 326 |   17 |       ccv_array_replace_int(back_exec->outgoings, idx, outgoing);
 327 |    0 |     } else {
 328 |    0 |       // This tensor_ref is generated by the sum operation.
 329 |    0 |       ccv_nnc_sum_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size);
 330 |    0 |       ccv_array_replace_int(sum_or_set->outgoings, idx, outgoing);
 331 |    0 |     }
 332 |   17 |     // If this tensor have associated alias, we need to init it to zeros when it is allocated (we only need to set a flag here)
 333 |   17 |     // it is handled at compilation phase.
 334 |   17 |     if (tensor_ref->alias_registry &&
 335 |   17 |       // Loop over to see if this tensor is fully occupied to avoid extra zero step.
 336 |   10 |       !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
 337 |    8 |     {
 338 |    8 |       ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
 339 |    8 |       // By having alias_registry, what this symbol represents must not by an alias.
 340 |    8 |       assert(tensor_sym->alias_ref == 0);
 341 |    8 |       tensor_sym->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
 342 |    8 |     }
 343 |   17 |     if (tensor_ref->exec_registry)
 344 |    4 |       for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
 345 |    2 |       {
 346 |    2 |         const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
 347 |    2 |         assert(x >= 0);
 348 |    2 |         // The exec_registry can only be generated by alias registry, therefore, it cannot reference to a sum operation.
 349 |    2 |         assert(x < exec_symbol_info_size);
 350 |    2 |         ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
 351 |    2 |         if (!back_exec->outgoings)
 352 |    1 |           back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
 353 |    2 |         ccv_array_replace_int(back_exec->outgoings, idx, outgoing);
 354 |    2 |       }
 355 |   17 |   }
 356 |    8 |   const ccv_nnc_tensor_ref_t tensor_ref = {
 357 |    8 |     .d = autograd_tensor_symbols->rnum - 1,
 358 |    8 |     .x = outgoing
 359 |    8 |   };
 360 |    8 |   ccv_array_push(tensor_ver->ref_version, &tensor_ref);
 361 |    8 |   /* Move the c pointer up to the latest summed result. */
 362 |    8 |   tensor_ver->c = tensor_ver->ref_version->rnum - 1;
 363 |    8 | }
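
_ccv_nnc_graph_sum_autograd_tensor_versions exists because reverse-mode differentiation produces one gradient version per consumer of a tensor, and those versions must be added together before the gradient can flow further. A plain-C numeric sketch of that rule (this is not the NNC API; f(x) = x*x + 3*x is an invented example):

#include <stdio.h>

int main(void)
{
  const float x = 2;
  /* Forward: x has two consumers. */
  const float y0 = x * x; /* dy0/dx = 2*x */
  const float y1 = 3 * x; /* dy1/dx = 3 */
  const float f = y0 + y1;
  /* Backward: one gradient version per consumer... */
  const float dx_v0 = 2 * x;
  const float dx_v1 = 3;
  /* ...and the sum exec accumulates them into the final gradient. */
  const float dx = dx_v0 + dx_v1;
  printf("f = %g, df/dx = %g\n", f, dx); /* f = 10, df/dx = 7 */
  return 0;
}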
 364 |      |
 365 |      | static int _ccv_nnc_tensor_ref_version_involve_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
 366 |   38 | {
 367 |   38 |   assert(alias->alias_ref > 0);
 368 |   38 |   // No alias_registry, must conflict (owns the whole band).
 369 |   38 |   if (!tensor_ref->alias_registry)
 370 |   10 |     return 1;
 371 |   28 |   int i, j;
 372 |   46 |   for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
 373 |   38 |   {
 374 |   38 |     const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
 375 |   38 |     assert(d < autograd_tensor_symbols->rnum);
 376 |   38 |     ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
 377 |   38 |     // This must reference to an alias.
 378 |   38 |     assert(tensor_symbol_info[autograd->d].alias_ref);
 379 |   38 |     const int* inc = tensor_symbol_info[autograd->d].inc;
 380 |   38 |     // Only can compare if the inc is the same, otherwise, we can only assume it overlaps.
 381 |   38 |     if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
 382 |    0 |       return 1;
 383 |   38 |     const int* ofs = tensor_symbol_info[autograd->d].ofs;
 384 |   38 |     const int* dim = tensor_symbol_info[autograd->d].info.dim;
 385 |   38 |     int none = 0;
 386 |   84 |     for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC && dim[j] && alias->info.dim[j]; j++)
 387 |   64 |       if (ccv_min(ofs[j] + dim[j], alias->ofs[j] + alias->info.dim[j]) <= ccv_max(ofs[j], alias->ofs[j]))
 388 |   18 |       {
 389 |   18 |         none = 1;
 390 |   18 |         break;
 391 |   18 |       }
 392 |   38 |     // If it overlaps with this alias, nope.
 393 |   38 |     if (!none)
 394 |   20 |       return 1;
 395 |   38 |   }
 396 |   28 |   // All aliases referenced by this ref_version doesn't overlap with the provided one, thus, there is no conflict at all.
 397 |    8 |   return 0;
 398 |   28 | }
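
The loop above declares two aliases disjoint as soon as any axis separates them: the smaller of the two ends is at or before the larger of the two starts. A standalone sketch of that per-axis test (function and variable names are illustrative):

#include <stdio.h>

#define int_min(a, b) ((a) < (b) ? (a) : (b))
#define int_max(a, b) ((a) > (b) ? (a) : (b))

static int regions_overlap(const int* const ofs_a, const int* const dim_a, const int* const ofs_b, const int* const dim_b, const int nd)
{
  for (int j = 0; j < nd; j++)
    if (int_min(ofs_a[j] + dim_a[j], ofs_b[j] + dim_b[j]) <= int_max(ofs_a[j], ofs_b[j]))
      return 0; /* Separated along axis j: no overlap at all. */
  return 1;
}

int main(void)
{
  const int ofs_a[] = { 0, 0 }, dim_a[] = { 2, 4 };
  const int ofs_b[] = { 2, 0 }, dim_b[] = { 2, 4 }; /* Rows 2..3: disjoint from a. */
  const int ofs_c[] = { 1, 1 }, dim_c[] = { 2, 2 }; /* Straddles rows 1..2: overlaps a. */
  printf("%d %d\n", regions_overlap(ofs_a, dim_a, ofs_b, dim_b, 2),
    regions_overlap(ofs_a, dim_a, ofs_c, dim_c, 2)); /* Prints: 0 1 */
  return 0;
}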
 399 |      |
 400 |      | static int _ccv_nnc_tensor_ref_version_find_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
 401 |   23 | {
 402 |   23 |   assert(alias->alias_ref > 0);
 403 |   23 |   // No alias_registry, thus, cannot find the exact matched alias.
 404 |   23 |   if (!tensor_ref->alias_registry)
 405 |    8 |     return -1;
 406 |   15 |   int i;
 407 |   24 |   for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
 408 |   19 |   {
 409 |   19 |     const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
 410 |   19 |     assert(d < autograd_tensor_symbols->rnum);
 411 |   19 |     ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
 412 |   19 |     // This must reference to an alias.
 413 |   19 |     assert(tensor_symbol_info[autograd->d].alias_ref);
 414 |   19 |     const int* inc = tensor_symbol_info[autograd->d].inc;
 415 |   19 |     const int* ofs = tensor_symbol_info[autograd->d].ofs;
 416 |   19 |     const int* dim = tensor_symbol_info[autograd->d].info.dim;
 417 |   19 |     // If everything matches, this is the required alias.
 418 |   19 |     if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
 419 |   19 |       memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
 420 |   10 |       memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
 421 |   10 |       return d;
 422 |   19 |   }
 423 |    5 |   return -1;
 424 |   15 | }
 425 |      |
 426 |      | static int _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
 427 |    4 | {
 428 |    4 |   assert(alias->alias_ref > 0);
 429 |    4 |   // No alias_registry, thus, cannot find the exact matched alias.
 430 |    4 |   if (!tensor_ref->alias_registry)
 431 |    0 |     return 0;
 432 |    4 |   int i;
 433 |    8 |   for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
 434 |    5 |   {
 435 |    5 |     const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
 436 |    5 |     assert(d < autograd_tensor_symbols->rnum);
 437 |    5 |     ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
 438 |    5 |     // This must reference to an alias.
 439 |    5 |     assert(tensor_symbol_info[autograd->d].alias_ref);
 440 |    5 |     const int* inc = tensor_symbol_info[autograd->d].inc;
 441 |    5 |     const int* ofs = tensor_symbol_info[autograd->d].ofs;
 442 |    5 |     const int* dim = tensor_symbol_info[autograd->d].info.dim;
 443 |    5 |     if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
 444 |    5 |       memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
 445 |    4 |       memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
 446 |    1 |       return 0;
 447 |    5 |   }
 448 |    4 |   // If everything matches for every alias in registry, we can use any of the alias directly.
 449 |    3 |   return 1;
 450 |    4 | }
 451 |      |
 452 |      | static int _ccv_nnc_graph_sum_autograd_tensor_versions_alias(const int idx, const int d, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const alias, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_execs)
 453 |   16 | {
 454 |   16 |   assert(tensor_ver->c < tensor_ver->ref_version->rnum);
 455 |   16 |   int i, j = 0;
 456 |   16 |   struct {
 457 |   16 |     int k;
 458 |   16 |     int i;
 459 |   16 |   } kd[tensor_ver->ref_version->rnum - tensor_ver->c];
 460 |   39 |   for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
 461 |   23 |   {
 462 |   23 |     ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
 463 |   23 |     const int k = _ccv_nnc_tensor_ref_version_find_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias);
 464 |   23 |     if (k >= 0)
 465 |   10 |       kd[j++] = (typeof(kd[0])){
 466 |   10 |         .k = k, .i = i
 467 |   10 |       };
 468 |   13 |     else if (_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
 469 |   13 |       kd[j++] = (typeof(kd[0])) {
 470 |   13 |         .k = -1, .i = i // It has dependency to the original tensor (non-alias) now, label this with highest bit.
 471 |   13 |       };
 472 |   23 |   }
 473 |   16 |   // Can only find one. This is the easy case, we can simply return that symbol (or its alias).
 474 |   16 |   if (j == 1)
 475 |   12 |   {
 476 |   12 |     if (kd[0].k >= 0)
 477 |    3 |       return kd[0].k; // Only can find one alias, that is the one.
 478 |   12 |     // Otherwise, need to create a new alias.
 479 |    9 |     ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[0].i);
 480 |    9 |     ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
 481 |    9 |     // Since we create new alias, we need to set the referenced one to be allocated with 0s.
 482 |    9 |     if (ref->alias_ref) // If this is an alias, it has to be zero initialized.
 483 |    0 |     {
 484 |    0 |       ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, ref->alias_ref - 1);
 485 |    0 |       assert(ref->alias_ref == 0); // This is original.
 486 |    0 |       ref->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
 487 |    9 |     } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
 488 |    9 |         // Loop over to see if this tensor is fully occupied to avoid extra zero step.
 489 |    2 |         !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
 490 |    1 |       ref->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
 491 |    1 |     }
 492 |    9 |     ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
 493 |    9 |       .d = d,
 494 |    9 |       .alias_ref = tensor_ref->d + 1
 495 |    9 |     };
 496 |    9 |     ccv_array_push(autograd_tensor_symbols, &tensor_sym);
 497 |    9 |     const int ad = autograd_tensor_symbols->rnum - 1;
 498 |    9 |     if (tensor_ref->alias_registry) // Only push this when it has an alias registry (otherwise it already conflict with everyone).
 499 |    2 |       ccv_array_push(tensor_ref->alias_registry, &ad);
 500 |    9 |     // The newly inserted tensor symbol.
 501 |    9 |     return ad;
 502 |   12 |   }
 503 |   16 |   // Otherwise, we need to create the sum operation out of these.
 504 |    4 |   const int input_size = j;
 505 |    4 |   int has_this_alias_exclusively = 1;
 506 |    4 |   int* inputs = input_size > 0 ? (int*)ccmalloc(sizeof(int) * input_size) : 0;
 507 |   15 |   for (i = 0; i < input_size; i++)
 508 |   11 |   {
 509 |   11 |     ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
 510 |   11 |     // Can take a fast path if every ref involved has the same alias, our sum operation can be faster (using alias directly).
 511 |   11 |     if (has_this_alias_exclusively && kd[i].k >= 0 && _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
 512 |    3 |       inputs[i] = *(int*)ccv_array_get(tensor_ref->alias_registry, 0); // Assigning the alias.
 513 |    8 |     else {
 514 |    8 |       if (has_this_alias_exclusively)
 515 |    3 |       {
 516 |    3 |         has_this_alias_exclusively = 0;
 517 |    3 |         for (j = 0; j < i; j++)
 518 |    0 |           inputs[j] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[j].i))->d;
 519 |    3 |       }
 520 |    8 |       inputs[i] = tensor_ref->d;
 521 |    8 |     }
 522 |   11 |   }
 523 |    4 |   ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
 524 |    4 |     .d = alias->alias_ref - 1
 525 |    4 |   };
 526 |    4 |   ccv_array_push(autograd_tensor_symbols, &tensor_sym);
 527 |    4 |   const int tensor_ref_d = autograd_tensor_symbols->rnum - 1;
 528 |    4 |   tensor_sym.d = d;
 529 |    4 |   tensor_sym.alias_ref = tensor_ref_d + 1;
 530 |    4 |   ccv_array_push(autograd_tensor_symbols, &tensor_sym);
 531 |    4 |   const int ad = autograd_tensor_symbols->rnum - 1;
 532 |    4 |   ccv_nnc_sum_graph_exec_symbol_t sum_exec = {
 533 |    4 |     .input_size = input_size,
 534 |    4 |     .inputs = inputs,
 535 |    3 |     .output = has_this_alias_exclusively ? ad : tensor_ref_d /* If has this alias exclusively, the output should be alias as well. Otherwise the output is the real tensor. */
 536 |    4 |   };
 537 |    4 |   if (idx >= 0)
 538 |    4 |   {
 539 |    4 |     sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
 540 |    4 |     ccv_array_push(sum_exec.outgoings, &idx);
 541 |    4 |   }
 542 |    4 |   ccv_array_push(sum_execs, &sum_exec);
 543 |    4 |   const int outgoing = exec_symbol_info_size + sum_execs->rnum - 1;
 544 |    4 |   int no_alias_registry = 0;
 545 |   15 |   for (i = 0; i < input_size; i++)
 546 |   11 |   {
 547 |   11 |     ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
 548 |   11 |     if (!has_this_alias_exclusively)
 549 |    8 |     {
 550 |    8 |       // If the sum operation is not operating on one alias. I need to zero this tensor out when it is first
 551 |    8 |       // allocated (see discussions around the flags I use).
 552 |    8 |       ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
 553 |    8 |       if (tensor_sym->alias_ref)
 554 |    0 |       {
 555 |    0 |         // Find the original tensor_sym and set its flags (I prefer to set flags on its original).
 556 |    0 |         ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_sym->alias_ref - 1);
 557 |    0 |         assert(ref->alias_ref == 0); // This is original.
 558 |    0 |         ref->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
 559 |    8 |       } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
 560 |    8 |           // Loop over to see if this tensor is fully occupied to avoid extra zero step.
 561 |    7 |           !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
 562 |    5 |         tensor_sym->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
 563 |    5 |       }
 564 |    8 |     }
 565 |   11 |     // Check to see if any of these tensors doesn't have alias.
 566 |   11 |     no_alias_registry |= (!tensor_ref->alias_registry);
 567 |   11 |     const int x = tensor_ref->x;
 568 |   11 |     assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
 569 |   11 |     if (x < exec_symbol_info_size)
 570 |   11 |     {
 571 |   11 |       ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
 572 |   11 |       if (!back_exec->outgoings)
 573 |    0 |         back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
 574 |   11 |       ccv_array_push(back_exec->outgoings, &outgoing);
 575 |    0 |     } else {
 576 |    0 |       ccv_nnc_sum_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size);
 577 |    0 |       ccv_array_push(sum_or_set->outgoings, &outgoing);
 578 |    0 |     }
 579 |   11 |     if (tensor_ref->exec_registry)
 580 |    4 |       for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
 581 |    2 |       {
 582 |    2 |         const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
 583 |    2 |         assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
 584 |    2 |         assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference to a sum operation.
 585 |    2 |         ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
 586 |    2 |         if (!back_exec->outgoings)
 587 |    0 |           back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
 588 |    2 |         ccv_array_push(back_exec->outgoings, &outgoing);
 589 |    2 |       }
 590 |   11 |   }
 591 |    4 |   const ccv_nnc_tensor_ref_t tensor_ref = {
 592 |    4 |     .d = tensor_ref_d,
 593 |    4 |     .x = outgoing,
 594 |    4 |     .exec_registry = 0, // I don't need to take execution dependencies because this tensor is generated by sum, therefore, we already take that dependency.
 595 |    3 |     .alias_registry = !no_alias_registry || has_this_alias_exclusively ? ccv_array_new(sizeof(int), 1, 0) : 0
 596 |    4 |   };
 597 |    4 |   // If there is no alias registry, then we take the whole tensor ref as one.
 598 |    4 |   if (!no_alias_registry || has_this_alias_exclusively)
 599 |    3 |   {
 600 |    3 |     // If this tensor ref contains multiple different types of alias, have to add them together (otherwise
 601 |    3 |     // the computation for if there is an empty slot in this tensor ref is not correct without all the
 602 |    3 |     // occupancy availability information).
 603 |    3 |     if (!has_this_alias_exclusively)
 604 |    7 |       for (i = 0; i < input_size; i++)
 605 |    5 |       {
 606 |    5 |         ccv_nnc_tensor_ref_t* ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
 607 |    5 |         assert(ref->alias_registry);
 608 |    5 |         // It may get duplicates. But whatever, won't matter the computation.
 609 |   13 |         for (j = 0; j < ref->alias_registry->rnum; j++)
 610 |    8 |           ccv_array_push(tensor_ref.alias_registry, ccv_array_get(ref->alias_registry, j));
 611 |    5 |       }
 612 |    3 |     ccv_array_push(tensor_ref.alias_registry, &ad);
 613 |    3 |   }
 614 |    4 |   assert(input_size <= tensor_ver->ref_version->rnum - tensor_ver->c);
 615 |    4 |   ccv_nnc_tensor_ref_t x;
 616 |   15 |   for (i = 0; i < input_size; i++)
 617 |    4 |     // If the current one (i + tensor_ver->c) is smaller than the one referenced to, exchange.
 618 |   11 |     if (kd[i].i > i + tensor_ver->c)
 619 |    0 |       CCV_SWAP(*(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i + tensor_ver->c), *(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i), x);
 620 |    4 |   ccv_array_push(tensor_ver->ref_version, &tensor_ref);
 621 |    4 |   // We've consumed input_size tensor refs, now move c up to the pointer of non-consumed tensors.
 622 |    4 |   tensor_ver->c += input_size;
 623 |    4 |   return ad;
 624 |   16 | }
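
The kd[] scan at the top of this function sorts the pending gradient versions into three buckets: exact alias matches (k >= 0), versions that merely overlap the requested alias (k = -1, forcing a dependency on the whole tensor), and disjoint versions, which are skipped. A compact sketch of that classification with invented verdicts (GNU C, since the file itself uses typeof):

#include <stdio.h>

enum { EXACT = 0, OVERLAP = 1, DISJOINT = 2 };

int main(void)
{
  const int verdicts[] = { OVERLAP, DISJOINT, EXACT, OVERLAP };
  struct { int k; int i; } kd[4];
  int j = 0;
  for (int i = 0; i < 4; i++) {
    if (verdicts[i] == EXACT)
      kd[j++] = (typeof(kd[0])){ .k = i, .i = i }; /* Alias found: remember it. */
    else if (verdicts[i] == OVERLAP)
      kd[j++] = (typeof(kd[0])){ .k = -1, .i = i }; /* Depends on the whole tensor. */
    /* DISJOINT versions don't contribute to this alias's sum at all. */
  }
  /* j == 1 with kd[0].k >= 0 would be the fast path (reuse the alias);
   * here j == 3, so a sum exec over these versions would be required. */
  for (int t = 0; t < j; t++)
    printf("kd[%d] = { .k = %d, .i = %d }\n", t, kd[t].k, kd[t].i);
  return 0;
}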
 625 |      |
 626 |      | typedef struct ccv_nnc_symbolic_graph_backward_prep_s {
 627 |      |   int exec_symbol_info_size; // Number of graph exec symbols before adding any new symbols related to automatic differentiation.
 628 |      |   int tensor_symbol_info_size; // Number of tensor symbols before adding anything new.
 629 |      |   int sub_prep_size;
 630 |      |   ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
 631 |      |   ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
 632 |      |   ccv_nnc_graph_backward_info_t* backward_info; // Corresponding to forward graph exec symbol info, it is exactly in reverse.
 633 |      |   ccv_nnc_graph_visit_t* forward_visit; // The visitor structure (top sorted index) when doing traversal.
 634 |      |   ccv_nnc_graph_visit_t* backward_visit; // The visitor structure (top sorted index) when doing reverse traversal.
 635 |      |   ccv_nnc_autograd_graph_exec_symbol_t* autograd_execs; // The graph exec symbols we need for automatic differentiation. This is a 1:1 mapping for forward graph exec symbols, however, unlike backward_info, its outgoings may be more complex (may contain outgoing flows to sum nodes).
 636 |      |   ccv_nnc_autograd_tensor_version_t* autograd_tensor_versions; // Corresponding to forward tensor symbols, each may contain multiple versions (due to multi-write).
 637 |      |   ccv_array_t* autograd_tensor_symbols; // The tensor symbols we need for automatic differentiation (it may not be 1:1 mapping).
 638 |      |   ccv_array_t* sum_execs; // The sum nodes, because in reverse mode, a tensor could have multiple versions, we need to sum them up before use.
 639 |      |   struct ccv_nnc_symbolic_graph_backward_prep_s* sub_preps; // The preps of its sub-graphs.
 640 |      |   // Pointers not managed by this struct
 641 |      |   ccv_nnc_symbolic_graph_t* graph;
 642 |      | } ccv_nnc_symbolic_graph_backward_prep_t;
 643 |      |
 644 |      | static ccv_nnc_symbolic_graph_backward_prep_t _ccv_nnc_symbolic_graph_backward_prep(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
 645 |   13 | {
 646 |   13 |   const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
 647 |   13 |   const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
 648 |   13 |   ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * exec_symbol_info_size);
 649 |   13 |   ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * tensor_symbol_info_size);
 650 |   13 |   ccv_nnc_graph_visit_t* forward_visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), exec_symbol_info_size, sources, source_size, destinations, destination_size);
 651 |   13 |   ccv_nnc_symbolic_graph_symbol_infer(graph, forward_visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
 652 |   13 |   int i;
 653 |   13 |   // Now, for each one of these, find a reverse graph.
 654 |   13 |   ccv_nnc_graph_backward_info_t* backward_info = (ccv_nnc_graph_backward_info_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_graph_backward_info_t));
 655 |   50 |   ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
 656 |   50 |     assert(ccv_nnc_cmd_is_forward(node->cmd) || node->cmd.cmd == CCV_NNC_NOOP);
 657 |   50 |     if (node->outgoings)
 658 |   83 |       for (i = 0; i < node->outgoings->rnum; i++)
 659 |   47 |       {
 660 |   47 |         int d = *(int*)ccv_array_get(node->outgoings, i);
 661 |   47 |         if (backward_info[d].outgoings == 0)
 662 |   33 |           backward_info[d].outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
 663 |   47 |         ccv_array_push(backward_info[d].outgoings, &idx);
 664 |   47 |       }
 665 |   50 |   } ccv_nnc_graph_visit_endfor
 666 |   13 |   // Also mark only the output bits that we use.
 667 |   63 |   for (i = 0; i < exec_symbol_info_size; i++)
 668 |   50 |   {
 669 |   50 |     backward_info[i].input_bitmask_size = ((exec_symbol_info[i].output_size * 2 + exec_symbol_info[i].input_size + 63) >> 6);
 670 |   50 |     backward_info[i].output_bitmask_size = ((exec_symbol_info[i].input_size + 63) >> 6);
 671 |   50 |     // Allocate input / output bitmasks
 672 |   50 |     if (backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size > 0)
 673 |   49 |     {
 674 |   49 |       backward_info[i].input_bitmasks = (uint64_t*)cccalloc(backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size, sizeof(uint64_t));
 675 |   49 |       if (backward_info[i].output_bitmask_size)
 676 |   49 |         backward_info[i].output_bitmasks = backward_info[i].input_bitmasks + backward_info[i].input_bitmask_size;
 677 |   49 |     }
 678 |   50 |   }
 679 |   13 |   ccv_nnc_graph_visit_t* backward_visit = ccv_nnc_graph_visit_new(graph, backward_info, exec_symbol_info_size, destinations, destination_size, sources, source_size);
 680 |   12 |   const int sub_prep_size = graph->sub_graphs ? graph->sub_graphs->rnum : 0;
 681 |   12 |   ccv_nnc_symbolic_graph_backward_prep_t* sub_preps = sub_prep_size > 0 ? (ccv_nnc_symbolic_graph_backward_prep_t*)cccalloc(sub_prep_size, sizeof(ccv_nnc_symbolic_graph_backward_prep_t)) : 0;
 682 |   14 |   for (i = 0; i < sub_prep_size; i++)
 683 |    1 |   {
 684 |    1 |     const ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(graph->sub_graphs, i);
 685 |    1 |     sub_preps[i] = _ccv_nnc_symbolic_graph_backward_prep(sub_graph, ccv_nnc_symbolic_graph_sources(sub_graph), ccv_nnc_symbolic_graph_source_size(sub_graph), ccv_nnc_symbolic_graph_destinations(sub_graph), ccv_nnc_symbolic_graph_destination_size(sub_graph));
 686 |    1 |   }
 687 |   13 |   return (ccv_nnc_symbolic_graph_backward_prep_t){
 688 |   13 |     .exec_symbol_info_size = exec_symbol_info_size,
 689 |   13 |     .tensor_symbol_info_size = tensor_symbol_info_size,
 690 |   13 |     .sub_prep_size = sub_prep_size,
 691 |   13 |     .exec_symbol_info = exec_symbol_info,
 692 |   13 |     .tensor_symbol_info = tensor_symbol_info,
 693 |   13 |     .backward_info = backward_info,
 694 |   13 |     .forward_visit = forward_visit,
 695 |   13 |     .backward_visit = backward_visit,
 696 |   13 |     .sub_preps = sub_preps,
 697 |   13 |     .graph = (ccv_nnc_symbolic_graph_t*)graph,
 698 |   13 |   };
 699 |   13 | }
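
The bitmask allocation above sizes each mask as (n + 63) >> 6 64-bit words for n bits, and the rest of the file addresses bit i as word i >> 6, bit i & 63. A standalone sketch of that layout (the value of n is illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
  const int n = 70; /* e.g. output_size * 2 + input_size */
  const int words = (n + 63) >> 6;
  assert(words == 2); /* 70 bits round up to two 64-bit words. */
  uint64_t* bitmask = (uint64_t*)calloc(words, sizeof(uint64_t));
  const int i = 65;
  bitmask[i >> 6] |= ((uint64_t)1 << (i & 63)); /* Set bit 65. */
  assert(bitmask[1] == ((uint64_t)1 << 1));
  assert(bitmask[i >> 6] & ((uint64_t)1 << (i & 63))); /* Test it. */
  bitmask[i >> 6] &= ~((uint64_t)1 << (i & 63)); /* Clear it again. */
  assert(bitmask[1] == 0);
  free(bitmask);
  return 0;
}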
 700 |      |
 701 |      | static void _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(const ccv_nnc_graph_exec_symbol_info_t* const exec_info, const ccv_nnc_symbolic_graph_t* const sub_graph, const int graph_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const uint64_t* const input_bitmasks, const uint64_t* const output_bitmasks, ccv_array_t* const sub_f_symbols, ccv_array_t* const sub_wrt_symbols)
 702 |    2 | {
 703 |    2 |   int i, j;
 704 |    2 |   ccv_array_clear(sub_wrt_symbols);
 705 |    6 |   for (i = 0; i < exec_info->input_size; i++)
 706 |    4 |     if (output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
 707 |    2 |     {
 708 |    2 |       ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
 709 |    2 |         .d = *(int*)ccv_array_get(tensor_symbol_info[exec_info->inputs[i]].s_ref, graph_ref) - 1,
 710 |    2 |         .graph = sub_graph,
 711 |    2 |         .info = tensor_symbol_info[exec_info->inputs[i]].info
 712 |    2 |       };
 713 |    2 |       ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
 714 |    2 |     }
 715 |    2 |   ccv_array_clear(sub_f_symbols);
 716 |    4 |   for (i = 0; i < exec_info->output_size; i++)
 717 |    2 |     if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
 718 |    2 |     {
 719 |    2 |       ccv_nnc_tensor_symbol_t sub_f_symbol = {
 720 |    2 |         .d = *(int*)ccv_array_get(tensor_symbol_info[exec_info->outputs[i]].s_ref, graph_ref) - 1,
 721 |    2 |         .graph = sub_graph,
 722 |    2 |         .info = tensor_symbol_info[exec_info->outputs[i]].info
 723 |    2 |       };
 724 |    2 |       ccv_array_push(sub_f_symbols, &sub_f_symbol);
 725 |    2 |     }
 726 |    2 |   // Go through all its assignments (parameterized loop), making them either wrt or f.
 727 |    2 |   // The reason is these must flow through the graph, otherwise we cannot form a full
 728 |    2 |   // enclosed loop. Also because they are the additional f / wrt symbols, there is
 729 |    2 |   // no case that we cannot find their corresponding gradients in the backward sub graphs
 730 |    2 |   // (these gradients have to be parameterized to form an enclosed loop as well).
 731 |   10 |   for (i = 0; i < sub_graph->tensor_symbol_info->rnum; i++)
 732 |    8 |   {
 733 |    8 |     const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i);
 734 |    8 |     if (tensor_symbol_info->assign_ref)
 735 |    2 |     {
 736 |    2 |       const int assign_ref = tensor_symbol_info->assign_ref - 1;
 737 |    2 |       // i is the wrt, assign_ref is the f.
 738 |    2 |       int flag = 0;
 739 |    4 |       for (j = 0; !flag && j < sub_wrt_symbols->rnum; j++)
 740 |    2 |         flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, j))->d == i);
 741 |    2 |       if (!flag)
 742 |    2 |       {
 743 |    2 |         ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
 744 |    2 |           .d = i,
 745 |    2 |           .graph = sub_graph,
 746 |    2 |           .info = tensor_symbol_info->info
 747 |    2 |         };
 748 |    2 |         ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
 749 |    2 |       }
 750 |    2 |       flag = 0;
 751 |    4 |       for (j = 0; !flag && j < sub_f_symbols->rnum; j++)
 752 |    2 |         flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, j))->d == assign_ref);
 753 |    2 |       if (!flag)
 754 |    0 |       {
 755 |    0 |         ccv_nnc_tensor_symbol_t sub_f_symbol = {
 756 |    0 |           .d = assign_ref,
 757 |    0 |           .graph = sub_graph,
 758 |    0 |           .info = tensor_symbol_info->info
 759 |    0 |         };
 760 |    0 |         ccv_array_push(sub_f_symbols, &sub_f_symbol);
 761 |    0 |       }
 762 |    2 |     }
 763 |    8 |   }
 764 |    2 | }
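
The .d = *(int*)ccv_array_get(...s_ref, graph_ref) - 1 pattern above resolves a parent-graph tensor to its counterpart inside sub-graph graph_ref. Judging from the "- 1" (the same off-by-one convention as alias_ref and assign_ref elsewhere in this file), s_ref appears to store sub-graph symbol index + 1, with 0 meaning no counterpart; the sketch below assumes that convention, and its arrays and values are invented for illustration:

#include <stdio.h>

int main(void)
{
  /* Assumed s_ref rows for three parent tensors across two sub-graphs. */
  const int s_ref[3][2] = {
    { 1, 0 }, /* tensor 0 -> sub-graph 0 symbol 0; absent from sub-graph 1 */
    { 0, 3 }, /* tensor 1 -> absent from sub-graph 0; sub-graph 1 symbol 2 */
    { 2, 1 }, /* tensor 2 -> symbol 1 and symbol 0 respectively */
  };
  const int graph_ref = 1;
  for (int t = 0; t < 3; t++) {
    const int d = s_ref[t][graph_ref] - 1; /* The "- 1" undoes the +1 storage. */
    if (d >= 0)
      printf("parent tensor %d maps to sub-graph symbol %d\n", t, d);
    else
      printf("parent tensor %d has no counterpart in this sub-graph\n", t);
  }
  return 0;
}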
765
766
// Check whether for a given f_symbol, we can compute wrt_symbols at all, if we can, tag the minimal io and ops (some ops can be replaced with noop) required to do so.
767
static int _ccv_nnc_symbolic_graph_backward_prep_prune_ops(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size)
768
13
{
769
13
  int i, j;
770
13
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
771
13
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
772
13
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info =backward_prep->tensor_symbol_info;
773
13
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
774
13
  // Now, for each one of these, find a reverse graph.
775
13
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
776
13
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
777
13
  // Find the f_symbols, and tag its flows.
778
50
  
ccv_nnc_graph_visit_for50
(backward_visit, backward_info, node, idx) {50
779
50
    int f = node->f_wrt & 0x1;
780
64
    for (i = 0; 
i < exec_symbol_info[idx].output_size && 64
!f49
;
i++14
)
781
14
    {
782
14
      int d = exec_symbol_info[idx].outputs[i];
783
16
      while (tensor_symbol_info[d].alias_ref)
784
2
        d = tensor_symbol_info[d].alias_ref - 1;
785
28
      for (j = 0; 
j < f_symbol_size && 28
!f14
;
j++14
)
786
14
        
if (14
d == f_symbols[j].d14
)
787
14
          f = 1;
788
14
    }
789
50
    if (f)
790
50
    {
791
50
      node->f_wrt |= f;
792
50
      if (node->outgoings)
793
80
        
for (i = 0; 33
i < node->outgoings->rnum80
;
i++47
)
794
47
        {
795
47
          int d = *(int*)ccv_array_get(node->outgoings, i);
796
47
          backward_info[d].f_wrt |= f;
797
47
        }
798
50
    }
799
50
  } ccv_nnc_graph_visit_endfor
800
13
  // Find the wrt_symbols, and tag its flows.
801
50
  
ccv_nnc_graph_visit_for50
(forward_visit, exec_symbol_info, node, idx) {50
802
50
    int wrt = backward_info[idx].f_wrt & 0x2;
803
77
    for (i = 0; 
i < node->input_size && 77
!wrt69
;
i++27
)
804
27
    {
805
27
      int d = node->inputs[i];
806
31
      while (tensor_symbol_info[d].alias_ref)
807
4
        d = tensor_symbol_info[d].alias_ref - 1;
808
70
      for (j = 0; 
j < wrt_symbol_size && 70
!wrt49
;
j++43
)
809
43
        
if (43
d == wrt_symbols[j].d43
)
810
16
          wrt = 0x2;
811
27
    }
812
50
    if (wrt)
813
47
    {
814
47
      backward_info[idx].f_wrt |= wrt;
815
47
      if (node->outgoings)
816
77
        
for (i = 0; 33
i < node->outgoings->rnum77
;
i++44
)
817
44
        {
818
44
          int d = *(int*)ccv_array_get(node->outgoings, i);
819
44
          backward_info[d].f_wrt |= wrt;
820
44
        }
821
47
    }
822
50
  } ccv_nnc_graph_visit_endfor
823
13
  enum {
824
13
    WRT_SYMBOL_USE = 1,
825
13
    F_SYMBOL_USE = 2
826
13
  };
827
13
  uint8_t* used_grad = (uint8_t*)cccalloc(tensor_symbol_info_size, sizeof(uint8_t));
828
13
  // First, all f_symbols and wrt_symbols are used.
829
26
  for (i = 0; 
i < f_symbol_size26
;
i++13
)
830
13
    
used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? 13
tensor_symbol_info[f_symbols[i].d].alias_ref - 10
:
f_symbols[i].d13
] |= F_SYMBOL_USE;
831
42
  for (i = 0; 
i < wrt_symbol_size42
;
i++29
)
832
29
    
used_grad[tensor_symbol_info[wrt_symbols[i].d].alias_ref ? 29
tensor_symbol_info[wrt_symbols[i].d].alias_ref - 10
:
wrt_symbols[i].d29
] |= WRT_SYMBOL_USE;
833
13
  // Do optimistic assumption, and then compute used_grad
834
50
  
ccv_nnc_graph_visit_for50
(forward_visit, exec_symbol_info, _, idx) {50
835
50
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
836
50
    /* Only interested in the ones on the f / wrt flow */
837
50
    if ((node->f_wrt & 0x3) == 0x3)
838
47
    {
839
47
      const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
840
47
      ccv_nnc_cmd_t cmd = forw_exec->cmd;
841
47
      if (cmd.cmd != CCV_NNC_NOOP)
842
46
        cmd.cmd += 1; /* Backward command is the one after forward command. */
843
47
      assert(ccv_nnc_cmd_is_backward(cmd) || cmd.cmd == CCV_NNC_NOOP);
844
225
      for (i = 0; 
i < forw_exec->output_size * 2 + forw_exec->input_size225
;
i++178
)
845
178
        node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
846
131
      for (i = 0; 
i < forw_exec->input_size131
;
i++84
)
847
84
        node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
848
47
      int maybe_noop = 1;
849
55
      for (i = 0; 
i < forw_exec->input_size55
;
i++8
)
850
47
        /* See if it is used as wrt, if not, no need to run this node at all. */
851
55
        
if (55
used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 55
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 124
:
forw_exec->inputs[i]31
] & WRT_SYMBOL_USE)
852
47
        {
853
47
          maybe_noop = 0;
854
47
          break;
855
47
        }
856
47
      if (maybe_noop)
857
0
      {
858
0
        for (i = 0; 
i < node->input_bitmask_size0
;
i++0
)
859
0
          node->input_bitmasks[i] = 0;
860
0
        for (i = 0; 
i < node->output_bitmask_size0
;
i++0
)
861
0
          node->output_bitmasks[i] = 0;
862
0
        node->output_bitmask_size = 0;
863
47
      } else 
if (47
cmd.cmd == CCV_NNC_GRAPH_FORWARD || 47
cmd.cmd == CCV_NNC_GRAPH_BACKWARD47
)
{1
864
1
        // Clear out all potential outputs if we think it is not a wrt symbols.
865
3
        for (i = 0; 
i < forw_exec->input_size3
;
i++2
)
866
2
          
if (2
!(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 2
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 10
:
forw_exec->inputs[i]2
] & WRT_SYMBOL_USE))
867
1
            node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
868
1
        // But for now, assuming we need all input gradients.
869
1
        // Clear out all inputs / outputs from forward op.
870
4
        for (i = forw_exec->output_size; 
i < forw_exec->output_size * 2 + forw_exec->input_size4
;
i++3
)
871
3
          node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
872
46
      } else 
if (46
ccv_nnc_cmd_bitmask(cmd, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)46
)
{41
873
41
        int flag; /* Only continue if it changed */
874
72
        do {
875
72
          flag = 0;
876
72
          /* Check if the output first */
877
190
          for (i = 0; 
i < forw_exec->input_size190
;
i++118
)
878
72
            /* Only try to eliminate the one that is not used. */
879
118
            
if (118
!(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 118
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 152
:
forw_exec->inputs[i]66
] & WRT_SYMBOL_USE) &&
880
13
              (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))))
881
10
            {
882
10
              node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
883
10
              /* If it worked, mark it as flagged. */
884
10
              if (ccv_nnc_cmd_bitmask(cmd, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
885
3
                flag = 1;
886
10
              else /* Refit this with the bit back again. */
887
7
                node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
888
10
            }
889
334
          for (i = 0; 
i < forw_exec->output_size * 2 + forw_exec->input_size334
;
i++262
)
890
262
            
if (262
(i >= forw_exec->output_size ||262
891
72
               
!(used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? 72
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 131
:
forw_exec->outputs[i]41
] & F_SYMBOL_USE)) &&
892
230
              node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
893
173
            { /* Try to eliminate one of the input. */
894
173
              node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
895
173
              /* If it worked, mark it as flagged. */
896
173
              if (ccv_nnc_cmd_bitmask(cmd, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
897
57
                flag = 1;
898
173
              else /* Refit this with the bit back again. */
899
116
                node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
900
173
            }
901
72
        } while (flag);
902
41
      }
903
94
      for (i = 0; 
i < forw_exec->output_size94
;
i++47
)
904
47
        
if (47
node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))47
)
905
47
          /* Mark it is used as wrt. */
906
47
          
used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? 47
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 116
:
forw_exec->outputs[i]31
] |= WRT_SYMBOL_USE;
907
131
      for (i = 0; 
i < forw_exec->input_size131
;
i++84
)
908
47
          /* Mark it is used as f. */
909
84
        
if (84
node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))84
)
910
80
          
used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? 80
tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 132
:
forw_exec->inputs[i]48
] |= F_SYMBOL_USE;
911
47
    }
912
50
  } ccv_nnc_graph_visit_endfor
913
13
  ccv_array_t* sub_f_symbols = 0;
914
13
  ccv_array_t* sub_wrt_symbols = 0;
915
50
  
ccv_nnc_graph_visit_for50
(forward_visit, exec_symbol_info, _, idx) {50
916
50
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
917
50
    const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
918
50
    /* Only interested in the ones on the f / wrt flow */
919
50
    if (
(node->f_wrt & 0x3) == 0x3 && 50
forw_exec->graph_ref47
)
920
1
    {
921
1
      // Now calling it recursively until we are sure no f_symbols can be removed.
922
1
      const int graph_ref = forw_exec->graph_ref - 1;
923
1
      ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref;
924
1
      if (!sub_wrt_symbols)
925
1
        sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
926
1
      else
927
0
        ccv_array_clear(sub_wrt_symbols);
928
3
      for (i = 0; 
i < forw_exec->input_size3
;
i++2
)
929
2
        
if (2
node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))2
)
930
1
        {
931
1
          ccv_nnc_tensor_symbol_t sub_wrt_symbol = {
932
1
            .d = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->inputs[i]].s_ref, graph_ref) - 1,
933
1
            .graph = sub_prep->graph,
934
1
            .info = tensor_symbol_info[forw_exec->inputs[i]].info
935
1
          };
936
1
          ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol);
937
1
        }
938
1
      int flag; // Only continue if it changed */
939
1
      do {
940
1
        flag = 0;
941
2
        for (i = 0; 
i < forw_exec->output_size2
;
i++1
)
942
1
          // Try to reduce number of inputs for the backward graph. If it is not tagged as F_SYMBOL_USE, we can reduce it.
943
1
          // It is reducible because this sub graph may have multiple computation paths, therefore, some of these may not
944
1
          // involve our wrt symbols at all.
945
1
          
if (1
!(used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? 1
tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 10
:
forw_exec->outputs[i]1
] & F_SYMBOL_USE) &&
946
0
            node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
947
0
          { /* Try to eliminate one of the input. */
948
0
            node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63));
949
0
            if (!sub_f_symbols)
950
0
              sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
951
0
            else
952
0
              ccv_array_clear(sub_f_symbols);
953
0
            for (j = 0; 
j < forw_exec->output_size0
;
j++0
)
954
0
              
if (0
node->input_bitmasks[j >> 6] & ((uint64_t)1 << (j & 63))0
)
955
0
              {
956
0
                ccv_nnc_tensor_symbol_t sub_f_symbol = {
957
0
                  .d = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[j]].s_ref, graph_ref) - 1,
958
0
                  .graph = sub_prep->graph,
959
0
                  .info = tensor_symbol_info[forw_exec->outputs[j]].info
960
0
                };
961
0
                ccv_array_push(sub_f_symbols, &sub_f_symbol);
962
0
              }
963
0
            if (
_ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph), (ccv_nnc_tensor_symbol_t*)0
ccv_array_get0
(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)
ccv_array_get0
(sub_wrt_symbols, 0), sub_wrt_symbols->rnum))
964
0
              flag = 1;
965
0
            else /* Refit this with the bit back again. */
966
0
              node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63));
967
0
          }
968
1
      } while (flag);
969
1
      // I am done, need to redo above for sub_prep, and it has to be successful now.
970
1
      if (!sub_f_symbols)
971
1
        sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
972
1
      else
973
0
        ccv_array_clear(sub_f_symbols);
974
2
      for (i = 0; i < forw_exec->output_size; i++)
975
1
        if (node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63)))
976
1
        {
977
1
          ccv_nnc_tensor_symbol_t sub_f_symbol = {
978
1
            .d = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[i]].s_ref, graph_ref) - 1,
979
1
            .graph = sub_prep->graph,
980
1
            .info = tensor_symbol_info[forw_exec->outputs[i]].info
981
1
          };
982
1
          ccv_array_push(sub_f_symbols, &sub_f_symbol);
983
1
        }
984
1
      flag = _ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph), (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum);
985
1
      assert(flag && "must be able to generate path from wrt symbols to f symbols for this sub graph");
986
1
    }
987
50
  } ccv_nnc_graph_visit_endfor
988
13
  if (sub_f_symbols)
989
1
    ccv_array_free(sub_f_symbols);
990
13
  if (sub_wrt_symbols)
991
1
    ccv_array_free(sub_wrt_symbols);
992
13
  int flag = 1;
993
26
  for (i = 0; i < f_symbol_size && flag; i++)
994
13
    flag = (used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? tensor_symbol_info[f_symbols[i].d].alias_ref - 1 : f_symbols[i].d] & WRT_SYMBOL_USE);
995
13
  ccfree(used_grad);
996
13
  return flag;
997
13
}
998
999
static void _ccv_nnc_symbolic_graph_backward_prep_gen(ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const int is_while)
1000
13
{
1001
13
  const int exec_symbol_info_size = backward_prep->exec_symbol_info_size;
1002
13
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
1003
13
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1004
13
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
1005
13
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
1006
13
  // Now, for each one of these, find a reverse graph.
1007
13
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1008
13
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
1009
13
  int i, j;
1010
13
  // Now, only the flow from f_symbols back to wrt_symbols is of interest to us.
1011
13
  // Visit the graph in reverse order, build the AD nodes.
1012
13
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = (ccv_nnc_autograd_graph_exec_symbol_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_autograd_graph_exec_symbol_t));
1013
63
  for (i = 0; i < exec_symbol_info_size; i++)
1014
50
    if ((backward_info[i].f_wrt & 0x3) == 0x3 && backward_info[i].outgoings)
1015
32
    {
1016
32
      // Copy over the outgoing bits.
1017
32
      autograd_execs[i].outgoings = ccv_array_new(sizeof(int), backward_info[i].outgoings->rnum, 0);
1018
78
      for (j = 0; j < backward_info[i].outgoings->rnum; j++)
1019
46
      {
1020
46
        const int d = *(int*)ccv_array_get(backward_info[i].outgoings, j);
1021
46
        // Only push the outgoing node if it is in the f_wrt path.
1022
46
        if ((backward_info[d].f_wrt & 0x3) == 0x3)
1023
44
          ccv_array_push(autograd_execs[i].outgoings, &d);
1024
46
      }
1025
32
    }
1026
13
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = (ccv_nnc_autograd_tensor_version_t*)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_autograd_tensor_version_t));
1027
13
  ccv_array_t* autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), tensor_symbol_info_size, 0);
1028
13
  ccv_array_t* sum_execs = ccv_array_new(sizeof(ccv_nnc_sum_graph_exec_symbol_t), 0, 0);
1029
50
  ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
1030
50
    /* This is required by both the f flow and the wrt flow, therefore of interest to us */
1031
50
    if ((node->f_wrt & 0x3) == 0x3)
1032
47
    {
1033
47
      const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
1034
47
      const ccv_nnc_graph_backward_info_t* back_info = backward_info + idx;
1035
47
      ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + idx;
1036
47
      back_exec->cmd = forw_exec->cmd;
1037
47
      if (back_exec->cmd.cmd != CCV_NNC_NOOP)
1038
46
        back_exec->cmd.cmd += 1; /* Backward command is the one after forward command. */
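        /* This relies on the command enum convention that each forward command is immediately followed by its backward counterpart, which the assert below verifies. */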
1039
47
      assert(ccv_nnc_cmd_is_backward(back_exec->cmd) || back_exec->cmd.cmd == CCV_NNC_NOOP);
1040
47
      if (!back_info->output_bitmask_size) /* This has no output, can be a noop. */
1041
0
        back_exec->cmd.cmd = CCV_NNC_NOOP;
1042
47
      else {
1043
47
        back_exec->output_size = forw_exec->input_size;
1044
47
        back_exec->input_size = forw_exec->output_size;
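        /* The backward exec mirrors the forward one: it consumes gradients of the forward outputs and produces gradients of the forward inputs, hence the swapped sizes. */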
1045
47
        back_exec->inputs = ccmalloc(sizeof(int) * (back_exec->input_size + back_exec->output_size));
1046
47
        back_exec->outputs = back_exec->inputs + back_exec->input_size;
1047
47
        /* Need to compute input before we compute output */
1048
94
        for (i = 0; i < forw_exec->output_size; i++)
1049
47
        {
1050
47
          /* If we can skip this input, do that. */
1051
47
          if (!(back_info->input_bitmasks[i >> 6] & ((uint64_t)1 << i)))
1052
0
            continue;
1053
47
          const int d = forw_exec->outputs[i];
1054
47
          const int alias_ref = tensor_symbol_info[d].alias_ref;
1055
31
          ccv_nnc_autograd_tensor_version_t* tensor_ver = alias_ref ? autograd_tensor_versions + (alias_ref - 1) : autograd_tensor_versions + d;
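          /* For an alias, gradient versions accumulate on the underlying tensor; alias_ref is 1-based, with 0 meaning not an alias. */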
1056
47
          /* Initialization tensor, should correspond to f symbols */
1057
47
          if (!tensor_ver->ref_version)
1058
13
          {
1059
13
            ccv_nnc_autograd_tensor_symbol_t tensor_sym = {};
1060
13
            if (!alias_ref)
1061
12
            {
1062
12
              tensor_sym.d = d;
1063
12
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1064
12
              const ccv_nnc_tensor_ref_t tensor_ref = {
1065
12
                .d = autograd_tensor_symbols->rnum - 1,
1066
12
                .x = idx,
1067
12
                .alias_registry = 0
1068
12
              };
1069
12
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1070
12
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1071
1
            } else {
1072
1
              tensor_sym.d = alias_ref - 1;
1073
1
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1074
1
              const ccv_nnc_tensor_ref_t tensor_ref = {
1075
1
                .d = autograd_tensor_symbols->rnum - 1,
1076
1
                .x = idx,
1077
1
                .alias_registry = ccv_array_new(sizeof(int), 1, 0)
1078
1
              };
1079
1
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1080
1
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1081
1
              tensor_sym.d = d; /* set back */
1082
1
              tensor_sym.alias_ref = tensor_ref.d + 1;
1083
1
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1084
1
              const int ad = autograd_tensor_symbols->rnum - 1;
1085
1
              ccv_array_push(tensor_ref.alias_registry, &ad);
1086
1
            }
1087
13
          }
1088
47
          /* The simplest case (most common), it is not an alias. */
1089
47
          if (!alias_ref)
1090
31
          {
1091
31
            /* Even simpler, this only has one reference tensor, thus, pass this as input. */
1092
31
            if (tensor_ver->c == tensor_ver->ref_version->rnum - 1)
1093
28
            {
1094
28
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1095
28
              /* There are aliases associated with this tensor ref; zero it out when this tensor is allocated. */
1096
28
              /* This is required. Consider the case that we have an alias of this tensor used somewhere */
1097
28
              /* on forward pass, when we compute backward, we have that alias computed first, however, its */
1098
28
              /* underlying tensor is not zero initialized, and we will end up with garbage values here. */
1099
28
              if (tensor_ref->alias_registry &&
1100
28
                /* Loop over to see if this tensor is fully occupied, to avoid an extra zeroing step. */
1101
2
                !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
1102
1
              {
1103
1
                ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1104
1
                assert(tensor_sym->alias_ref == 0);
1105
1
                tensor_sym->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
1106
1
              }
1107
28
              back_exec->inputs[i] = tensor_ref->d;
1108
3
            } else {
1109
3
              /* Otherwise, we need to sum them up, and then pass the summed result to the computation. */
1110
3
              _ccv_nnc_graph_sum_autograd_tensor_versions(idx, d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_execs);
1111
3
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1112
3
              back_exec->inputs[i] = tensor_ref->d;
1113
3
            }
1114
31
          } else
1115
47
            /* If this is an alias, go through all available tensor ref versions */
1116
16
            back_exec->inputs[i] = _ccv_nnc_graph_sum_autograd_tensor_versions_alias(idx, d, tensor_symbol_info, exec_symbol_info_size, tensor_symbol_info + d, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_execs);
1117
47
        }
1118
131
        for (i = 0; i < forw_exec->input_size; i++)
1119
84
        {
1120
84
          /* If we can skip this output, do that. */
1121
84
          if (!(back_info->output_bitmasks[i >> 6] & ((uint64_t)1 << i)))
1122
4
            continue;
1123
80
          const int d = forw_exec->inputs[i];
1124
80
          const int alias_ref = tensor_symbol_info[d].alias_ref;
1125
80
          ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1126
80
            .d = d
1127
80
          };
1128
80
          /* The simplest case (most common), it is not an alias. */
1129
80
          if (!alias_ref)
1130
48
          {
1131
48
            ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1132
48
            const ccv_nnc_tensor_ref_t tensor_ref = {
1133
48
              .d = autograd_tensor_symbols->rnum - 1,
1134
48
              .x = idx,
1135
48
              .exec_registry = 0,
1136
48
              .alias_registry = 0
1137
48
            };
1138
48
            ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d;
1139
48
            if (!tensor_ver->ref_version)
1140
44
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1141
48
            ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1142
48
            back_exec->outputs[i] = tensor_ref.d;
1143
32
          } else {
1144
32
            /* Otherwise, in case this is an alias, we try to find an existing one in tensor_ver and
1145
32
             * see if it can meet the need (thus, for the tensor info / ofs, it fits). */
1146
32
            ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + (alias_ref - 1);
1147
32
            if (!tensor_ver->ref_version)
1148
11
              tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
1149
32
            /* If a ref version already exists, check whether any of these not-yet-sealed tensors has free space. */
1150
32
            int found = 0;
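            /* Versions before tensor_ver->c are already accumulated (sealed); only scan from c onwards. */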
1151
57
            for (j = tensor_ver->c; !found && j < tensor_ver->ref_version->rnum; j++)
1152
25
            {
1153
25
              ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, j);
1154
25
              if (!_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, tensor_symbol_info + d))
1155
8
              {
1156
8
                tensor_sym.alias_ref = tensor_ref->d + 1;
1157
8
                ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1158
8
                const int ad = autograd_tensor_symbols->rnum - 1;
1159
8
                ccv_array_push(tensor_ref->alias_registry, &ad);
1160
8
                if (!tensor_ref->exec_registry)
1161
6
                  tensor_ref->exec_registry = ccv_array_new(sizeof(int), 1, 0);
1162
8
                ccv_array_push(tensor_ref->exec_registry, &idx);
1163
8
                back_exec->outputs[i] = ad;
1164
8
                found = 1;
1165
8
              }
1166
25
            }
1167
32
            if (!found) /* Cannot find a tensor ref to insert into, create one first */
1168
24
            {
1169
24
              tensor_sym.d = alias_ref - 1; /* Reference back to the non-alias. */
1170
24
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1171
24
              const ccv_nnc_tensor_ref_t tensor_ref = {
1172
24
                .d = autograd_tensor_symbols->rnum - 1,
1173
24
                .x = idx,
1174
24
                .exec_registry = 0,
1175
24
                .alias_registry = ccv_array_new(sizeof(int), 1, 0)
1176
24
              };
1177
24
              ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1178
24
              tensor_sym.d = d; /* set back */
1179
24
              tensor_sym.alias_ref = tensor_ref.d + 1;
1180
24
              ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1181
24
              const int ad = autograd_tensor_symbols->rnum - 1;
1182
24
              ccv_array_push(tensor_ref.alias_registry, &ad);
1183
24
              back_exec->outputs[i] = ad;
1184
24
            }
1185
32
          }
1186
80
        }
1187
47
      }
1188
47
    }
1189
50
  } ccv_nnc_graph_visit_endfor
1190
13
  // Find all relevant wrt symbols, generate sum for them if needed.
1191
43
  for (i = 0; i < wrt_symbol_size; i++)
1192
30
  {
1193
30
    const int d = wrt_symbols[i].d;
1194
30
    const int ref_d = (!tensor_symbol_info[d].alias_ref) ? d : tensor_symbol_info[d].alias_ref - 1;
1195
30
    ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + ref_d;
1196
30
    // If it is a while loop, we need to insert an accumulator to the graph (this is expressed as an initialization tensor summed with existing results).
1197
30
    // First, insert the initialization tensor if this wrt result is not used directly in the next while loop (if it is used directly, it participates in the computation, therefore there is no need to accumulate).
1198
30
    if (is_while && !tensor_symbol_info[ref_d].assign_ref &&
1199
1
      _ccv_nnc_tensor_ref_version_find_init(tensor_ver) < 0) // If the initialization tensor is not inserted yet.
1200
1
    {
1201
1
      const ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
1202
1
        .d = ref_d
1203
1
      };
1204
1
      ccv_array_push(autograd_tensor_symbols, &tensor_sym);
1205
1
      // Insert the one to be summed.
1206
1
      const ccv_nnc_tensor_ref_t tensor_ref = {
1207
1
        .d = autograd_tensor_symbols->rnum - 1,
1208
1
        .x = -1, // This denotes it is an initialization vector.
1209
1
      };
1210
1
      ccv_array_push(tensor_ver->ref_version, &tensor_ref);
1211
1
    }
1212
30
    // If there is more than one tensor in the list, sum them up.
1213
30
    if (tensor_ver->c < tensor_ver->ref_version->rnum - 1)
1214
5
      _ccv_nnc_graph_sum_autograd_tensor_versions(-1, ref_d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_execs);
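      /* The -1 idx here marks that this sum is not requested by any particular forward exec; it simply finalizes the accumulated wrt gradient. */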
1215
30
    // The tensor version should have ref_version, and only one now (after sum up).
1216
30
    assert(tensor_ver->c == tensor_ver->ref_version->rnum - 1);
1217
30
  }
1218
13
  // Adding additional fields to backward_prep now.
1219
13
  backward_prep->autograd_execs = autograd_execs;
1220
13
  backward_prep->autograd_tensor_versions = autograd_tensor_versions;
1221
13
  backward_prep->autograd_tensor_symbols = autograd_tensor_symbols;
1222
13
  backward_prep->sum_execs = sum_execs;
1223
13
  ccv_array_t* sub_f_symbols = 0;
1224
13
  ccv_array_t* sub_wrt_symbols = 0;
1225
50
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
1226
50
    ccv_nnc_graph_backward_info_t* node = backward_info + idx;
1227
50
    const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
1228
50
    /* Only interested in the ones on the f / wrt flow */
1229
50
    if ((node->f_wrt & 0x3) == 0x3 && forw_exec->graph_ref)
1230
1
    {
1231
1
      // Now calling it recursively until we are sure no f_symbols can be removed.
1232
1
      const int graph_ref = forw_exec->graph_ref - 1;
1233
1
      ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref;
1234
1
      if (!sub_wrt_symbols)
1235
1
        sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1236
1
      if (!sub_f_symbols)
1237
1
        sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1238
1
      _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, node->input_bitmasks, node->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1239
1
      _ccv_nnc_symbolic_graph_backward_prep_gen(sub_prep, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph), (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, 1);
1240
1
    }
1241
50
  } ccv_nnc_graph_visit_endfor
1242
13
  if (sub_f_symbols)
1243
1
    ccv_array_free(sub_f_symbols);
1244
13
  if (sub_wrt_symbols)
1245
1
    ccv_array_free(sub_wrt_symbols);
1246
13
}
1247
1248
static void _ccv_nnc_symbolic_graph_backward_prep_free(const ccv_nnc_symbolic_graph_backward_prep_t backward_prep)
1249
13
{
1250
13
  int i, j;
1251
13
  const int exec_symbol_info_size = backward_prep.exec_symbol_info_size;
1252
13
  const int tensor_symbol_info_size = backward_prep.tensor_symbol_info_size;
1253
13
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep.autograd_execs;
1254
13
  if (autograd_execs)
1255
13
  {
1256
63
    for (i = 0; i < exec_symbol_info_size; i++)
1257
50
    {
1258
50
      if (autograd_execs[i].inputs)
1259
47
        ccfree(autograd_execs[i].inputs);
1260
50
      if (autograd_execs[i].outgoings)
1261
39
        ccv_array_free(autograd_execs[i].outgoings);
1262
50
    }
1263
13
    ccfree(autograd_execs);
1264
13
  }
1265
13
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep.autograd_tensor_versions;
1266
13
  if (autograd_tensor_versions)
1267
13
  {
1268
122
    for (i = 0; i < tensor_symbol_info_size; i++)
1269
109
    {
1270
109
      if (autograd_tensor_versions[i].ref_version)
1271
68
      {
1272
166
        for (j = 0; j < autograd_tensor_versions[i].ref_version->rnum; j++)
1273
98
        {
1274
98
          ccv_nnc_tensor_ref_t* ref_version = (ccv_nnc_tensor_ref_t*)ccv_array_get(autograd_tensor_versions[i].ref_version, j);
1275
98
          if (ref_version->exec_registry)
1276
6
            ccv_array_free(ref_version->exec_registry);
1277
98
          if (ref_version->alias_registry)
1278
28
            ccv_array_free(ref_version->alias_registry);
1279
98
        }
1280
68
        ccv_array_free(autograd_tensor_versions[i].ref_version);
1281
68
      }
1282
109
    }
1283
13
    ccfree(autograd_tensor_versions);
1284
13
  }
1285
13
  if (backward_prep.autograd_tensor_symbols)
1286
13
    ccv_array_free(backward_prep.autograd_tensor_symbols);
1287
13
  ccv_array_t* const sum_execs = backward_prep.sum_execs;
1288
13
  if (sum_execs)
1289
13
  {
1290
25
    for (i = 0; i < sum_execs->rnum; i++)
1291
12
    {
1292
12
      ccv_nnc_sum_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, i);
1293
12
      if (sum_or_set->inputs)
1294
12
        ccfree(sum_or_set->inputs);
1295
12
      if (sum_or_set->outgoings)
1296
7
        ccv_array_free(sum_or_set->outgoings);
1297
12
    }
1298
13
    ccv_array_free(sum_execs);
1299
13
  }
1300
13
  // From this point on, these are mandatory.
1301
13
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep.backward_info;
1302
63
  for (i = 0; i < exec_symbol_info_size; i++)
1303
50
  {
1304
50
    if (backward_info[i].outgoings)
1305
33
      ccv_array_free(backward_info[i].outgoings);
1306
50
    if (backward_info[i].input_bitmasks)
1307
49
      ccfree(backward_info[i].input_bitmasks);
1308
50
  }
1309
13
  ccfree(backward_info);
1310
13
  ccv_nnc_graph_visit_free(backward_prep.backward_visit);
1311
13
  ccv_nnc_graph_visit_free(backward_prep.forward_visit);
1312
13
  ccfree(backward_prep.exec_symbol_info);
1313
13
  ccfree(backward_prep.tensor_symbol_info);
1314
14
  for (i = 0; i < backward_prep.sub_prep_size; i++)
1315
1
    _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep.sub_preps[i]);
1316
13
  if (backward_prep.sub_preps)
1317
1
    ccfree(backward_prep.sub_preps);
1318
13
}
1319
1320
static int _ccv_nnc_noop_while_expr(ccv_nnc_tensor_t* const* const commons, const int common_size, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const void* const data)
1321
0
{
1322
0
  return 0;
1323
0
}
1324
1325
static void _ccv_nnc_add_backward_breakpoint_for_symbol(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_graph_exec_symbol_t breakpoint, ccv_nnc_symbolic_graph_t* const graph, ccv_array_t* const sub_breakpoints)
1326
1
{
1327
1
  const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
1328
1
  ccv_array_push(sub_breakpoints, &noop);
1329
1
  // Now need to hook this up to the graph.
1330
1
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1331
1
  const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit;
1332
1
  // Now, for each one of these, find a reverse graph.
1333
1
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1334
1
  int i;
1335
1
  // Clean up the high bit.
1336
4
  for (i = 0; i < backward_prep->exec_symbol_info_size; i++)
1337
3
    backward_info[i].f_wrt &= ~0x4;
1338
1
  assert((backward_info[breakpoint.d].f_wrt & 0x3) != 0x3);
1339
1
  backward_info[breakpoint.d].f_wrt |= 0x4;
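  /* Bits 0x1 and 0x2 of f_wrt track the f and wrt flows; 0x4 is reused as a temporary marker for the breakpoint flow, cleared in the loop above. */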
1340
1
  const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit;
1341
1
  const ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs;
1342
1
  // Going forward to find whether this breakpoint is a source node to some f_wrt nodes.
1343
3
  ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, forw_exec, idx) {
1344
3
    ccv_nnc_graph_backward_info_t* const node = backward_info + idx;
1345
3
    // If it is tagged on the breakpoint flow, but not on both the f and wrt flows, flow through it.
1346
3
    if ((node->f_wrt & 0x4) && (node->f_wrt & 0x3) != 0x3)
1347
2
      for (i = 0; forw_exec->outgoings && i < forw_exec->outgoings->rnum; i++)
1348
1
      {
1349
1
        const int outgoing_idx = *(int*)ccv_array_get(forw_exec->outgoings, i);
1350
1
        ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx;
1351
1
        // If this is an f_wrt node, concatenate.
1352
1
        if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3)
1353
1
            ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[outgoing_idx].symbol, noop);
1354
1
        outgoing_node->f_wrt |= 0x4;
1355
1
      }
1356
3
  } ccv_nnc_graph_visit_endfor
1357
1
  // Going backward to find whether this breakpoint is a destination node for some f_wrt nodes.
1358
3
  ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
1359
3
    if ((node->f_wrt & 0x4) && (node->f_wrt & 0x3) != 0x3)
1360
1
      for (i = 0; node->outgoings && i < node->outgoings->rnum; i++)
1361
0
      {
1362
0
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
1363
0
        ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx;
1364
0
        // If this is an f_wrt node, concatenate.
1365
0
        if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3)
1366
0
            ccv_nnc_graph_exec_symbol_concat(graph, noop, autograd_execs[outgoing_idx].symbol);
1367
0
        outgoing_node->f_wrt |= 0x4;
1368
0
      }
1369
3
  } ccv_nnc_graph_visit_endfor
1370
1
}
1371
1372
static ccv_nnc_autograd_tensor_symbol_t* _ccv_nnc_autograd_tensor_symbol_from_tensor_version(ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_autograd_tensor_version_t* const tensor_ver)
1373
18
{
1374
18
  assert(tensor_ver->ref_version);
1375
18
  ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1376
18
  return (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1377
18
}
1378
1379
static void _ccv_nnc_symbolic_graph_set_backward_while_params(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph)
1380
1
{
1381
1
  int i;
1382
5
  for (i = 0; i < backward_prep->graph->tensor_symbol_info->rnum; i++)
1383
4
  {
1384
4
    const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info + i;
1385
4
    if (tensor_symbol_info->assign_ref)
1386
1
    {
1387
1
      const int assign_ref = tensor_symbol_info->assign_ref - 1;
1388
1
      ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + assign_ref);
1389
1
      ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + i);
1390
1
      ccv_nnc_tensor_symbol_info_t* const destination_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, destination_autograd_symbol->symbol.d);
1391
1
      destination_tensor_symbol_info->assign_ref = source_autograd_symbol->symbol.d + 1;
1392
1
    }
1393
4
  }
1394
3
  for (i = 0; i < wrt_symbol_size; i++)
1395
2
  {
1396
2
    const int d = wrt_symbols[i].d;
1397
2
    const int ref_d = (!backward_prep->tensor_symbol_info[d].alias_ref) ? d : backward_prep->tensor_symbol_info[d].alias_ref - 1;
1398
2
    const ccv_nnc_autograd_tensor_version_t* const tensor_ver = backward_prep->autograd_tensor_versions + ref_d;
1399
2
    const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver);
1400
2
    if (init_ref_ver >= 0)
1401
1
    {
1402
1
      const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d;
1403
1
      ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(backward_prep->autograd_tensor_symbols, init_d);
1404
1
      ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + ref_d);
1405
1
      ccv_nnc_tensor_symbol_info_t* const destination_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, destination_autograd_symbol->symbol.d);
1406
1
      if (destination_tensor_symbol_info->assign_ref)
1407
0
        { assert(destination_tensor_symbol_info->assign_ref == source_autograd_symbol->symbol.d + 1); }
1408
1
      destination_tensor_symbol_info->assign_ref = source_autograd_symbol->symbol.d + 1;
1409
1
    }
1410
2
  }
1411
1
}
1412
1413
static void _ccv_nnc_symbolic_graph_backward_gen(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph)
1414
13
{
1415
13
  assert(graph == backward_prep->graph || graph->peer == backward_prep->graph);
1416
13
  const int exec_symbol_info_size = backward_prep->exec_symbol_info_size;
1417
13
  const int forward_symbol_size = graph->tensor_symbol_info->rnum;
1418
13
  const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size;
1419
13
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info;
1420
13
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
1421
13
  int i, j, k;
1422
13
  ccv_array_t* const autograd_tensor_symbols = backward_prep->autograd_tensor_symbols;
1423
13
  // Generate required symbols based on the information gathered above.
1424
157
  for (i = 0; i < autograd_tensor_symbols->rnum; i++)
1425
144
  {
1426
144
    ccv_nnc_autograd_tensor_symbol_t* symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, i);
1427
144
    assert(symbol->d >= 0);
1428
144
    assert(symbol->d < tensor_symbol_info_size);
1429
144
    const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + symbol->d;
1430
144
    if (!symbol->alias_ref)
1431
98
    {
1432
98
      assert(!forw_symbol->alias_ref);
1433
98
      symbol->symbol = ccv_nnc_tensor_symbol_new(graph, forw_symbol->info, 0);
1434
98
      ccv_nnc_tensor_symbol_set_flags(graph, symbol->symbol, symbol->flags);
1435
46
    } else {
1436
46
      assert(forw_symbol->alias_ref);
1437
46
      assert(symbol->flags == 0); // We don't set flags on alias.
1438
46
      // Due to our generation order, this must be after the original symbol is created.
1439
46
      ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, symbol->alias_ref - 1);
1440
46
      symbol->symbol = ccv_nnc_tensor_symbol_alias_new(graph, ref->symbol, forw_symbol->ofs, forw_symbol->inc, forw_symbol->info, 0);
1441
46
    }
1442
144
  }
1443
13
  ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info;
1444
13
  ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs;
1445
13
  ccv_array_t* symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1446
13
  ccv_array_t* sub_f_symbols = 0;
1447
13
  ccv_array_t* sub_wrt_symbols = 0;
1448
13
  ccv_array_t* sub_execs = 0;
1449
63
  for (i = 0; i < exec_symbol_info_size; i++)
1450
50
  {
1451
50
    // This is not going to be an interesting node. Skip.
1452
50
    if ((backward_info[i].f_wrt & 0x3) != 0x3)
1453
3
      continue;
1454
47
    ccv_nnc_graph_backward_info_t* const back_info = backward_info + i;
1455
47
    ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
1456
47
    if (back_exec->cmd.cmd == CCV_NNC_NOOP)
1457
1
    {
1458
1
      back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, 0, 0, 0, 0, 0);
1459
1
      continue;
1460
1
    }
1461
46
    ccv_array_clear(symbols);
1462
46
    const ccv_nnc_graph_exec_symbol_info_t* const forw_exec = exec_symbol_info + i;
1463
46
    if (forw_exec->graph_ref)
1464
1
    {
1465
1
      const int graph_ref = forw_exec->graph_ref - 1;
1466
1
      ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref;
1467
1
      ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new();
1468
1
      sub_graph->peer = sub_prep->graph;
1469
1
      if (!sub_wrt_symbols)
1470
1
        sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1471
1
      // We are done; need to redo the above for sub_prep, and it has to be successful now.
1472
1
      if (!sub_f_symbols)
1473
1
        sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
1474
1
      _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols);
1475
1
      _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph);
1476
1
      back_exec->symbol = ccv_nnc_symbolic_graph_while(graph, sub_graph, forw_exec->name);
1477
1
      if (!sub_execs)
1478
1
        sub_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0);
1479
1
      ccv_array_clear(sub_execs);
1480
1
      // Find the breakpoints in forward graph, creating the reverse one.
1481
2
      for (j = 0; j < sub_prep->graph->breakpoint_size; j++)
1482
1
      {
1483
1
        const int d = sub_prep->graph->breakpoints[j].d;
1484
1
        if (sub_prep->autograd_execs[d].symbol.graph)
1485
0
          ccv_array_push(sub_execs, &sub_prep->autograd_execs[d].symbol);
1486
1
        else
1487
1
          _ccv_nnc_add_backward_breakpoint_for_symbol(sub_prep, sub_prep->graph->breakpoints[j], sub_graph, sub_execs);
1488
1
      }
1489
1
      ccv_nnc_symbolic_graph_set_while_expr(sub_graph, _ccv_nnc_noop_while_expr, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_execs, 0), sub_execs->rnum);
1490
1
      ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1491
1
      _ccv_nnc_symbolic_graph_set_backward_while_params(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph);
1492
2
      for (j = 0; j < back_exec->input_size; j++)
1493
1
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1494
1
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
1495
1
      const int input_size = symbols->rnum;
1496
3
      for (j = 0; j < back_exec->output_size; j++)
1497
2
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1498
1
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
1499
1
      const int output_size = symbols->rnum - input_size;
1500
1
      const int p_idx = sub_prep->graph->p_idx - 1;
1501
1
      assert(back_exec->input_size == forw_exec->output_size);
1502
1
      k = 0;
1503
2
      for (j = 0; j < back_exec->input_size; j++)
1504
1
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1505
1
        {
1506
1
          const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->outputs[j];
1507
1
          const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1;
1508
1
          assert(s_idx >= 0);
1509
1
          const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx);
1510
1
          ccv_nnc_tensor_symbol_pass(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol);
1511
1
          ++k;
1512
1
        }
1513
1
      assert(back_exec->output_size == forw_exec->input_size);
1514
3
      for (j = 0; j < back_exec->output_size; j++)
1515
2
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1516
1
        {
1517
1
          const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->inputs[j];
1518
1
          const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1;
1519
1
          assert(s_idx >= 0);
1520
1
          const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx);
1521
1
          ccv_nnc_tensor_symbol_pass(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol);
1522
1
          ++k;
1523
1
        }
1524
1
      ccv_nnc_graph_exec_symbol_set_io(graph, back_exec->symbol, ccv_array_get(symbols, 0), input_size, ccv_array_get(symbols, input_size), output_size);
1525
45
    } else {
1526
45
      // Gradient inputs.
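      /* The backward exec's inputs are laid out as: gradients w.r.t. the forward outputs, then the forward inputs, then the forward outputs; NO_TENSOR_SYMBOL pads unused slots so positions stay aligned. */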
1527
90
      for (j = 0; j < back_exec->input_size; j++)
1528
45
        if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
1529
45
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
1530
45
        else
1531
0
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1532
45
      // Inputs from forward function.
1533
125
      for (j = 0; j < forw_exec->input_size; j++)
1534
80
        if (!(back_info->input_bitmasks[(j + back_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size))))
1535
26
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1536
54
        else {
1537
54
          const ccv_nnc_tensor_symbol_t symbol = {
1538
54
            .info = tensor_symbol_info[forw_exec->inputs[j]].info,
1539
54
            .d = forw_exec->inputs[j],
1540
54
            .graph = backward_prep->graph
1541
54
          };
1542
54
          if (graph == backward_prep->graph)
1543
51
            ccv_array_push(symbols, &symbol);
1544
3
          else { // Otherwise, create a new symbol, and set its peer to the old symbol.
1545
3
            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->inputs[j]].info, tensor_symbol_info[forw_exec->inputs[j]].name);
1546
3
            ccv_nnc_tensor_symbol_set_peer(graph, new_symbol, symbol);
1547
3
            ccv_array_push(symbols, &new_symbol);
1548
3
          }
1549
54
        }
1550
45
      // Outputs from forward function.
1551
90
      for (j = 0; j < forw_exec->output_size; j++)
1552
45
        if (!(back_info->input_bitmasks[(j + back_exec->input_size + forw_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size + forw_exec->input_size))))
1553
28
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1554
17
        else {
1555
17
          const ccv_nnc_tensor_symbol_t symbol = {
1556
17
            .info = tensor_symbol_info[forw_exec->outputs[j]].info,
1557
17
            .d = forw_exec->outputs[j],
1558
17
            .graph = backward_prep->graph
1559
17
          };
1560
17
          if (graph == backward_prep->graph)
1561
16
            ccv_array_push(symbols, &symbol);
1562
1
          else { // Otherwise, create a new symbol, and set its peer to the old symbol.
1563
1
            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->outputs[j]].info, tensor_symbol_info[forw_exec->outputs[j]].name);
1564
1
            ccv_nnc_tensor_symbol_set_peer(graph, new_symbol, symbol);
1565
1
            ccv_array_push(symbols, &new_symbol);
1566
1
          }
1567
17
        }
1568
125
      for (j = 0; j < back_exec->output_size; j++)
1569
80
        if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
1570
77
          ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
1571
80
        else
1572
3
          ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
1573
45
      back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, ccv_array_get(symbols, 0), back_exec->input_size + forw_exec->input_size + forw_exec->output_size, ccv_array_get(symbols, back_exec->input_size + forw_exec->input_size + forw_exec->output_size), back_exec->output_size, 0);
1574
45
    }
1575
46
  }
1576
13
  if (sub_f_symbols)
1577
1
    ccv_array_free(sub_f_symbols);
1578
13
  if (sub_wrt_symbols)
1579
1
    ccv_array_free(sub_wrt_symbols);
1580
13
  if (sub_execs)
1581
1
    ccv_array_free(sub_execs);
1582
13
  ccv_array_t* const sum_execs = backward_prep->sum_execs;
1583
25
  for (i = 0; i < sum_execs->rnum; i++)
1584
12
  {
1585
12
    ccv_nnc_sum_graph_exec_symbol_t* exec = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, i);
1586
12
    assert(exec->input_size);
1587
12
    ccv_array_clear(symbols);
1588
12
    // This is to sum.
1589
41
    for (j = 0; j < exec->input_size; j++)
1590
29
      ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, exec->inputs[j]))->symbol));
1591
12
    ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0);
1592
12
    exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, ccv_array_get(symbols, 0), exec->input_size, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, exec->output))->symbol), 1, 0);
1593
12
  }
1594
13
  ccv_array_free(symbols);
1595
63
  for (i = 0; i < exec_symbol_info_size; i++)
1596
50
  {
1597
50
    // This is not going to be an interesting node. Skip.
1598
50
    if ((backward_info[i].f_wrt & 0x3) != 0x3)
1599
3
      continue;
1600
47
    ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
1601
47
    // If on the same graph, we cannot decide whether it is before or after the forw_exec, so enforce that it comes after forw_exec.
1602
47
    if (graph == backward_prep->graph)
1603
45
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
1604
45
        .d = i,
1605
45
        .graph = graph
1606
45
      }, back_exec->symbol);
1607
47
    if (back_exec->outgoings)
1608
108
      for (j = 0; j < back_exec->outgoings->rnum; j++)
1609
69
      {
1610
69
        int d = *(int*)ccv_array_get(back_exec->outgoings, j);
1611
69
        if (d < exec_symbol_info_size)
1612
37
          ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, autograd_execs[d].symbol);
1613
69
        else
1614
32
          ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, d - exec_symbol_info_size))->symbol);
1615
69
      }
1616
47
  }
1617
25
  for (i = 0; i < sum_execs->rnum; i++)
1618
12
  {
1619
12
    ccv_nnc_sum_graph_exec_symbol_t* exec = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, i);
1620
12
    if (exec->outgoings)
1621
14
      for (j = 0; j < exec->outgoings->rnum; j++)
1622
7
      {
1623
7
        int d = *(int*)ccv_array_get(exec->outgoings, j);
1624
7
        if (d < exec_symbol_info_size)
1625
7
          ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, autograd_execs[d].symbol);
1626
7
        else
1627
0
          ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, d - exec_symbol_info_size))->symbol);
1628
7
      }
1629
12
  }
1630
13
  // Now everything is done; set the metadata on the graph so that we can look up the backward symbols later.
1631
13
  if (graph->backward_tensor_symbols)
1632
0
    graph->backward_tensor_symbols = (int*)ccrealloc(graph->backward_tensor_symbols, sizeof(int) * (graph->tensor_symbol_info->rnum - forward_symbol_size + tensor_symbol_info_size));
1633
13
  else
1634
13
    graph->backward_tensor_symbols = (int*)ccmalloc(sizeof(int) * (graph->tensor_symbol_info->rnum - forward_symbol_size + tensor_symbol_info_size));
1635
13
  graph->backward_exec_symbols = graph->backward_tensor_symbols + tensor_symbol_info_size;
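  /* One allocation, two views: the first tensor_symbol_info_size entries map forward tensor symbols to their gradient symbols; the remainder maps each backward tensor symbol to the exec that computes it. */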
1636
13
  graph->forward_symbol_size = forward_symbol_size;
1637
13
  graph->backward_symbol_size = graph->tensor_symbol_info->rnum - forward_symbol_size;
1638
122
  for (i = 0; i < tensor_symbol_info_size; i++)
1639
109
    graph->backward_tensor_symbols[i] = -1;
1640
161
  for (i = 0; i < graph->backward_symbol_size; i++)
1641
148
    graph->backward_exec_symbols[i] = -1;
1642
13
  ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep->autograd_tensor_versions;
1643
13
  // Assigning for wrt symbols.
1644
43
  for (i = 0; i < wrt_symbol_size; i++)
1645
30
  {
1646
30
    const int d = wrt_symbols[i].d;
1647
30
    assert(d >= 0);
1648
30
    assert(d < tensor_symbol_info_size);
1649
30
    // If this wrt symbol is an alias, create an extra alias for it.
1650
30
    ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + d;
1651
30
    assert(tensor_ver->ref_version);
1652
30
    ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
1653
30
    ccv_nnc_autograd_tensor_symbol_t* autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
1654
30
    graph->backward_tensor_symbols[d] = autograd_symbol->symbol.d;
1655
30
    assert(autograd_symbol->symbol.d >= forward_symbol_size);
1656
30
    const int dd = autograd_symbol->symbol.d - forward_symbol_size;
1657
30
    const int x = tensor_ref->x;
1658
30
    if (tensor_ref->exec_registry && tensor_ref->exec_registry->rnum) // Create no-op node.
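    /* exec_registry lists additional execs that wrote into this aliased gradient; the no-op below joins them so anything reading the gradient waits for every writer. */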
1659
2
    {
1660
2
      ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
1661
2
      if (x < exec_symbol_info_size)
1662
2
        ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
1663
2
      else
1664
0
        ccv_nnc_graph_exec_symbol_concat(graph, ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size))->symbol, noop);
1665
6
      for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
1666
4
      {
1667
4
        const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
1668
4
        assert(x >= 0); /* Otherwise, this is an initialization tensor, which is impossible to sum up. */
1669
4
        assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference to a sum operation.
1670
4
        ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
1671
4
      }
1672
2
      graph->backward_exec_symbols[dd] = noop.d;
1673
28
    } else {
1674
28
      if (x < exec_symbol_info_size)
1675
21
        graph->backward_exec_symbols[dd] = autograd_execs[x].symbol.d;
1676
28
      else
1677
7
        graph->backward_exec_symbols[dd] = ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size))->symbol.d;
1678
28
    }
1679
30
  }
1680
13
  // Assigning for f symbols.
1681
26
  for (i = 0; i < f_symbol_size; i++)
1682
13
  {
1683
13
    const int d = f_symbols[i].d;
1684
13
    assert(d >= 0);
1685
13
    assert(d < tensor_symbol_info_size);
1686
13
    ccv_nnc_autograd_tensor_symbol_t* autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(autograd_tensor_symbols, autograd_tensor_versions + d);
1687
13
    graph->backward_tensor_symbols[d] = autograd_symbol->symbol.d;
1688
13
    // Cannot find relevant backward exec symbols for f.
1689
13
  }
1690
13
}
1691
1692
void ccv_nnc_symbolic_graph_backward(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size)
1693
12
{
1694
12
  int i;
1695
12
  // TODO: f symbols cannot be aliases yet.
1696
24
  for (i = 0; i < f_symbol_size; i++)
1697
12
  {
1698
12
    assert(f_symbols[i].graph == graph); // f symbol has to be in the current graph.
1699
12
    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, f_symbols[i].d))->alias_ref);
1700
12
  }
1701
12
  // TODO: wrt symbols cannot be aliases yet.
1702
40
  for (i = 0; i < wrt_symbol_size; i++)
1703
28
  {
1704
28
    assert(wrt_symbols[i].graph == graph);
1705
28
    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, wrt_symbols[i].d))->alias_ref);
1706
28
  }
1707
12
  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
1708
12
  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
1709
12
  assert(exec_symbol_info_size > 0);
1710
12
  assert(tensor_symbol_info_size > 0);
1711
12
  ccv_nnc_symbolic_graph_backward_prep_t backward_prep = _ccv_nnc_symbolic_graph_backward_prep(graph, sources, source_size, destinations, destination_size);
1712
12
  const int flag = _ccv_nnc_symbolic_graph_backward_prep_prune_ops(&backward_prep, sources, source_size, destinations, destination_size, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size);
1713
12
  assert(flag && "must be able to generate path from f symbols to wrt symbols.");
1714
12
  _ccv_nnc_symbolic_graph_backward_prep_gen(&backward_prep, sources, source_size, destinations, destination_size, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, 0);
1715
12
  _ccv_nnc_symbolic_graph_backward_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, graph);
1716
12
  _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep);
1717
12
}
1718
1719
ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
1720
19
{
1721
19
  assert(symbol.d >= 0);
1722
19
  assert(symbol.d < graph->forward_symbol_size);
1723
19
  assert(graph->backward_tensor_symbols[symbol.d] >= 0);
1724
19
  ccv_nnc_tensor_symbol_t tensor = {
1725
19
    .d = graph->backward_tensor_symbols[symbol.d],
1726
19
    .graph = graph,
1727
19
    .info = symbol.info
1728
19
  };
1729
19
  return tensor;
1730
19
}
1731
1732
ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
1733
9
{
1734
9
  assert(symbol.d >= graph->forward_symbol_size);
1735
9
  assert(symbol.d < graph->forward_symbol_size + graph->backward_symbol_size);
1736
9
  const int dd = symbol.d - graph->forward_symbol_size;
1737
9
  assert(graph->backward_exec_symbols[dd] >= 0);
1738
9
  ccv_nnc_graph_exec_symbol_t exec = {
1739
9
    .d = graph->backward_exec_symbols[dd],
1740
9
    .graph = graph
1741
9
  };
1742
9
  return exec;
1743
9
}
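/* A minimal usage sketch (for illustration only, not part of this file): given a
 * forward graph where loss is the f symbol and weight is the wrt symbol, a caller
 * would do roughly:
 *
 *   ccv_nnc_symbolic_graph_backward(graph, sources, source_size, destinations, destination_size, &loss, 1, &weight, 1);
 *   const ccv_nnc_tensor_symbol_t dweight = ccv_nnc_tensor_symbol_for_backward(graph, weight);
 *   const ccv_nnc_graph_exec_symbol_t dweight_exec = ccv_nnc_graph_exec_symbol_for_backward(graph, dweight);
 *
 * loss, weight, sources and destinations are hypothetical names assumed to come
 * from the forward graph construction; note that ccv_nnc_graph_exec_symbol_for_backward
 * takes the backward tensor symbol, per the asserts above. */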