Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_parallel.c
Line| Count|Source
   1|      |#include "ccv_nnc.h"
   2|      |#include "ccv_nnc_easy.h"
   3|      |#include "ccv_nnc_internal.h"
   4|      |#include "ccv_internal.h"
   5|      |#include "_ccv_nnc_symbolic_graph.h"
   6|      |
   7|      |// MARK - Level-3.5 API
   8|      |
   9|      |enum {
  10|      |  CCV_NNC_PARALLEL_BROADCAST = 0x1,
  11|      |  CCV_NNC_PARALLEL_ALLREDUCER = 0x2,
  12|      |  CCV_NNC_PARALLEL_REDUCER = 0x3,
  13|      |};
  14|      |
  15|      |static int _ccv_nnc_exec_inputs_contain(const ccv_nnc_graph_exec_symbol_info_t* const node, const int d)
  16| 1.46k|{
  17| 1.46k|  int i;
  18| 4.82k|  for (i = 0; i < node->input_size; i++)
  19| 3.88k|    if (node->inputs[i] == d)
  20|   520|      return 1;
  21|   940|  return 0;
  22| 1.46k|}
  23|      |
  24|      |void ccv_nnc_symbolic_graph_data_parallel(ccv_nnc_symbolic_graph_t* const graph, const int parallel, const ccv_nnc_tensor_symbol_t* const broadcasts, const int broadcast_size, const ccv_nnc_tensor_symbol_t* const allreducers, const int allreducer_size, ccv_nnc_tensor_symbol_t* const allreducer_outs, const ccv_nnc_tensor_symbol_t* const reducers, const int reducer_size, ccv_nnc_tensor_symbol_t* const reducer_outs, const int reduce_op_type, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
  25|    17|{
  26|    17|  assert(reduce_op_type == CCV_NNC_PARALLEL_REDUCE_OP_SUM);
  27|    17|  const int parallel_count = (parallel == 0) ? ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU) : parallel;
  28|    17|  if (parallel_count == 1)
  29|     0|    return;
  30|    17|  assert(parallel_count > 1);
  31|    34|  ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
  32|     0|  int i, j, k;
  33|      |  // Tensor symbol has to be on device 0 or any.
  34|   570|  ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), node, idx) {
  35| 2.39k|    for (i = 0; i < node->input_size; i++)
  36| 1.82k|      if (node->inputs[i] >= 0)
  37| 1.32k|      {
  38| 1.32k|        ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->inputs[i]);
  39| 1.32k|        if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY &&
  40| 1.32k|          CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) != CCV_COMPUTE_DEVICE_ANY)
  41| 1.32k|          { assert(CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_000); }
  42| 1.32k|      }
  43| 1.53k|    for (i = 0; i < node->output_size; i++)
  44|   960|      if (node->outputs[i] >= 0)
  45|   914|      {
  46|   914|        ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->outputs[i]);
  47|   914|        if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY &&
  48|   914|          CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) != CCV_COMPUTE_DEVICE_ANY)
  49|   911|          { assert(CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_000); }
  50|   914|      }
  51|   570|  } ccv_nnc_graph_visit_endfor
  52|      |  // Run infer in the graph to get all tensors shaped.
  53|    17|  ccv_nnc_symbolic_graph_symbol_infer(graph, visit, sources, source_size, destinations, destination_size, 0, 0, (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0), (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0));
  54|      |  // Set ANY device to default device. Make a list of execution nodes / tensors to be duplicated.
  55|    17|  ccv_array_t* const dup_tensors = ccv_array_new(sizeof(int), 0, 0);
  56|    17|  ccv_array_t* const dup_execs = ccv_array_new(sizeof(int), 0, 0);
  57|    17|  ccv_array_t* const broadcast_reduce_execs = ccv_array_new(sizeof(int), 0, 0);
  58|    17|  int* const allreduce_inputs = allreducer_size > 0 ? (int*)ccmalloc(sizeof(int) * allreducer_size) : 0;
  59|   149|  for (i = 0; i < allreducer_size; i++)
  60|   132|  {
  61|   132|    if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
  62|     0|      allreduce_inputs[i] = CCV_NNC_NO_TENSOR_SYMBOL;
  63|   132|    else
  64|   132|      allreduce_inputs[i] = ccv_nnc_tensor_symbol_new(graph, ccv_nnc_tensor_symbol_params(graph, allreducers[i]), 0).d;
  65|   132|  }
  66|    17|  const int tensor_symbol_size = graph->tensor_symbol_info->rnum;
  67|    17|  const int graph_exec_symbol_size = graph->exec_symbol_info->rnum;
  68|    17|  int* const tensor_flags = (int*)cccalloc(tensor_symbol_size + graph_exec_symbol_size, sizeof(int));
  69|    17|  int* const exec_flags = tensor_flags + tensor_symbol_size;
  70|    29|  for (i = 0; i < broadcast_size; i++)
  71|    12|  {
  72|    12|    if (broadcasts[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
  73|     0|      continue;
  74|      |    // Doesn't support alias for these.
  75|    12|    tensor_flags[broadcasts[i].d] = CCV_NNC_PARALLEL_BROADCAST;
  76|    12|    assert(graph == broadcasts[i].graph);
  77|    12|    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, broadcasts[i].d))->alias_ref);
  78|    12|  }
  79|    17|  int* const allreduce_producers = allreducer_size > 0 ? (int*)cccalloc(tensor_symbol_size, sizeof(int)) : 0;
  80|   149|  for (i = 0; i < allreducer_size; i++)
  81|   132|  {
  82|   132|    if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
  83|     0|      continue;
  84|      |    // Doesn't support alias for these.
  85|   132|    tensor_flags[allreducers[i].d] = CCV_NNC_PARALLEL_ALLREDUCER;
  86|   132|    assert(graph == allreducers[i].graph);
  87|   132|    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, allreducers[i].d))->alias_ref);
  88|   132|  }
  89|    25|  for (i = 0; i < reducer_size; i++)
  90|     8|  {
  91|     8|    if (reducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
  92|     0|      continue;
  93|      |    // Doesn't support alias for these.
  94|     8|    tensor_flags[reducers[i].d] = CCV_NNC_PARALLEL_REDUCER;
  95|     8|    assert(graph == reducers[i].graph);
  96|     8|    assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, reducers[i].d))->alias_ref);
  97|     8|  }
  98|      |  // No overlap between broadcasts, allreducers, reducers.
  99|    29|  for (i = 0; i < broadcast_size; i++)
 100|    12|  {
 101|    12|    if (broadcasts[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
 102|     0|      continue;
 103|    44|    for (j = 0; j < reducer_size; j++)
 104|    32|      { assert(broadcasts[i].d != reducers[j].d); }
 105|    28|    for (j = 0; j < allreducer_size; j++)
 106|    16|      { assert(broadcasts[i].d != allreducers[j].d); }
 107|    12|  }
 108|   149|  for (i = 0; i < allreducer_size; i++)
 109|   132|  {
 110|   132|    if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
 111|     0|      continue;
 112|   132|    for (j = 0; j < reducer_size; j++)
 113|     0|      { assert(allreducers[i].d != reducers[j].d); }
 114|   132|  }
 115|   570|  ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), node, idx) {
 116|   570|    int parallelizable_data = 0;
 117|   570|    int reduce_inputs = 0;
 118|   570|    int broadcast_outputs = 0;
 119| 2.39k|    for (i = 0; i < node->input_size; i++)
 120| 1.82k|      if (node->inputs[i] >= 0)
 121| 1.32k|      {
 122| 1.32k|        ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->inputs[i]);
 123| 1.32k|        if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY)
 124| 1.32k|        {
 125| 1.32k|          if (CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_ANY)
 126|     0|            CCV_TENSOR_SET_DEVICE_ID(tensor_symbol->info.type, 0);
 127|      |          // Don't support alias for broadcast / allreducer / reducer.
 128| 1.32k|          assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_BROADCAST);
 129| 1.32k|          assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_ALLREDUCER);
 130| 1.32k|          assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_REDUCER);
 131| 1.32k|          const int d = node->inputs[i];
 132| 1.32k|          if (tensor_flags[d] == CCV_NNC_PARALLEL_REDUCER)
 133|     8|            reduce_inputs = 1;
 134| 1.32k|          parallelizable_data = 1;
 135| 1.32k|        }
 136| 1.32k|      }
 137| 1.53k|    for (i = 0; i < node->output_size; i++)
 138|   960|      if (node->outputs[i] >= 0)
 139|   914|      {
 140|   914|        ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->outputs[i]);
 141|   914|        if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY)
 142|   914|        {
 143|   914|          if (CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_ANY)
 144|     0|            CCV_TENSOR_SET_DEVICE_ID(tensor_symbol->info.type, 0);
 145|      |          // Don't support alias for broadcast / allreducer / reducer.
 146|   914|          assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_BROADCAST);
 147|   914|          assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_ALLREDUCER);
 148|   914|          assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_REDUCER);
 149|   914|          const int d = node->outputs[i];
 150|   914|          if (tensor_flags[d] == CCV_NNC_PARALLEL_BROADCAST)
 151|     0|            broadcast_outputs = 1;
 152|   914|          else if (tensor_flags[d] == CCV_NNC_PARALLEL_ALLREDUCER)
 153|   132|            allreduce_producers[d] = idx + 1;
 154|   914|          parallelizable_data = 1;
 155|   914|        }
 156|   914|      }
 157|   570|    assert(!(broadcast_outputs && reduce_inputs)); // This node cannot be both broadcast and reducer.
 158|   570|    if (broadcast_outputs ^ reduce_inputs)
 159|     8|    {
 160|     8|      if (broadcast_outputs)
 161|     0|        exec_flags[idx] = CCV_NNC_PARALLEL_BROADCAST;
 162|     8|      else if (reduce_inputs)
 163|     8|        exec_flags[idx] = CCV_NNC_PARALLEL_REDUCER;
 164|     8|      ccv_array_push(broadcast_reduce_execs, &idx);
 165|   562|    } else if (parallelizable_data && !broadcast_outputs && !reduce_inputs) {
 166|      |      // If this node contains GPU data that need to be parallelized, and this node itself is not a broadcast node or a reducer node..
 167|   562|      ccv_array_push(dup_execs, &idx);
 168| 2.36k|      for (i = 0; i < node->input_size; i++)
 169| 1.80k|        if (node->inputs[i] >= 0)
 170| 1.30k|        {
 171| 1.30k|          ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->inputs[i]);
 172| 1.30k|          if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY)
 173| 1.30k|          {
 174|      |            // Add the symbol alias to first.
 175| 1.30k|            if (tensor_symbol->alias_ref)
 176|   112|              ccv_array_add_unique_int(dup_tensors, tensor_symbol->alias_ref - 1);
 177| 1.30k|            ccv_array_add_unique_int(dup_tensors, node->inputs[i]);
 178| 1.30k|          }
 179| 1.30k|        }
 180| 1.50k|      for (i = 0; i < node->output_size; i++)
 181|   944|        if (node->outputs[i] >= 0)
 182|   898|        {
 183|   898|          ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->outputs[i]);
 184|   898|          if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY)
 185|   898|          {
 186|   898|            if (tensor_symbol->alias_ref)
 187|    72|              ccv_array_add_unique_int(dup_tensors, tensor_symbol->alias_ref - 1);
 188|   898|            ccv_array_add_unique_int(dup_tensors, node->outputs[i]);
 189|   898|          }
 190|   898|        }
 191|   562|    }
 192|   570|  } ccv_nnc_graph_visit_endfor
 193|      |  // Now, actually create these tensors.
 194|    17|  if (!graph->data_parallel.tensor_symbol_idx)
 195|    17|    graph->data_parallel.tensor_symbol_idx = (int*)ccmalloc(sizeof(int) * (parallel_count - 1) * tensor_symbol_size);
 196|     0|  else if (graph->data_parallel.tensor_symbol_size * (graph->data_parallel.count - 1) != tensor_symbol_size * (parallel_count - 1))
 197|      |    // This may shrink too, but that is OK.
 198|     0|    graph->data_parallel.tensor_symbol_idx = (int*)ccrealloc(graph->data_parallel.tensor_symbol_idx, sizeof(int) * (parallel_count - 1) * tensor_symbol_size);
 199|    17|  graph->data_parallel.tensor_symbol_size = tensor_symbol_size;
 200|    17|  graph->data_parallel.count = parallel_count;
 201|    17|  int* const dup_tensor_idx = graph->data_parallel.tensor_symbol_idx;
 202|      |  // dup_tensor_idx is the array starts with 0 here.
 203| 4.24k|  for (i = 0; i < (parallel_count - 1) * tensor_symbol_size; i++)
 204| 4.22k|    dup_tensor_idx[i] = -1;
 205|      |  // Make the duplicated tensors (on different devices).
 206| 1.34k|  for (i = 0; i < dup_tensors->rnum; i++)
 207| 1.32k|  {
 208| 1.32k|    const int d = *(int*)ccv_array_get(dup_tensors, i);
 209| 1.32k|    ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, d);
 210| 1.32k|    ccv_nnc_tensor_param_t info = tensor_symbol->info;
 211| 1.32k|    const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type);
 212| 1.32k|    const int flags = tensor_symbol->flags;
 213| 1.32k|    if (tensor_symbol->alias_ref)
 214|   136|    {
 215|   136|      const int alias_ref = tensor_symbol->alias_ref - 1;
 216|   536|      for (j = 0; j < parallel_count - 1; j++)
 217|   400|      {
 218|   400|        const int dup_d = dup_tensor_idx[alias_ref * (parallel_count - 1) + j];
 219|   400|        if (j + 1 != device_id)
 220|   400|          CCV_TENSOR_SET_DEVICE_ID(info.type, j + 1); // Set the device id.
 221|     0|        else
 222|     0|          CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
 223|   400|        assert(dup_d >= 0);
 224|      |        // Get tensor symbol again, it may be invalid after added new symbol (we use it for ofs and inc).
 225|   400|        ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, d);
 226|   400|        const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_alias_new(graph, (ccv_nnc_tensor_symbol_t){
 227|   400|          .d = dup_d,
 228|   400|          .graph = graph,
 229|   400|        }, tensor_symbol->ofs, tensor_symbol->stride, info, 0);
 230|   400|        ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
 231|   400|        dup_tensor_idx[d * (parallel_count - 1) + j] = new_symbol.d;
 232|   400|      }
 233| 1.19k|    } else {
 234| 4.60k|      for (j = 0; j < parallel_count - 1; j++)
 235| 3.41k|      {
 236| 3.41k|        if (j + 1 != device_id)
 237| 3.41k|          CCV_TENSOR_SET_DEVICE_ID(info.type, j + 1); // Set the device id.
 238|     0|        else
 239|     0|          CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
 240| 3.41k|        const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0);
 241| 3.41k|        ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
 242| 3.41k|        dup_tensor_idx[d * (parallel_count - 1) + j] = new_symbol.d;
 243| 3.41k|      }
 244| 1.19k|    }
 245| 1.32k|  }
 246|    17|  ccv_array_free(dup_tensors);
 247|      |  // Now, create execs.
 248|    17|  if (!graph->data_parallel.exec_symbol_idx)
 249|    17|    graph->data_parallel.exec_symbol_idx = (int*)ccmalloc(sizeof(int) * (parallel_count - 1) * graph_exec_symbol_size);
 250|     0|  else if (graph->data_parallel.exec_symbol_size * (graph->data_parallel.count - 1) != graph_exec_symbol_size * (parallel_count - 1))
 251|      |    // This may shrink too, but that is OK.
 252|     0|    graph->data_parallel.exec_symbol_idx = (int*)ccrealloc(graph->data_parallel.exec_symbol_idx, sizeof(int) * (parallel_count - 1) * graph_exec_symbol_size);
 253|    17|  graph->data_parallel.exec_symbol_size = graph_exec_symbol_size;
 254|    17|  int* const dup_exec_idx = graph->data_parallel.exec_symbol_idx;
 255|      |  // dup_exec_idx is the array starts with 0 here.
 256| 1.62k|  for (i = 0; i < (parallel_count - 1) * graph_exec_symbol_size; i++)
 257| 1.60k|    dup_exec_idx[i] = -1;
 258|    17|  int max_io_size = 1 + parallel_count;
 259|      |  // Now make the duplicated execs nodes (on different devices).
 260|   579|  for (i = 0; i < dup_execs->rnum; i++)
 261|   562|  {
 262|   562|    const int d = *(int*)ccv_array_get(dup_execs, i);
 263|   562|    ccv_nnc_graph_exec_symbol_info_t* const exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 264|   562|    max_io_size = ccv_max(max_io_size, exec_symbol->input_size + exec_symbol->output_size);
 265|   562|  }
 266|    17|  max_io_size = ccv_max(max_io_size, parallel_count * 2); // tensors from all parallel_count, the output is to all parallel_count (thus, allreduce).
 267|    17|  ccv_nnc_tensor_symbol_t max_io[max_io_size];
 268|   579|  for (i = 0; i < dup_execs->rnum; i++)
 269|   562|  {
 270|   562|    const int d = *(int*)ccv_array_get(dup_execs, i);
 271| 2.16k|    for (j = 0; j < parallel_count - 1; j++)
 272| 1.59k|    {
 273| 1.59k|      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 274| 1.59k|      const ccv_nnc_cmd_t cmd = exec_symbol->cmd;
 275| 1.59k|      const ccv_nnc_hint_t hint = exec_symbol->hint;
 276| 1.59k|      const int input_size = exec_symbol->input_size;
 277| 1.59k|      const int output_size = exec_symbol->output_size;
 278| 1.59k|      ccv_nnc_tensor_symbol_t* const inputs = max_io;
 279| 6.78k|      for (k = 0; k < input_size; k++)
 280| 5.18k|      {
 281| 5.18k|        const int idx = exec_symbol->inputs[k];
 282| 5.18k|        if (idx >= 0)
 283| 3.75k|          inputs[k].d = dup_tensor_idx[idx * (parallel_count - 1) + j] >= 0 ? dup_tensor_idx[idx * (parallel_count - 1) + j] : idx;
 284| 1.42k|        else
 285| 1.42k|          inputs[k].d = idx;
 286| 5.18k|        inputs[k].graph = idx != CCV_NNC_NO_TENSOR_SYMBOL ? graph : 0;
 287| 5.18k|      }
 288| 1.59k|      ccv_nnc_tensor_symbol_t* const outputs = max_io + input_size;
 289| 4.29k|      for (k = 0; k < output_size; k++)
 290| 2.70k|      {
 291| 2.70k|        const int idx = exec_symbol->outputs[k];
 292| 2.70k|        if (idx >= 0)
 293| 2.57k|          outputs[k].d = dup_tensor_idx[idx * (parallel_count - 1) + j] >= 0 ? dup_tensor_idx[idx * (parallel_count - 1) + j] : idx;
 294|   126|        else
 295|   126|          outputs[k].d = idx;
 296| 2.70k|        outputs[k].graph = idx != CCV_NNC_NO_TENSOR_SYMBOL ? graph : 0;
 297| 2.70k|      }
 298| 1.59k|      const ccv_nnc_graph_exec_symbol_t new_symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, inputs, input_size, outputs, output_size, 0);
 299| 1.59k|      ccv_nnc_graph_exec_symbol_set_hint(graph, new_symbol, hint);
 300| 1.59k|      dup_exec_idx[d * (parallel_count - 1) + j] = new_symbol.d;
 301| 1.59k|    }
 302|   562|  }
 303|      |  // Create new tensors for broadcast / reduce.
 304|    17|  int* const broadcast_reduce_tensor_idx = (int*)cccalloc(tensor_symbol_size, sizeof(int));
 305|    37|  for (i = 0; i < broadcast_size + reducer_size; i++)
 306|    20|  {
 307|    20|    const int idx = i >= broadcast_size ? reducers[i - broadcast_size].d : broadcasts[i].d;
 308|    20|    if (idx == CCV_NNC_NO_TENSOR_SYMBOL)
 309|     0|      continue;
 310|    20|    ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, idx);
 311|    20|    ccv_nnc_tensor_param_t info = tensor_symbol->info;
 312|    20|    const int flags = tensor_symbol->flags;
 313|      |    // No alias handling.
 314|    20|    assert(!tensor_symbol->alias_ref);
 315|    20|    const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0);
 316|    20|    ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
 317|    20|    broadcast_reduce_tensor_idx[idx] = new_symbol.d + 1;
 318|    20|  }
 319|    17|  int* const broadcast_exec_idx = (int*)cccalloc(tensor_symbol_size, sizeof(int));
 320|    17|  int* const reduce_exec_idx = (int*)cccalloc(tensor_symbol_size, sizeof(int));
 321|      |  // Create node for broadcast (thus, transfer data to different parallel_count) and reducer (transfer data back to a device, and sum).
 322|    25|  for (i = 0; i < broadcast_reduce_execs->rnum; i++)
 323|     8|  {
 324|     8|    const int d = *(int*)ccv_array_get(broadcast_reduce_execs, i);
 325|      |    // For broadcast, we create data transfers as our dup node, and create connections to these data transfers.
 326|     8|    if (exec_flags[d] == CCV_NNC_PARALLEL_BROADCAST)
 327|     0|    {
 328|     0|      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 329|     0|      ccv_nnc_tensor_symbol_t* const inputs = max_io;
 330|     0|      ccv_nnc_tensor_symbol_t* const outputs = max_io + 1;
 331|     0|      const ccv_nnc_graph_exec_symbol_t source = {
 332|     0|        .d = d,
 333|     0|        .graph = graph,
 334|     0|      };
 335|     0|      for (j = 0; j < exec_symbol->output_size; j++)
 336|     0|      {
 337|     0|        const int idx = exec_symbol->outputs[j];
 338|     0|        if (idx >= 0 && tensor_flags[idx] == CCV_NNC_PARALLEL_BROADCAST)
 339|     0|        {
 340|     0|          inputs[0] = (ccv_nnc_tensor_symbol_t){
 341|     0|            .d = idx,
 342|     0|            .graph = graph,
 343|     0|          };
 344|      |          // Reset the tensor flags, it is broadcasted now.
 345|     0|          tensor_flags[idx] = 0;
 346|     0|          outputs[0] = (ccv_nnc_tensor_symbol_t){
 347|     0|            .d = broadcast_reduce_tensor_idx[idx] - 1,
 348|     0|            .graph = graph,
 349|     0|          };
 350|     0|          assert(broadcast_reduce_tensor_idx[idx] > 0);
 351|     0|          for (k = 0; k < parallel_count - 1; k++)
 352|     0|            outputs[k + 1] = (ccv_nnc_tensor_symbol_t){
 353|     0|              .d = dup_tensor_idx[idx * (parallel_count - 1) + k],
 354|     0|              .graph = graph,
 355|     0|            };
 356|     0|          const ccv_nnc_graph_exec_symbol_t bcast = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_BROADCAST_FORWARD(), inputs, 1, outputs, parallel_count, 0);
 357|     0|          ccv_nnc_graph_exec_symbol_concat(graph, source, bcast);
 358|     0|          assert(!broadcast_exec_idx[idx]);
 359|     0|          broadcast_exec_idx[idx] = bcast.d + 1;
 360|     0|        }
 361|     0|      }
 362|     8|    } else if (exec_flags[d] == CCV_NNC_PARALLEL_REDUCER) {
 363|      |      // Gather is a bit more sophisticated, we need to use the new tensor to hold the summed value.
 364|      |      // This is what we have right now, I will use NCCL later.
 365|     8|      ccv_nnc_tensor_symbol_t* const inputs = max_io;
 366|     8|      ccv_nnc_tensor_symbol_t* const outputs = max_io + parallel_count;
 367|     8|      ccv_nnc_graph_exec_symbol_info_t* exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 368|    32|      for (j = 0; j < exec_symbol->input_size; j++)
 369|    24|      {
 370|    24|        const int idx = exec_symbol->inputs[j];
 371|    24|        if (idx >= 0 && tensor_flags[idx] == CCV_NNC_PARALLEL_REDUCER && !reduce_exec_idx[idx])
 372|     8|        {
 373|     8|          inputs[0] = (ccv_nnc_tensor_symbol_t){
 374|     8|            .d = idx,
 375|     8|            .graph = graph,
 376|     8|          };
 377|    16|          for (k = 0; k < parallel_count - 1; k++)
 378|     8|            inputs[k + 1] = (ccv_nnc_tensor_symbol_t){
 379|     8|              .d = dup_tensor_idx[idx * (parallel_count - 1) + k],
 380|     8|              .graph = graph,
 381|     8|            };
 382|     8|          outputs[0] = (ccv_nnc_tensor_symbol_t){
 383|     8|            .d = broadcast_reduce_tensor_idx[idx] - 1,
 384|     8|            .graph = graph,
 385|     8|          };
 386|      |          // Create new symbol for all other tensors to facilitate copy (this is not useful for NCCL, but useful for REF implementation).
 387|     8|          ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, idx);
 388|     8|          ccv_nnc_tensor_param_t info = tensor_symbol->info;
 389|     8|          const int flags = tensor_symbol->flags;
 390|      |          // No alias handling.
 391|     8|          assert(!tensor_symbol->alias_ref);
 392|    16|          for (k = 1; k < parallel_count; k++)
 393|     8|          {
 394|     8|            const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0);
 395|     8|            ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags);
 396|     8|            outputs[k] = new_symbol;
 397|     8|          }
 398|     8|          const ccv_nnc_graph_exec_symbol_t reduce = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_REDUCE_FORWARD(), inputs, parallel_count, outputs, parallel_count, 0);
 399|      |          // Refresh the pointer to keep it up to date.
 400|     8|          exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 401|     8|          ccv_nnc_graph_exec_symbol_concat(graph, reduce, (ccv_nnc_graph_exec_symbol_t){
 402|     8|            .d = d,
 403|     8|            .graph = graph,
 404|     8|          });
 405|     8|          reduce_exec_idx[idx] = reduce.d + 1;
 406|     8|        }
 407|    24|      }
 408|      |      // Update the inputs pointing to the summed value.
 409|    32|      for (j = 0; j < exec_symbol->input_size; j++)
 410|    24|      {
 411|    24|        const int idx = exec_symbol->inputs[j];
 412|    24|        if (idx >= 0 && tensor_flags[idx] == CCV_NNC_PARALLEL_REDUCER)
 413|     8|          exec_symbol->inputs[j] = broadcast_reduce_tensor_idx[idx] - 1;
 414|    24|      }
 415|     8|    }
 416|     8|  }
 417|    17|  ccv_array_free(broadcast_reduce_execs);
 418|      |  // If this tensor is not broadcasted yet, that means there is no exec to generate this tensor. We just generate headless copy.
 419|   579|  for (i = 0; i < dup_execs->rnum; i++)
 420|   562|  {
 421|   562|    const int idx = *(int*)ccv_array_get(dup_execs, i);
 422|   562|    ccv_nnc_graph_exec_symbol_info_t* const node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, idx);
 423|   562|    if (exec_flags[idx] == CCV_NNC_PARALLEL_REDUCER)
 424|     0|      continue;
 425|      |    // We try to make copy command as compact as possible by having one copy for multiple tensors if they used together.
 426|   562|    ccv_nnc_tensor_symbol_t* const inputs = max_io;
 427|   562|    ccv_nnc_tensor_symbol_t* const outputs = max_io + 1;
 428| 2.36k|    for (j = 0; j < node->input_size; j++)
 429| 1.80k|    {
 430| 1.80k|      const int idx = node->inputs[j];
 431|      |      // Now, figure out whether we need to create copy command.
 432| 1.80k|      if (idx >= 0 && idx < tensor_symbol_size && tensor_flags[idx] == CCV_NNC_PARALLEL_BROADCAST)
 433|    12|      {
 434|    12|        inputs[0] = (ccv_nnc_tensor_symbol_t){
 435|    12|          .d = idx,
 436|    12|          .graph = graph,
 437|    12|        };
 438|      |        // Reset the tensor flags, it is broadcasted now.
 439|    12|        tensor_flags[idx] = 0;
 440|    12|        outputs[0] = (ccv_nnc_tensor_symbol_t){
 441|    12|          .d = broadcast_reduce_tensor_idx[idx] - 1,
 442|    12|          .graph = graph,
 443|    12|        };
 444|    12|        assert(broadcast_reduce_tensor_idx[idx] > 0);
 445|    24|        for (k = 0; k < parallel_count - 1; k++)
 446|    12|          outputs[k + 1] = (ccv_nnc_tensor_symbol_t){
 447|    12|            .d = dup_tensor_idx[idx * (parallel_count - 1) + k],
 448|    12|            .graph = graph,
 449|    12|          };
 450|    12|        const ccv_nnc_graph_exec_symbol_t bcast = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_BROADCAST_FORWARD(), inputs, 1, outputs, parallel_count, 0);
 451|    12|        broadcast_exec_idx[idx] = bcast.d + 1;
 452|    12|      }
 453| 1.80k|    }
 454|   562|  }
 455|      |  // Write reducer_outs last, because it may be the same pointer as reducers.
 456|    17|  if (reducer_outs)
 457|     0|    for (i = 0; i < reducer_size; i++)
 458|     0|    {
 459|     0|      reducer_outs[i].d = broadcast_reduce_tensor_idx[i + broadcast_size] - 1;
 460|     0|      reducer_outs[i].graph = graph;
 461|     0|    }
 462|    17|  ccfree(broadcast_reduce_tensor_idx);
 463|    17|  ccv_array_free(dup_execs);
 464|      |  // Now everything is dup'ed, connect them all.
 465|   570|  ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), node, idx) {
 466| 2.39k|    for (i = 0; i < node->input_size; i++)
 467| 1.82k|    {
 468| 1.82k|      const int input = node->inputs[i];
 469|      |      // If it is broadcast worthy.
 470| 1.82k|      if (input >= 0 && input < tensor_symbol_size && broadcast_exec_idx[input])
 471|    27|        ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
 472|    27|          .d = broadcast_exec_idx[input] - 1,
 473|    27|          .graph = graph,
 474|    27|        }, (ccv_nnc_graph_exec_symbol_t){
 475|    27|          .d = idx,
 476|    27|          .graph = graph,
 477|    27|        });
 478| 1.82k|    }
 479|      |    // Check whether this node has outgoing to the reducer node, if so, replace that to the sum node.
 480|   570|    if (node->outgoings && node->outgoings->rnum)
 481| 1.29k|      for (i = 0; i < node->outgoings->rnum; i++)
 482|   871|      {
 483|   871|        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
 484|   871|        if (outgoing_idx >= graph_exec_symbol_size)
 485|     0|          continue;
 486|   871|        if (exec_flags[outgoing_idx] == CCV_NNC_PARALLEL_REDUCER)
 487|    40|          for (j = 0; j < node->output_size; j++)
 488|    28|          {
 489|    28|            const int output_idx = node->outputs[j];
 490|    28|            if (output_idx >= 0 && tensor_flags[output_idx] == CCV_NNC_PARALLEL_REDUCER)
 491|    16|            {
 492|    16|              assert(reduce_exec_idx[output_idx]);
 493|    16|              ccv_array_replace_unique_int(node->outgoings, outgoing_idx, reduce_exec_idx[output_idx] - 1);
 494|    16|            }
 495|    28|          }
 496|   871|      }
 497| 2.17k|    for (i = 0; i < parallel_count - 1; i++)
 498| 1.60k|    {
 499| 1.60k|      const int d = dup_exec_idx[idx * (parallel_count - 1) + i];
 500| 1.60k|      if (d < 0)
 501|     8|        continue;
 502| 1.59k|      const ccv_nnc_graph_exec_symbol_t source = {
 503| 1.59k|        .d = d,
 504| 1.59k|        .graph = graph,
 505| 1.59k|      };
 506|      |      // If it is broadcast worthy.
 507| 6.78k|      for (j = 0; j < node->input_size; j++)
 508| 5.18k|      {
 509| 5.18k|        const int input = node->inputs[j];
 510| 5.18k|        if (input >= 0 && input < tensor_symbol_size && broadcast_exec_idx[input])
 511|    19|          ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
 512|    19|            .d = broadcast_exec_idx[input] - 1,
 513|    19|            .graph = graph,
 514|    19|          }, source);
 515| 5.18k|      }
 516|      |      // If it is reduce worthy.
 517| 4.29k|      for (j = 0; j < node->output_size; j++)
 518| 2.70k|      {
 519| 2.70k|        const int output = node->outputs[j];
 520| 2.70k|        if (output >= 0 && output < tensor_symbol_size && reduce_exec_idx[output])
 521|     8|          ccv_nnc_graph_exec_symbol_concat(graph, source, (ccv_nnc_graph_exec_symbol_t){
 522|     8|            .d = reduce_exec_idx[output] - 1,
 523|     8|            .graph = graph,
 524|     8|          });
 525| 2.70k|      }
 526| 1.59k|      if (node->outgoings && node->outgoings->rnum)
 527| 3.67k|        for (j = 0; j < node->outgoings->rnum; j++)
 528| 2.48k|        {
 529| 2.48k|          const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, j);
 530| 2.48k|          if (outgoing_idx > graph_exec_symbol_size)
 531|     8|            continue;
 532| 2.47k|          const int outgoing_d = dup_exec_idx[outgoing_idx * (parallel_count - 1) + i];
 533| 2.47k|          if (outgoing_d < 0)
 534|     4|            continue;
 535| 2.47k|          ccv_nnc_graph_exec_symbol_concat(graph, source, (ccv_nnc_graph_exec_symbol_t){
 536| 2.47k|            .d = outgoing_d,
 537| 2.47k|            .graph = graph,
 538| 2.47k|          });
 539| 2.47k|        }
 540| 1.59k|    }
 541|   570|  } ccv_nnc_graph_visit_endfor
 542|    17|  ccfree(broadcast_exec_idx);
 543|    17|  ccfree(reduce_exec_idx);
 544|    17|  ccfree(tensor_flags);
 545|    17|  ccv_nnc_graph_visit_free(visit);
 546|      |  // Allreduce is easier to do, we do that the last. It consists of two steps:
 547|      |  // 1. Generate allreduce node for each symbol;
 548|      |  // 2. Disconnect them from source and connect them through all reduce nodes.
 549|   149|  for (i = 0; i < allreducer_size; i++)
 550|   132|  {
 551|   132|    if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL)
 552|     0|      continue;
 553|   132|    ccv_nnc_tensor_symbol_t* const outputs = max_io + parallel_count;
 554|   132|    outputs[0] = allreducers[i];
 555|      |    // Copy over allreducers output symbols (as the old symbol).
 556|   520|    for (j = 0; j < parallel_count - 1; j++)
 557|   388|    {
 558|   388|      const int d = allreducers[i].d;
 559|   388|      outputs[j + 1].graph = graph;
 560|   388|      assert(dup_tensor_idx[d * (parallel_count - 1) + j] >= 0);
 561|   388|      outputs[j + 1].d = dup_tensor_idx[d * (parallel_count - 1) + j];
 562|   388|    }
 563|   132|    ccv_nnc_tensor_symbol_t* const inputs = max_io;
 564|   132|    inputs[0].graph = graph;
 565|   132|    inputs[0].d = allreduce_inputs[i];
 566|      |    // Create identical new tensor symbols
 567|   520|    for (j = 0; j < parallel_count - 1; j++)
 568|   388|    {
 569|   388|      if (dup_tensor_idx[allreduce_inputs[i] * (parallel_count - 1) + j] < 0)
 570|   388|        dup_tensor_idx[allreduce_inputs[i] * (parallel_count - 1) + j] = ccv_nnc_tensor_symbol_new(graph, ccv_nnc_tensor_symbol_params(graph, outputs[j + 1]), 0).d;
 571|   388|      inputs[j + 1].graph = graph;
 572|   388|      inputs[j + 1].d = dup_tensor_idx[allreduce_inputs[i] * (parallel_count - 1) + j];
 573|   388|    }
 574|      |    // Create allreduce node.
 575|   132|    const ccv_nnc_graph_exec_symbol_t allreduce = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_ALLREDUCE_FORWARD(), inputs, parallel_count, outputs, parallel_count, 0);
 576|   132|    const int exec_idx = allreduce_producers[allreducers[i].d] - 1;
 577|   132|    assert(exec_idx >= 0);
 578|   132|    ccv_nnc_graph_exec_symbol_info_t* const node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, exec_idx);
 579|   576|    for (j = 0; j < node->output_size; j++)
 580|   444|      if (node->outputs[j] == outputs[0].d)
 581|   132|        node->outputs[j] = inputs[0].d;
 582|   132|    ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
 583|   132|      .graph = graph,
 584|   132|      .d = exec_idx,
 585|   132|    }, allreduce);
 586|      |    // Remove connections from current node directly to its following nodes (these should follow allreduce node now).
 587|   634|    for (j = 0; j < node->outgoings->rnum;)
 588|   502|    {
 589|   502|      const int d = *(int*)ccv_array_get(node->outgoings, j);
 590|   502|      if (d == allreduce.d)
 591|   132|      {
 592|   132|        ++j;
 593|   132|        continue;
 594|   132|      }
 595|      |      // Get the destination nodes, and check whether they have inputs matches our outputs.
 596|   370|      ccv_nnc_graph_exec_symbol_info_t* const outgoing_node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 597|   370|      if (_ccv_nnc_exec_inputs_contain(outgoing_node, allreducers[i].d))
 598|   132|      {
 599|   132|        ccv_nnc_graph_exec_symbol_concat(graph, allreduce, (ccv_nnc_graph_exec_symbol_t){
 600|   132|          .graph = graph,
 601|   132|          .d = d,
 602|   132|        });
 603|      |        // Remove the connection.
 604|   132|        if (j < node->outgoings->rnum - 1)
 605|   132|          *(int*)ccv_array_get(node->outgoings, j) = *(int*)ccv_array_get(node->outgoings, node->outgoings->rnum - 1);
 606|   132|        --node->outgoings->rnum;
 607|   132|      } else
 608|   238|        ++j;
 609|   370|    }
 610|   520|    for (j = 0; j < parallel_count - 1; j++)
 611|   388|    {
 612|   388|      const int new_exec_idx = dup_exec_idx[exec_idx * (parallel_count - 1) + j];
 613|   388|      ccv_nnc_graph_exec_symbol_info_t* const node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, new_exec_idx);
 614| 1.69k|      for (k = 0; k < node->output_size; k++)
 615| 1.30k|        if (node->outputs[k] == outputs[j + 1].d)
 616|   388|          node->outputs[k] = inputs[j + 1].d;
 617|   388|      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
 618|   388|        .graph = graph,
 619|   388|        .d = new_exec_idx,
 620|   388|      }, allreduce);
 621| 1.86k|      for (k = 0; k < node->outgoings->rnum;)
 622| 1.47k|      {
 623| 1.47k|        const int d = *(int*)ccv_array_get(node->outgoings, k);
 624| 1.47k|        if (d == allreduce.d)
 625|   388|        {
 626|   388|          ++k;
 627|   388|          continue;
 628|   388|        }
 629|      |        // Get the destination nodes, and check whether they have inputs matches our outputs.
 630| 1.09k|        ccv_nnc_graph_exec_symbol_info_t* const outgoing_node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
 631| 1.09k|        if (_ccv_nnc_exec_inputs_contain(outgoing_node, outputs[j + 1].d))
 632|   388|        {
 633|   388|          ccv_nnc_graph_exec_symbol_concat(graph, allreduce, (ccv_nnc_graph_exec_symbol_t){
 634|   388|            .graph = graph,
 635|   388|            .d = d,
 636|   388|          });
 637|      |          // Remove the connection.
 638|   388|          if (k < node->outgoings->rnum - 1)
 639|   388|            *(int*)ccv_array_get(node->outgoings, k) = *(int*)ccv_array_get(node->outgoings, node->outgoings->rnum - 1);
 640|   388|          --node->outgoings->rnum;
 641|   388|        } else
 642|   702|          ++k;
 643| 1.09k|      }
 644|   388|    }
 645|   132|  }
 646|    17|  ccfree(allreduce_producers);
 647|      |  // Write allreducer_outs last, because it may be the same pointer as allreducers.
 648|    17|  if (allreducer_outs)
 649|   136|    for (i = 0; i < allreducer_size; i++)
 650|   128|    {
 651|   128|      if (allreduce_inputs[i] != CCV_NNC_NO_TENSOR_SYMBOL)
 652|   128|      {
 653|   128|        allreducer_outs[i].d = allreduce_inputs[i];
 654|   128|        allreducer_outs[i].graph = graph;
 655|   128|      } else {
 656|     0|        allreducer_outs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
 657|     0|        allreducer_outs[i].graph = 0;
 658|     0|      }
 659|   128|    }
 660|    17|  ccfree(allreduce_inputs);
 661|    17|}
 662|      |
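A minimal usage sketch for ccv_nnc_symbolic_graph_data_parallel() above, assuming a symbolic graph `graph` already built on GPU device 0. The names `gradients`, `gradient_size`, `sources`, `source_size`, `destinations` and `destination_size` are placeholders, not symbols from this file; only the argument order follows the signature shown at line 24.

  /* Duplicate the graph onto 2 devices and allreduce the gradient symbols. */
  ccv_nnc_tensor_symbol_t allreduce_outs[gradient_size];
  ccv_nnc_symbolic_graph_data_parallel(graph, 2 /* parallel */,
    0, 0,                      /* no broadcasts */
    gradients, gradient_size,  /* allreducers */
    allreduce_outs,            /* allreducer_outs: receives the summed symbols */
    0, 0, 0,                   /* no reducers / reducer_outs */
    CCV_NNC_PARALLEL_REDUCE_OP_SUM,
    sources, source_size, destinations, destination_size);
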
 663|      |ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_copy(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol, const int device_id)
 664| 5.42k|{
 665| 5.42k|  if (!graph->data_parallel.tensor_symbol_idx)
 666|     0|    return NO_TENSOR_SYMBOL;
 667| 5.42k|  assert(graph->data_parallel.tensor_symbol_idx);
 668| 5.42k|  assert(symbol.d >= 0);
 669| 5.42k|  assert(symbol.d < graph->data_parallel.tensor_symbol_size);
 670| 5.42k|  assert(symbol.graph == graph);
 671| 5.42k|  if (device_id == 0)
 672|     0|    return symbol;
 673| 5.42k|  const int parallel_count = graph->data_parallel.count;
 674| 5.42k|  if (graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] < 0)
 675|     0|    return NO_TENSOR_SYMBOL;
 676| 5.42k|  ccv_nnc_tensor_symbol_t tensor = {
 677| 5.42k|    .d = graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1],
 678| 5.42k|    .graph = graph,
 679| 5.42k|  };
 680| 5.42k|  return tensor;
 681| 5.42k|}
 682|      |
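A lookup sketch for the accessor above; `w` stands for any tensor symbol of the original (device 0) graph and is a placeholder name:

  /* The copy that ccv_nnc_symbolic_graph_data_parallel() created on device 1,
   * or NO_TENSOR_SYMBOL if the symbol was never duplicated. */
  const ccv_nnc_tensor_symbol_t w_on_dev1 = ccv_nnc_tensor_symbol_copy(graph, w, 1);
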
 683|      |void ccv_nnc_tensor_symbol_set_copy(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol, const int device_id, const ccv_nnc_tensor_symbol_t copy)
 684|   672|{
 685|   672|  assert(graph->data_parallel.tensor_symbol_idx);
 686|   672|  assert(symbol.d >= 0);
 687|   672|  assert(symbol.d < graph->tensor_symbol_info->rnum);
 688|   672|  assert(symbol.graph == graph);
 689|   672|  const int parallel_count = graph->data_parallel.count;
 690|   672|  if (copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
 691|     0|  {
 692|     0|    assert(symbol.d < graph->data_parallel.tensor_symbol_size);
 693|     0|    graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = -1;
 694|     0|    return;
 695|     0|  }
 696|   672|  assert(copy.d >= 0);
 697|   672|  assert(copy.d < graph->tensor_symbol_info->rnum);
 698|   672|  assert(copy.graph == graph);
 699|   672|  assert(parallel_count > 1);
 700|   672|  if (symbol.d >= graph->data_parallel.tensor_symbol_size)
 701|   224|  {
 702|   224|    graph->data_parallel.tensor_symbol_idx = ccrealloc(graph->data_parallel.tensor_symbol_idx, sizeof(int) * (parallel_count - 1) * (symbol.d + 1));
 703|   224|    int i;
 704| 8.40k|    for (i = graph->data_parallel.tensor_symbol_size * (parallel_count - 1); i < (symbol.d + 1) * (parallel_count - 1); i++)
 705| 8.17k|      graph->data_parallel.tensor_symbol_idx[i] = -1;
 706|   224|    graph->data_parallel.tensor_symbol_size = symbol.d + 1;
 707|   224|  }
 708|   672|  graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = copy.d;
 709|   672|}
 710|      |
 711|      |ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_copy(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t symbol, const int device_id)
 712| 30.7k|{
 713| 30.7k|  if (!graph->data_parallel.exec_symbol_idx)
 714|   102|    return NO_GRAPH_EXEC_SYMBOL;
 715| 30.7k|  assert(graph->data_parallel.exec_symbol_idx);
 716| 30.6k|  assert(symbol.d >= 0);
 717| 30.6k|  assert(symbol.d < graph->data_parallel.exec_symbol_size);
 718| 30.6k|  assert(symbol.graph == graph);
 719| 30.6k|  if (device_id == 0)
 720|     0|    return symbol;
 721| 30.6k|  const int parallel_count = graph->data_parallel.count;
 722| 30.6k|  if (graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] < 0)
 723|     0|    return NO_GRAPH_EXEC_SYMBOL;
 724| 30.6k|  ccv_nnc_graph_exec_symbol_t graph_exec = {
 725| 30.6k|    .d = graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1],
 726| 30.6k|    .graph = graph,
 727| 30.6k|  };
 728| 30.6k|  return graph_exec;
 729| 30.6k|}
 730|      |
 731|      |void ccv_nnc_graph_exec_symbol_set_copy(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t symbol, const int device_id, const ccv_nnc_graph_exec_symbol_t copy)
 732|     0|{
 733|     0|  assert(graph->data_parallel.exec_symbol_idx);
 734|     0|  assert(symbol.d >= 0);
 735|     0|  assert(symbol.d < graph->exec_symbol_info->rnum);
 736|     0|  assert(symbol.graph == graph);
 737|     0|  const int parallel_count = graph->data_parallel.count;
 738|     0|  if (copy.d == CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
 739|     0|  {
 740|     0|    assert(symbol.d < graph->data_parallel.exec_symbol_size);
 741|     0|    graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = -1;
 742|     0|    return;
 743|     0|  }
 744|     0|  assert(copy.d >= 0);
 745|     0|  assert(copy.d < graph->exec_symbol_info->rnum);
 746|     0|  assert(copy.graph == graph);
 747|     0|  assert(parallel_count > 1);
 748|     0|  if (symbol.d >= graph->data_parallel.exec_symbol_size)
 749|     0|  {
 750|     0|    graph->data_parallel.exec_symbol_idx = ccrealloc(graph->data_parallel.exec_symbol_idx, sizeof(int) * (parallel_count - 1) * (symbol.d + 1));
 751|     0|    int i;
 752|     0|    for (i = graph->data_parallel.exec_symbol_size * (parallel_count - 1); i < (symbol.d + 1) * (parallel_count - 1); i++)
 753|     0|      graph->data_parallel.exec_symbol_idx[i] = -1;
 754|     0|    graph->data_parallel.exec_symbol_size = symbol.d + 1;
 755|     0|  }
 756|     0|  graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = copy.d;
 757|     0|}
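The two _set_copy functions above register a per-device duplicate by hand, which is what a graph transformation that creates its own device copies would use. A sketch with placeholder symbols `w` (on device 0) and `w1` (its device-1 duplicate):

  /* Record w1 as the device-1 copy of w, so that a later
   * ccv_nnc_tensor_symbol_copy(graph, w, 1) returns w1. Passing a copy with
   * .d == CCV_NNC_NO_TENSOR_SYMBOL clears the registration again. */
  ccv_nnc_tensor_symbol_set_copy(graph, w, 1, w1);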