Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/nccl.tests.c
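Each row below lists the source line number, the execution count recorded for that line (blank where no count was recorded), and the source text. The per-device loops record a body count of 4 against a header count of 5, which suggests this run had 4 GPUs visible; the GUARD_ELSE_RETURN checks skip each test when the NCCL backend is unavailable or fewer than two devices are present.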
Line  Count  Source
   1         #include "case.h"
   2         #include "ccv_case.h"
   3         #include "ccv_nnc_case.h"
   4         #include <ccv.h>
   5         #include <nnc/ccv_nnc.h>
   6         #include <nnc/ccv_nnc_easy.h>
   7         #include <3rdparty/dsfmt/dSFMT.h>
   8
   9         TEST_SETUP()
  10         {
  11           ccv_nnc_init();
  12         }
  13
  14         TEST_CASE("nccl with allreduce in blocking mode")
  15      1  {
  16      1    const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
  17      1    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_ALLREDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1);
  18      1    ccv_nnc_tensor_t* tensors[device_count];
  19      1    int i;
  20      5    for (i = 0; i < device_count; i++)
  21      4    {
  22      4      ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100);
  23      4      CCV_TENSOR_SET_DEVICE_ID(info.type, i);
  24      4      tensors[i] = ccv_nnc_tensor_new(0, info, 0);
  25      4      ccv_nnc_cmd_exec(CMD_SET_FORWARD(i), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0);
  26      4    }
  27      1    ccv_nnc_cmd_exec(CMD_COMM_ALLREDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, device_count, 0);
  28      1    ccv_nnc_tensor_t* cpu_tensors[device_count];
  29      5    for (i = 0; i < device_count; i++)
  30      4    {
  31      4      cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
  32      4      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, 0);
  33      4    }
  34      1    ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
  35      1    ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count - 1) * device_count / 2), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0);
  36      5    for (i = 0; i < device_count; i++)
  37      4      REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be summed");
  38      1    ccv_nnc_tensor_free(demo_tensor);
  39      5    for (i = 0; i < device_count; i++)
  40      4    {
  41      4      ccv_nnc_tensor_free(tensors[i]);
  42      4      ccv_nnc_tensor_free(cpu_tensors[i]);
  43      4    }
  44      1  }
  45
  46         TEST_CASE("nccl with broadcast in blocking mode")
  47      1  {
  48      1    const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
  49      1    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_BROADCAST_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1);
  50      1    ccv_nnc_tensor_t* tensors[device_count];
  51      1    int i;
  52      5    for (i = 0; i < device_count; i++)
  53      4    {
  54      4      ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100);
  55      4      CCV_TENSOR_SET_DEVICE_ID(info.type, i);
  56      4      tensors[i] = ccv_nnc_tensor_new(0, info, 0);
  57      4      ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0);
  58      4    }
  59      1    ccv_nnc_cmd_exec(CMD_COMM_BROADCAST_FORWARD(), ccv_nnc_no_hint, 0, tensors, 1, tensors, device_count, 0);
  60      1    ccv_nnc_tensor_t* cpu_tensors[device_count];
  61      5    for (i = 0; i < device_count; i++)
  62      4    {
  63      4      cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
  64      4      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, 0);
  65      4    }
  66      1    ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
  67      1    ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0);
  68      5    for (i = 0; i < device_count; i++)
  69      4      REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be broadcast from the first tensor");
  70      1    ccv_nnc_tensor_free(demo_tensor);
  71      5    for (i = 0; i < device_count; i++)
  72      4    {
  73      4      ccv_nnc_tensor_free(tensors[i]);
  74      4      ccv_nnc_tensor_free(cpu_tensors[i]);
  75      4    }
  76      1  }
  77
  78         TEST_CASE("nccl with reduce in blocking mode")
  79      1  {
  80      1    const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
  81      1    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_REDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1);
  82      1    ccv_nnc_tensor_t* tensors[device_count];
  83      1    int i;
  84      5    for (i = 0; i < device_count; i++)
  85      4    {
  86      4      ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100);
  87      4      CCV_TENSOR_SET_DEVICE_ID(info.type, i);
  88      4      tensors[i] = ccv_nnc_tensor_new(0, info, 0);
  89      4      ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0);
  90      4    }
  91      1    ccv_nnc_cmd_exec(CMD_COMM_REDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, 1, 0);
  92      1    ccv_nnc_tensor_t* cpu_tensor;
  93      1    cpu_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
  94      1    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[0], 1, &cpu_tensor, 1, 0);
  95      1    ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
  96      1    ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count + 1) * device_count / 2), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0);
  97      1    REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensor, "all values should be summed");
  98      1    ccv_nnc_tensor_free(demo_tensor);
  99      1    ccv_nnc_tensor_free(cpu_tensor);
 100      5    for (i = 0; i < device_count; i++)
 101      4      ccv_nnc_tensor_free(tensors[i]);
 102      1  }
 103
 104         static ccv_nnc_stream_context_t* _neighbor_discovery(const int device_id, void* const contexts)
 105     12  {
 106     12    ccv_nnc_stream_context_t** stream_contexts = (ccv_nnc_stream_context_t**)contexts;
 107     12    return stream_contexts[device_id];
 108     12  }
 109
 110         TEST_CASE("nccl with allreduce in non-blocking mode")
 111      1  {
 112      1    const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 113      1    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_ALLREDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1);
 114      1    ccv_nnc_tensor_t* tensors[device_count];
 115      1    ccv_nnc_stream_context_t* contexts[device_count];
 116      1    int i;
 117      5    for (i = 0; i < device_count; i++)
 118      4    {
 119      4      ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100);
 120      4      CCV_TENSOR_SET_DEVICE_ID(info.type, i);
 121      4      tensors[i] = ccv_nnc_tensor_new(0, info, 0);
 122      4      ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 0.5), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0);
 123      4      int stream_type = CCV_STREAM_CONTEXT_GPU;
 124      4      CCV_STREAM_SET_DEVICE_ID(stream_type, i);
 125      4      contexts[i] = ccv_nnc_stream_context_new(stream_type);
 126      4    }
 127      1    ccv_nnc_stream_context_set_neighbor_discovery(contexts[0], _neighbor_discovery, contexts);
 128      1    ccv_nnc_cmd_exec(CMD_COMM_ALLREDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, device_count, contexts[0]);
 129      1    ccv_nnc_tensor_t* cpu_tensors[device_count];
 130      5    for (i = 0; i < device_count; i++)
 131      4    {
 132      4      cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
 133      4      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, contexts[i]);
 134      4    }
 135      1    ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
 136      1    ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count - 1) * device_count / 2 + 0.5 * device_count), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0);
 137      5    for (i = 0; i < device_count; i++)
 138      4      ccv_nnc_stream_context_wait(contexts[i]);
 139      5    for (i = 0; i < device_count; i++)
 140      4      REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be summed");
 141      1    ccv_nnc_tensor_free(demo_tensor);
 142      5    for (i = 0; i < device_count; i++)
 143      4    {
 144      4      ccv_nnc_tensor_free(tensors[i]);
 145      4      ccv_nnc_tensor_free(cpu_tensors[i]);
 146      4      ccv_nnc_stream_context_free(contexts[i]);
 147      4    }
 148      1  }
 149
 150         TEST_CASE("nccl with broadcast in non-blocking mode")
 151      1  {
 152      1    const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 153      1    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_BROADCAST_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1);
 154      1    ccv_nnc_tensor_t* tensors[device_count];
 155      1    ccv_nnc_stream_context_t* contexts[device_count];
 156      1    int i;
 157      5    for (i = 0; i < device_count; i++)
 158      4    {
 159      4      ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100);
 160      4      CCV_TENSOR_SET_DEVICE_ID(info.type, i);
 161      4      tensors[i] = ccv_nnc_tensor_new(0, info, 0);
 162      4      ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0);
 163      4      int stream_type = CCV_STREAM_CONTEXT_GPU;
 164      4      CCV_STREAM_SET_DEVICE_ID(stream_type, i);
 165      4      contexts[i] = ccv_nnc_stream_context_new(stream_type);
 166      4    }
 167      1    ccv_nnc_stream_context_set_neighbor_discovery(contexts[0], _neighbor_discovery, contexts);
 168      1    ccv_nnc_cmd_exec(CMD_COMM_BROADCAST_FORWARD(), ccv_nnc_no_hint, 0, tensors, 1, tensors, device_count, contexts[0]);
 169      1    ccv_nnc_tensor_t* cpu_tensors[device_count];
 170      5    for (i = 0; i < device_count; i++)
 171      4    {
 172      4      cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
 173      4      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, contexts[i]);
 174      4    }
 175      1    ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
 176      1    ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0);
 177      5    for (i = 0; i < device_count; i++)
 178      4      ccv_nnc_stream_context_wait(contexts[i]);
 179      5    for (i = 0; i < device_count; i++)
 180      4      REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be broadcast from the first tensor");
 181      1    ccv_nnc_tensor_free(demo_tensor);
 182      5    for (i = 0; i < device_count; i++)
 183      4    {
 184      4      ccv_nnc_tensor_free(tensors[i]);
 185      4      ccv_nnc_tensor_free(cpu_tensors[i]);
 186      4      ccv_nnc_stream_context_free(contexts[i]);
 187      4    }
 188      1  }
 189
 190         TEST_CASE("nccl with reduce in non-blocking mode")
 191      1  {
 192      1    const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 193      1    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_REDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1);
 194      1    ccv_nnc_tensor_t* tensors[device_count];
 195      1    ccv_nnc_stream_context_t* contexts[device_count];
 196      1    int i;
 197      5    for (i = 0; i < device_count; i++)
 198      4    {
 199      4      ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100);
 200      4      CCV_TENSOR_SET_DEVICE_ID(info.type, i);
 201      4      tensors[i] = ccv_nnc_tensor_new(0, info, 0);
 202      4      ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0);
 203      4      int stream_type = CCV_STREAM_CONTEXT_GPU;
 204      4      CCV_STREAM_SET_DEVICE_ID(stream_type, i);
 205      4      contexts[i] = ccv_nnc_stream_context_new(stream_type);
 206      4    }
 207      1    ccv_nnc_stream_context_set_neighbor_discovery(contexts[0], _neighbor_discovery, contexts);
 208      1    ccv_nnc_cmd_exec(CMD_COMM_REDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, 1, contexts[0]);
 209      1    ccv_nnc_tensor_t* cpu_tensor;
 210      1    cpu_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
 211      1    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[0], 1, &cpu_tensor, 1, contexts[0]);
 212      1    ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
 213      1    ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count + 1) * device_count / 2), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0);
 214      1    ccv_nnc_stream_context_wait(contexts[0]);
 215      1    REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensor, "all values should be summed");
 216      1    ccv_nnc_tensor_free(demo_tensor);
 217      1    ccv_nnc_tensor_free(cpu_tensor);
 218      5    for (i = 0; i < device_count; i++)
 219      4    {
 220      4      ccv_nnc_tensor_free(tensors[i]);
 221      4      ccv_nnc_stream_context_wait(contexts[i]);
 222      4      ccv_nnc_stream_context_free(contexts[i]);
 223      4    }
 224      1  }
 225
 226         #include "case_main.h"