/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/nccl.tests.c
Line | Count | Source |
1 | | #include "case.h" |
2 | | #include "ccv_case.h" |
3 | | #include "ccv_nnc_case.h" |
4 | | #include <ccv.h> |
5 | | #include <nnc/ccv_nnc.h> |
6 | | #include <nnc/ccv_nnc_easy.h> |
7 | | #include <3rdparty/dsfmt/dSFMT.h> |
8 | | |
9 | | TEST_SETUP() |
10 | | { |
11 | | ccv_nnc_init(); |
12 | | } |
13 | | |
14 | | TEST_CASE("nccl with allreduce in blocking mode") |
15 | 1 | { |
16 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
17 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_ALLREDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1); |
18 | 1 | ccv_nnc_tensor_t* tensors[device_count]; |
19 | 1 | int i; |
20 | 5 | for (i = 0; i < device_count; i++) |
21 | 4 | { |
22 | 4 | ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100); |
23 | 4 | CCV_TENSOR_SET_DEVICE_ID(info.type, i); |
24 | 4 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); |
25 | 4 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(i), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0); |
26 | 4 | } |
27 | 1 | ccv_nnc_cmd_exec(CMD_COMM_ALLREDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, device_count, 0); |
28 | 1 | ccv_nnc_tensor_t* cpu_tensors[device_count]; |
29 | 5 | for (i = 0; i < device_count; i++) |
30 | 4 | { |
31 | 4 | cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
32 | 4 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, 0); |
33 | 4 | } |
34 | 1 | ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
35 | 1 | ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count - 1) * device_count / 2), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0); |
36 | 5 | for (i = 0; i < device_count; i++) |
37 | 4 | REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be summed"); |
38 | 1 | ccv_nnc_tensor_free(demo_tensor); |
39 | 5 | for (i = 0; i < device_count; i++) |
40 | 4 | { |
41 | 4 | ccv_nnc_tensor_free(tensors[i]); |
42 | 4 | ccv_nnc_tensor_free(cpu_tensors[i]); |
43 | 4 | } |
44 | 1 | } |
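| | /* Expected result: device i is filled with the constant i, so the allreduce leaves every |
| |  * device holding 0 + 1 + ... + (device_count - 1) = device_count * (device_count - 1) / 2, |
| |  * the same value CMD_SET_FORWARD writes into demo_tensor. The per-iteration counts of 4 |
| |  * above suggest this run saw device_count == 4, i.e. an expected value of 6. */ |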
45 | | |
46 | | TEST_CASE("nccl with broadcast in blocking mode") |
47 | 1 | { |
48 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
49 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_BROADCAST_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1); |
50 | 1 | ccv_nnc_tensor_t* tensors[device_count]; |
51 | 1 | int i; |
52 | 5 | for (i = 0; i < device_count; i++) |
53 | 4 | { |
54 | 4 | ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100); |
55 | 4 | CCV_TENSOR_SET_DEVICE_ID(info.type, i); |
56 | 4 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); |
57 | 4 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0); |
58 | 4 | } |
59 | 1 | ccv_nnc_cmd_exec(CMD_COMM_BROADCAST_FORWARD(), ccv_nnc_no_hint, 0, tensors, 1, tensors, device_count, 0); |
60 | 1 | ccv_nnc_tensor_t* cpu_tensors[device_count]; |
61 | 5 | for (i = 0; i < device_count; i++) |
62 | 4 | { |
63 | 4 | cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
64 | 4 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, 0); |
65 | 4 | } |
66 | 1 | ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
67 | 1 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0); |
68 | 5 | for (i = 0; i < device_count; i++) |
69 | 4 | REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be broadcasted"); |
70 | 1 | ccv_nnc_tensor_free(demo_tensor); |
71 | 5 | for (i = 0; i < device_count; i++) |
72 | 4 | { |
73 | 4 | ccv_nnc_tensor_free(tensors[i]); |
74 | 4 | ccv_nnc_tensor_free(cpu_tensors[i]); |
75 | 4 | } |
76 | 1 | } |
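| | /* Expected result: tensors[0] holds 1 and is the single input to the broadcast, so every |
| |  * device's tensor should come back equal to demo_tensor, which is filled with 1. */ |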
77 | | |
78 | | TEST_CASE("nccl with reduce in blocking mode") |
79 | 1 | { |
80 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
81 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_REDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1); |
82 | 1 | ccv_nnc_tensor_t* tensors[device_count]; |
83 | 1 | int i; |
84 | 5 | for (i = 0; i < device_count; i++) |
85 | 4 | { |
86 | 4 | ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100); |
87 | 4 | CCV_TENSOR_SET_DEVICE_ID(info.type, i); |
88 | 4 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); |
89 | 4 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0); |
90 | 4 | } |
91 | 1 | ccv_nnc_cmd_exec(CMD_COMM_REDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, 1, 0); |
92 | 1 | ccv_nnc_tensor_t* cpu_tensor; |
93 | 1 | cpu_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
94 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[0], 1, &cpu_tensor, 1, 0); |
95 | 1 | ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
96 | 1 | ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count + 1) * device_count / 2), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0); |
97 | 1 | REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensor, "all values should be summed"); |
98 | 1 | ccv_nnc_tensor_free(demo_tensor); |
99 | 1 | ccv_nnc_tensor_free(cpu_tensor); |
100 | 5 | for (i = 0; i < device_count; i++) |
101 | 4 | ccv_nnc_tensor_free(tensors[i]); |
102 | 1 | } |
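| | /* Expected result: device i holds i + 1 and the reduce writes its sum into the single |
| |  * output tensors[0], so device 0 should hold 1 + 2 + ... + device_count = |
| |  * device_count * (device_count + 1) / 2. */ |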
103 | | |
104 | | static ccv_nnc_stream_context_t* _neighbor_discovery(const int device_id, void* const contexts) |
105 | 12 | { |
106 | 12 | ccv_nnc_stream_context_t** stream_contexts = (ccv_nnc_stream_context_t**)contexts; |
107 | 12 | return stream_contexts[device_id]; |
108 | 12 | } |
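| | /* The neighbor-discovery callback maps a device id to that device's stream context. The |
| |  * non-blocking tests below register it on contexts[0] and pass only that context to |
| |  * ccv_nnc_cmd_exec, presumably so the NCCL backend can look up the peer streams it needs |
| |  * to schedule the collective. The count of 12 is consistent with one lookup per device |
| |  * across the three non-blocking tests on a 4-GPU runner. */ |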
109 | | |
110 | | TEST_CASE("nccl with allreduce in non-blocking mode") |
111 | 1 | { |
112 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
113 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_ALLREDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1); |
114 | 1 | ccv_nnc_tensor_t* tensors[device_count]; |
115 | 1 | ccv_nnc_stream_context_t* contexts[device_count]; |
116 | 1 | int i; |
117 | 5 | for (i = 0; i < device_count; i++) |
118 | 4 | { |
119 | 4 | ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100); |
120 | 4 | CCV_TENSOR_SET_DEVICE_ID(info.type, i); |
121 | 4 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); |
122 | 4 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 0.5), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0); |
123 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
124 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
125 | 4 | contexts[i] = ccv_nnc_stream_context_new(stream_type); |
126 | 4 | } |
127 | 1 | ccv_nnc_stream_context_set_neighbor_discovery(contexts[0], _neighbor_discovery, contexts); |
128 | 1 | ccv_nnc_cmd_exec(CMD_COMM_ALLREDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, device_count, contexts[0]); |
129 | 1 | ccv_nnc_tensor_t* cpu_tensors[device_count]; |
130 | 5 | for (i = 0; i < device_count; i++) |
131 | 4 | { |
132 | 4 | cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
133 | 4 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, contexts[i]); |
134 | 4 | } |
135 | 1 | ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
136 | 1 | ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count - 1) * device_count / 2 + 0.5 * device_count), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0); |
137 | 5 | for (i = 0; i < device_count; i++) |
138 | 4 | ccv_nnc_stream_context_wait(contexts[i]); |
139 | 5 | for (i = 0; i < device_count; i++) |
140 | 4 | REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be summed"); |
141 | 1 | ccv_nnc_tensor_free(demo_tensor); |
142 | 5 | for (i = 0; i < device_count; i++) |
143 | 4 | { |
144 | 4 | ccv_nnc_tensor_free(tensors[i]); |
145 | 4 | ccv_nnc_tensor_free(cpu_tensors[i]); |
146 | 4 | ccv_nnc_stream_context_free(contexts[i]); |
147 | 4 | } |
148 | 1 | } |
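| | /* Expected result: device i starts with i + 0.5, so the allreduce total is |
| |  * device_count * (device_count - 1) / 2 + 0.5 * device_count. Since the collective and the |
| |  * data transfers run on stream contexts, every context is waited on before the tensors |
| |  * are compared. */ |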
149 | | |
150 | | TEST_CASE("nccl with broadcast in non-blocking mode") |
151 | 1 | { |
152 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
153 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_BROADCAST_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1); |
154 | 1 | ccv_nnc_tensor_t* tensors[device_count]; |
155 | 1 | ccv_nnc_stream_context_t* contexts[device_count]; |
156 | 1 | int i; |
157 | 5 | for (i = 0; i < device_count; i++) |
158 | 4 | { |
159 | 4 | ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100); |
160 | 4 | CCV_TENSOR_SET_DEVICE_ID(info.type, i); |
161 | 4 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); |
162 | 4 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0); |
163 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
164 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
165 | 4 | contexts[i] = ccv_nnc_stream_context_new(stream_type); |
166 | 4 | } |
167 | 1 | ccv_nnc_stream_context_set_neighbor_discovery(contexts[0], _neighbor_discovery, contexts); |
168 | 1 | ccv_nnc_cmd_exec(CMD_COMM_BROADCAST_FORWARD(), ccv_nnc_no_hint, 0, tensors, 1, tensors, device_count, contexts[0]); |
169 | 1 | ccv_nnc_tensor_t* cpu_tensors[device_count]; |
170 | 5 | for (i = 0; i < device_count; i++) |
171 | 4 | { |
172 | 4 | cpu_tensors[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
173 | 4 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &cpu_tensors[i], 1, contexts[i]); |
174 | 4 | } |
175 | 1 | ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
176 | 1 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0); |
177 | 5 | for (i = 0; i < device_count; i++) |
178 | 4 | ccv_nnc_stream_context_wait(contexts[i]); |
179 | 5 | for (i = 0; i < device_count; i++) |
180 | 4 | REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensors[i], "all values should be broadcasted"); |
181 | 1 | ccv_nnc_tensor_free(demo_tensor); |
182 | 5 | for (i = 0; i < device_count; i++) |
183 | 4 | { |
184 | 4 | ccv_nnc_tensor_free(tensors[i]); |
185 | 4 | ccv_nnc_tensor_free(cpu_tensors[i]); |
186 | 4 | ccv_nnc_stream_context_free(contexts[i]); |
187 | 4 | } |
188 | 1 | } |
189 | | |
190 | | TEST_CASE("nccl with reduce in non-blocking mode") |
191 | 1 | { |
192 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
193 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_COMM_REDUCE_FORWARD, CCV_NNC_BACKEND_GPU_NCCL) && device_count > 1); |
194 | 1 | ccv_nnc_tensor_t* tensors[device_count]; |
195 | 1 | ccv_nnc_stream_context_t* contexts[device_count]; |
196 | 1 | int i; |
197 | 5 | for (i = 0; i < device_count; i++) |
198 | 4 | { |
199 | 4 | ccv_nnc_tensor_param_t info = GPU_TENSOR_NHWC(000, 32F, 100); |
200 | 4 | CCV_TENSOR_SET_DEVICE_ID(info.type, i); |
201 | 4 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); |
202 | 4 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(i + 1), ccv_nnc_no_hint, 0, 0, 0, &tensors[i], 1, 0); |
203 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
204 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
205 | 4 | contexts[i] = ccv_nnc_stream_context_new(stream_type); |
206 | 4 | } |
207 | 1 | ccv_nnc_stream_context_set_neighbor_discovery(contexts[0], _neighbor_discovery, contexts); |
208 | 1 | ccv_nnc_cmd_exec(CMD_COMM_REDUCE_FORWARD(), ccv_nnc_no_hint, 0, tensors, device_count, tensors, 1, contexts[0]); |
209 | 1 | ccv_nnc_tensor_t* cpu_tensor; |
210 | 1 | cpu_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
211 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[0], 1, &cpu_tensor, 1, contexts[0]); |
212 | 1 | ccv_nnc_tensor_t* demo_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0); |
213 | 1 | ccv_nnc_cmd_exec(CMD_SET_FORWARD((device_count + 1) * device_count / 2), ccv_nnc_no_hint, 0, 0, 0, &demo_tensor, 1, 0); |
214 | 1 | ccv_nnc_stream_context_wait(contexts[0]); |
215 | 1 | REQUIRE_TENSOR_EQ(demo_tensor, cpu_tensor, "all values should be summed"); |
216 | 1 | ccv_nnc_tensor_free(demo_tensor); |
217 | 1 | ccv_nnc_tensor_free(cpu_tensor); |
218 | 5 | for (i = 0; i < device_count; i++) |
219 | 4 | { |
220 | 4 | ccv_nnc_tensor_free(tensors[i]); |
221 | 4 | ccv_nnc_stream_context_wait(contexts[i]); |
222 | 4 | ccv_nnc_stream_context_free(contexts[i]); |
223 | 4 | } |
224 | 1 | } |
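| | /* Only contexts[0] is waited on before the comparison: both the reduce and the transfer |
| |  * of tensors[0] are issued against that stream, so it covers the tensor being checked. |
| |  * The remaining contexts are drained in the cleanup loop before they are freed. */ |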
225 | | |
226 | | #include "case_main.h" |