/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/schedule.tests.c
Line | Count | Source |
1 | | #include "case.h" |
2 | | #include "ccv_case.h" |
3 | | #include "ccv_nnc_case.h" |
4 | | #include <ccv.h> |
5 | | #include <nnc/ccv_nnc.h> |
6 | | #include <nnc/ccv_nnc_easy.h> |
7 | | #include <3rdparty/dsfmt/dSFMT.h> |
8 | | |
9 | | TEST_SETUP() |
10 | | { |
11 | | ccv_nnc_init(); |
12 | | } |
13 | | |
14 | | TEST_CASE("schedule GPU work on one stream") |
15 | 1 | { |
16 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS)); |
17 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
18 | 1 | ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a"); |
19 | 1 | ccv_nnc_tensor_symbol_t const w = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w"); |
20 | 1 | ccv_nnc_tensor_symbol_t const bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias"); |
21 | 1 | ccv_nnc_tensor_symbol_t const b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b"); |
22 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w, bias), TENSOR_SYMBOL_LIST(b), "mul"); |
23 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
24 | 1 | ccv_nnc_graph_t* graph; |
25 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
26 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
27 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
28 | 1 | 0, 0, |
29 | 1 | TENSOR_SYMBOL_LIST(b), |
30 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
31 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
32 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
33 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
34 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
35 | 1 | ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
36 | 1 | ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
37 | 1 | ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
38 | 1 | ccv_nnc_tensor_pin_memory(ha); |
39 | 1 | ccv_nnc_tensor_pin_memory(hw); |
40 | 1 | ccv_nnc_tensor_pin_memory(hbias); |
41 | 1 | ha->data.f32[0] = 1.4; |
42 | 1 | ha->data.f32[1] = 0.2; |
43 | 1 | hw->data.f32[0] = 2; |
44 | 1 | hw->data.f32[1] = 11; |
45 | 1 | hbias->data.f32[0] = 0; |
46 | 1 | ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a); |
47 | 1 | ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w); |
48 | 1 | ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias); |
49 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a_tensor, w_tensor, bias_tensor), 0); |
50 | 1 | ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
51 | 1 | ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context); |
52 | 1 | ccv_nnc_stream_context_wait(stream_context); |
53 | 1 | ccv_nnc_stream_context_free(stream_context); |
54 | 1 | ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
55 | 1 | ccv_nnc_tensor_pin_memory(hb); |
56 | 1 | ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b); |
57 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(hb), 0); |
58 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hb->data.f32[0], 1.4 * 2 + 0.2 * 11, 1e-5, "should match simple algebra"); |
59 | 1 | ccv_nnc_tensor_free(ha); |
60 | 1 | ccv_nnc_tensor_free(hw); |
61 | 1 | ccv_nnc_tensor_free(hbias); |
62 | 1 | ccv_nnc_tensor_free(hb); |
63 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
64 | 1 | ccv_nnc_graph_free(graph); |
65 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
66 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
67 | 1 | } |
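A quick cross-check of the asserted value, not part of the test source: b = 1.4 * 2 + 0.2 * 11 + 0 = 3.0. If one wanted to verify the GPU result against a CPU run, a minimal sketch (hypothetical, assuming a CPU GEMM reference backend is registered) could reuse the already-pinned host tensors:

    ccv_nnc_tensor_t* const hb_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
    // Same GEMM command as the graph node, dispatched on the CPU tensors.
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb_ref), 0);
    REQUIRE_EQ_WITH_TOLERANCE(hb_ref->data.f32[0], hb->data.f32[0], 1e-5, "CPU and GPU GEMM should agree");
    ccv_nnc_tensor_free(hb_ref);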
68 | | |
69 | | TEST_CASE("schedule GPU work on multiple streams") |
70 | 1 | { |
71 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS)); |
72 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
73 | 1 | ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a"); |
74 | 1 | ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w1"); |
75 | 1 | ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias1"); |
76 | 1 | ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b1"); |
77 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1"); |
78 | 1 | ccv_nnc_tensor_symbol_t const w2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w2"); |
79 | 1 | ccv_nnc_tensor_symbol_t const bias2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias2"); |
80 | 1 | ccv_nnc_tensor_symbol_t const b2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b2"); |
81 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w2, bias2), TENSOR_SYMBOL_LIST(b2), "mul2"); |
82 | 1 | ccv_nnc_tensor_symbol_t const w3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w3"); |
83 | 1 | ccv_nnc_tensor_symbol_t const bias3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias3"); |
84 | 1 | ccv_nnc_tensor_symbol_t const b3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b3"); |
85 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w3, bias3), TENSOR_SYMBOL_LIST(b3), "mul3"); |
86 | 1 | ccv_nnc_tensor_symbol_t const biasc = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasc"); |
87 | 1 | ccv_nnc_tensor_symbol_t const c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c"); |
88 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b1, b2, biasc), TENSOR_SYMBOL_LIST(c), "mulc"); |
89 | 1 | ccv_nnc_tensor_symbol_t const biasd = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasd"); |
90 | 1 | ccv_nnc_tensor_symbol_t const d = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "d"); |
91 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(c, b3, biasd), TENSOR_SYMBOL_LIST(d), "muld"); |
92 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
93 | 1 | ccv_nnc_graph_t* graph; |
94 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
95 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
96 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
97 | 1 | 0, 0, |
98 | 1 | TENSOR_SYMBOL_LIST(d), |
99 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
100 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
101 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
102 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
103 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
104 | 1 | ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
105 | 1 | ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
106 | 1 | ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
107 | 1 | ccv_nnc_tensor_pin_memory(ha); |
108 | 1 | ccv_nnc_tensor_pin_memory(hw1); |
109 | 1 | ccv_nnc_tensor_pin_memory(hbias1); |
110 | 1 | ha->data.f32[0] = 1.4; |
111 | 1 | ha->data.f32[1] = 0.2; |
112 | 1 | hw1->data.f32[0] = 2; |
113 | 1 | hw1->data.f32[1] = 11; |
114 | 1 | hbias1->data.f32[0] = 0; |
115 | 1 | ccv_nnc_tensor_t* const hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
116 | 1 | ccv_nnc_tensor_t* const hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
117 | 1 | ccv_nnc_tensor_pin_memory(hw2); |
118 | 1 | ccv_nnc_tensor_pin_memory(hbias2); |
119 | 1 | hw2->data.f32[0] = 1; |
120 | 1 | hw2->data.f32[1] = 2.2; |
121 | 1 | hbias2->data.f32[0] = 1; |
122 | 1 | ccv_nnc_tensor_t* const hw3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
123 | 1 | ccv_nnc_tensor_t* const hbias3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
124 | 1 | ccv_nnc_tensor_pin_memory(hw3); |
125 | 1 | ccv_nnc_tensor_pin_memory(hbias3); |
126 | 1 | hw3->data.f32[0] = 0.5; |
127 | 1 | hw3->data.f32[1] = 1.5; |
128 | 1 | hbias3->data.f32[0] = 0.5; |
129 | 1 | ccv_nnc_tensor_t* const hbiasc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
130 | 1 | ccv_nnc_tensor_pin_memory(hbiasc); |
131 | 1 | hbiasc->data.f32[0] = 0.2; |
132 | 1 | ccv_nnc_tensor_t* const hbiasd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
133 | 1 | ccv_nnc_tensor_pin_memory(hbiasd); |
134 | 1 | hbiasd->data.f32[0] = 0.3; |
135 | 1 | ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a); |
136 | 1 | ccv_nnc_tensor_t* const w1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w1); |
137 | 1 | ccv_nnc_tensor_t* const bias1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias1); |
138 | 1 | ccv_nnc_tensor_t* const w2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w2); |
139 | 1 | ccv_nnc_tensor_t* const bias2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias2); |
140 | 1 | ccv_nnc_tensor_t* const w3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w3); |
141 | 1 | ccv_nnc_tensor_t* const bias3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias3); |
142 | 1 | ccv_nnc_tensor_t* const biasc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasc); |
143 | 1 | ccv_nnc_tensor_t* const biasd_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasd); |
144 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw1, hbias1, hw2, hbias2, hw3, hbias3, hbiasc, hbiasd), TENSOR_LIST(a_tensor, w1_tensor, bias1_tensor, w2_tensor, bias2_tensor, w3_tensor, bias3_tensor, biasc_tensor, biasd_tensor), 0); |
145 | 1 | ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
146 | 1 | ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context); |
147 | 1 | ccv_nnc_stream_context_wait(stream_context); |
148 | 1 | ccv_nnc_stream_context_free(stream_context); |
149 | 1 | ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
150 | 1 | ccv_nnc_tensor_pin_memory(hd); |
151 | 1 | ccv_nnc_tensor_t* const d_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, d); |
152 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d_tensor), TENSOR_LIST(hd), 0); |
153 | 1 | const float b1v = 1.4 * 2 + 0.2 * 11; |
154 | 1 | const float b2v = 1.4 * 1 + 0.2 * 2.2 + 1; |
155 | 1 | const float b3v = 1.4 * 0.5 + 0.2 * 1.5 + 0.5; |
156 | 1 | const float cv = b1v * b2v + 0.2; |
157 | 1 | const float dv = cv * b3v + 0.3; |
158 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd->data.f32[0], dv, 1e-5, "should match simple algebra"); |
159 | 1 | ccv_nnc_tensor_free(ha); |
160 | 1 | ccv_nnc_tensor_free(hw1); |
161 | 1 | ccv_nnc_tensor_free(hbias1); |
162 | 1 | ccv_nnc_tensor_free(hw2); |
163 | 1 | ccv_nnc_tensor_free(hbias2); |
164 | 1 | ccv_nnc_tensor_free(hw3); |
165 | 1 | ccv_nnc_tensor_free(hbias3); |
166 | 1 | ccv_nnc_tensor_free(hbiasc); |
167 | 1 | ccv_nnc_tensor_free(hbiasd); |
168 | 1 | ccv_nnc_tensor_free(hd); |
169 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
170 | 1 | ccv_nnc_graph_free(graph); |
171 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
172 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
173 | 1 | } |
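For reference, mul1, mul2 and mul3 only share the input a, so the static schedule can place them on separate streams before mulc and muld join the results. The asserted value follows by simple arithmetic: b1v = 3.0, b2v = 2.84, b3v = 1.5, cv = 3.0 * 2.84 + 0.2 = 8.72, dv = 8.72 * 1.5 + 0.3 = 13.38.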
174 | | |
175 | | static int while_5(ccv_nnc_tensor_t* const* const inputs, const int input_size, const void* const data) |
176 | 12 | { |
177 | 12 | return inputs[0]->data.i64[0] < 5; |
178 | 12 | } |
179 | | |
180 | | TEST_CASE("schedule GPU work with while loop") |
181 | 1 | { |
182 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS)); |
183 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
184 | 1 | ccv_nnc_symbolic_graph_t* const while_graph = ccv_nnc_symbolic_graph_new(); |
185 | 1 | ccv_nnc_symbolic_graph_while(symbolic_graph, CCV_NNC_GRAPH_FORWARD, while_graph, "while 1..5"); |
186 | 1 | ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a"); |
187 | 1 | ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w1"); |
188 | 1 | ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2), "bias1"); |
189 | 1 | ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b1"); |
190 | 1 | ccv_nnc_graph_exec_symbol_t const noop = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_NOOP(), 0, 0, 0, 0, "noop"); |
191 | 1 | ccv_nnc_graph_exec_symbol_t const mul1 = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1"); |
192 | 1 | ccv_nnc_graph_exec_symbol_concat(while_graph, noop, mul1); |
193 | 1 | ccv_nnc_symbolic_graph_set_while_expr(while_graph, while_5, 0, TENSOR_SYMBOL_LIST(ccv_nnc_tensor_symbol_for_while_count(while_graph)), GRAPH_EXEC_SYMBOL_LIST(noop)); |
194 | 1 | ccv_nnc_symbolic_graph_set_carry_overs(while_graph, TENSOR_SYMBOL_MAP(KV(b1, a))); |
195 | 1 | ccv_nnc_symbolic_graph_set_sources(while_graph, GRAPH_EXEC_SYMBOL_LIST(noop)); |
196 | 1 | ccv_nnc_symbolic_graph_set_destinations(while_graph, GRAPH_EXEC_SYMBOL_LIST(mul1)); |
197 | 1 | ccv_nnc_tensor_symbol_t const w2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w2"); |
198 | 1 | ccv_nnc_tensor_symbol_t const bias2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias2"); |
199 | 1 | ccv_nnc_tensor_symbol_t const b2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b2"); |
200 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w2, bias2), TENSOR_SYMBOL_LIST(b2), "mul2"); |
201 | 1 | ccv_nnc_tensor_symbol_t const w3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w3"); |
202 | 1 | ccv_nnc_tensor_symbol_t const bias3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias3"); |
203 | 1 | ccv_nnc_tensor_symbol_t const b3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b3"); |
204 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b1, w3, bias3), TENSOR_SYMBOL_LIST(b3), "mul3"); |
205 | 1 | ccv_nnc_tensor_symbol_t const biasc = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasc"); |
206 | 1 | ccv_nnc_tensor_symbol_t const c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c"); |
207 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b2, b3, biasc), TENSOR_SYMBOL_LIST(c), "mulc"); |
208 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
209 | 1 | ccv_nnc_graph_t* graph; |
210 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
211 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
212 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
213 | 1 | 0, 0, |
214 | 1 | TENSOR_SYMBOL_LIST(c), |
215 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
216 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
217 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
218 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
219 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
220 | 1 | ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
221 | 1 | ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0); |
222 | 1 | ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0); |
223 | 1 | ccv_nnc_tensor_pin_memory(ha); |
224 | 1 | ccv_nnc_tensor_pin_memory(hw1); |
225 | 1 | ccv_nnc_tensor_pin_memory(hbias1); |
226 | 1 | ha->data.f32[0] = 1.4; |
227 | 1 | ha->data.f32[1] = 0.2; |
228 | 1 | hw1->data.f32[0] = 1.1; |
229 | 1 | hw1->data.f32[1] = 2.2; |
230 | 1 | hw1->data.f32[2] = 1; |
231 | 1 | hw1->data.f32[3] = 2; |
232 | 1 | hbias1->data.f32[0] = 0; |
233 | 1 | hbias1->data.f32[1] = 0; |
234 | 1 | ccv_nnc_tensor_t* const hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
235 | 1 | ccv_nnc_tensor_t* const hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
236 | 1 | ccv_nnc_tensor_pin_memory(hw2); |
237 | 1 | ccv_nnc_tensor_pin_memory(hbias2); |
238 | 1 | hw2->data.f32[0] = 0.6; |
239 | 1 | hw2->data.f32[1] = 3; |
240 | 1 | hbias2->data.f32[0] = 0.4; |
241 | 1 | ccv_nnc_tensor_t* const hw3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
242 | 1 | ccv_nnc_tensor_t* const hbias3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
243 | 1 | ccv_nnc_tensor_pin_memory(hw3); |
244 | 1 | ccv_nnc_tensor_pin_memory(hbias3); |
245 | 1 | hw3->data.f32[0] = 0.2; |
246 | 1 | hw3->data.f32[1] = 0.3; |
247 | 1 | hbias3->data.f32[0] = 1; |
248 | 1 | ccv_nnc_tensor_t* const hbiasc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
249 | 1 | ccv_nnc_tensor_pin_memory(hbiasc); |
250 | 1 | hbiasc->data.f32[0] = 0.5; |
251 | 1 | ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a); |
252 | 1 | ccv_nnc_tensor_t* const w1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w1); |
253 | 1 | ccv_nnc_tensor_t* const bias1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias1); |
254 | 1 | ccv_nnc_tensor_t* const w2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w2); |
255 | 1 | ccv_nnc_tensor_t* const bias2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias2); |
256 | 1 | ccv_nnc_tensor_t* const w3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w3); |
257 | 1 | ccv_nnc_tensor_t* const bias3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias3); |
258 | 1 | ccv_nnc_tensor_t* const biasc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasc); |
259 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw1, hbias1, hw2, hbias2, hw3, hbias3, hbiasc), TENSOR_LIST(a_tensor, w1_tensor, bias1_tensor, w2_tensor, bias2_tensor, w3_tensor, bias3_tensor, biasc_tensor), 0); |
260 | 1 | ccv_nnc_stream_context_t* const stream_context = ccv_nnc_graph_default_stream(graph); |
261 | 1 | ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context); |
262 | 1 | ccv_nnc_stream_context_wait(stream_context); |
263 | 1 | ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
264 | 1 | ccv_nnc_tensor_pin_memory(hc); |
265 | 1 | ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c); |
266 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(hc), 0); |
267 | 1 | float av0 = 1.4; |
268 | 1 | float av1 = 0.2; |
269 | 1 | int i; |
270 | 6 | for (i = 0; i < 5; i++)
271 | 5 | { |
272 | 5 | const float b0 = av0 * 1.1 + av1 * 2.2; |
273 | 5 | const float b1 = av0 * 1 + av1 * 2; |
274 | 5 | av0 = b0; |
275 | 5 | av1 = b1; |
276 | 5 | } |
277 | 1 | const float b2v = 1.4 * 0.6 + 0.2 * 3 + 0.4; |
278 | 1 | const float b3v = av0 * 0.2 + av1 * 0.3 + 1; |
279 | 1 | const float cv = b2v * b3v + 0.5; |
280 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hc->data.f32[0], cv, 1e-2, "should match simple algebra"); |
281 | 1 | ccv_nnc_tensor_free(ha); |
282 | 1 | ccv_nnc_tensor_free(hw1); |
283 | 1 | ccv_nnc_tensor_free(hbias1); |
284 | 1 | ccv_nnc_tensor_free(hw2); |
285 | 1 | ccv_nnc_tensor_free(hbias2); |
286 | 1 | ccv_nnc_tensor_free(hw3); |
287 | 1 | ccv_nnc_tensor_free(hbias3); |
288 | 1 | ccv_nnc_tensor_free(hbiasc); |
289 | 1 | ccv_nnc_tensor_free(hc); |
290 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
291 | 1 | ccv_nnc_graph_free(graph); |
292 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
293 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
294 | 1 | } |
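For reference, the while body computes b1 = a * w1^T and carries b1 back into a, which the host loop above replays for the five iterations allowed by while_5: av0 ≈ 182.86, av1 ≈ 166.23. With b2v = 1.84 and b3v ≈ 87.44, the expected c is ≈ 161.39; the values grow large, which is presumably why this test uses the looser 1e-2 tolerance.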
295 | | |
296 | | static int case_of_0(ccv_nnc_tensor_t* const *const inputs, const int input_size, const void* const data) |
297 | 2 | { |
298 | 2 | return 0; |
299 | 2 | } |
300 | | |
301 | | TEST_CASE("schedule GPU work with case..of") |
302 | 1 | { |
303 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS)); |
304 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
305 | 1 | ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a"); |
306 | 1 | ccv_nnc_tensor_symbol_t const b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b"); |
307 | 1 | ccv_nnc_graph_exec_symbol_t const case_of = ccv_nnc_symbolic_graph_case_of_new(symbolic_graph, CCV_NNC_GRAPH_FORWARD, TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_MAP(KV(a, b)), "case..of"); |
308 | 1 | ccv_nnc_symbolic_graph_set_case_of_expr(symbolic_graph, case_of, case_of_0, 0); |
309 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph_0 = ccv_nnc_symbolic_graph_new(); |
310 | 1 | ccv_nnc_tensor_symbol_t const b0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b0"); |
311 | 1 | ccv_nnc_symbolic_graph_set_case_of(symbolic_graph, case_of, symbolic_graph_0, 0, TENSOR_SYMBOL_MAP(KV(b0, b))); |
312 | 1 | ccv_nnc_tensor_symbol_t const w = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w"); |
313 | 1 | ccv_nnc_tensor_symbol_t const bias = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2), "bias"); |
314 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph_0, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w, bias), TENSOR_SYMBOL_LIST(b0), "mul"); |
315 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph_0, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
316 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
317 | 1 | ccv_nnc_graph_t* graph; |
318 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
319 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
320 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
321 | 1 | 0, 0, |
322 | 1 | TENSOR_SYMBOL_LIST(b), |
323 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
324 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
325 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
326 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
327 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
328 | 1 | ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
329 | 1 | ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0); |
330 | 1 | ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0); |
331 | 1 | ccv_nnc_tensor_pin_memory(ha); |
332 | 1 | ccv_nnc_tensor_pin_memory(hw); |
333 | 1 | ccv_nnc_tensor_pin_memory(hbias); |
334 | 1 | ha->data.f32[0] = 1.4; |
335 | 1 | ha->data.f32[1] = 0.2; |
336 | 1 | hw->data.f32[0] = 2; |
337 | 1 | hw->data.f32[1] = 11; |
338 | 1 | hw->data.f32[2] = 1; |
339 | 1 | hw->data.f32[3] = 2; |
340 | 1 | hbias->data.f32[0] = 0; |
341 | 1 | hbias->data.f32[1] = 0; |
342 | 1 | ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a); |
343 | 1 | ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w); |
344 | 1 | ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias); |
345 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a_tensor, w_tensor, bias_tensor), 0); |
346 | 1 | ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
347 | 1 | ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context); |
348 | 1 | ccv_nnc_stream_context_wait(stream_context); |
349 | 1 | ccv_nnc_stream_context_free(stream_context); |
350 | 1 | ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
351 | 1 | ccv_nnc_tensor_pin_memory(hb); |
352 | 1 | ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b); |
353 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(hb), 0); |
354 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hb->data.f32[0], 1.4 * 2 + 0.2 * 11, 1e-5, "should match simple algebra"); |
355 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hb->data.f32[1], 1.4 + 0.2 * 2, 1e-5, "should match simple algebra"); |
356 | 1 | ccv_nnc_graph_free(graph); |
357 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
358 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
359 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
360 | 1 | ccv_nnc_tensor_free(ha); |
361 | 1 | ccv_nnc_tensor_free(hw); |
362 | 1 | ccv_nnc_tensor_free(hbias); |
363 | 1 | ccv_nnc_tensor_free(hb); |
364 | 1 | } |
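For reference, case_of_0 always returns 0, so the only branch runs and b takes the value of b0 = a * w^T + bias: b[0] = 1.4 * 2 + 0.2 * 11 = 3.0 and b[1] = 1.4 * 1 + 0.2 * 2 = 1.8.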
365 | | |
366 | | TEST_CASE("schedule GPU work with both while loop and case..of") |
367 | 1 | { |
368 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS)); |
369 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
370 | 1 | ccv_nnc_symbolic_graph_t* const while_graph = ccv_nnc_symbolic_graph_new(); |
371 | 1 | ccv_nnc_symbolic_graph_while(symbolic_graph, CCV_NNC_GRAPH_FORWARD, while_graph, "while 1..5"); |
372 | 1 | ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a"); |
373 | 1 | ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w1"); |
374 | 1 | ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2), "bias1"); |
375 | 1 | ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b1"); |
376 | 1 | ccv_nnc_graph_exec_symbol_t const noop = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_NOOP(), 0, 0, 0, 0, "noop"); |
377 | 1 | ccv_nnc_graph_exec_symbol_t const mul1 = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1"); |
378 | 1 | ccv_nnc_graph_exec_symbol_concat(while_graph, noop, mul1); |
379 | 1 | ccv_nnc_symbolic_graph_set_while_expr(while_graph, while_5, 0, TENSOR_SYMBOL_LIST(ccv_nnc_tensor_symbol_for_while_count(while_graph)), GRAPH_EXEC_SYMBOL_LIST(noop)); |
380 | 1 | ccv_nnc_symbolic_graph_set_carry_overs(while_graph, TENSOR_SYMBOL_MAP(KV(b1, a))); |
381 | 1 | ccv_nnc_symbolic_graph_set_sources(while_graph, GRAPH_EXEC_SYMBOL_LIST(noop)); |
382 | 1 | ccv_nnc_symbolic_graph_set_destinations(while_graph, GRAPH_EXEC_SYMBOL_LIST(mul1)); |
383 | 1 | ccv_nnc_tensor_symbol_t const b2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b2"); |
384 | 1 | ccv_nnc_graph_exec_symbol_t const case_of = ccv_nnc_symbolic_graph_case_of_new(symbolic_graph, CCV_NNC_GRAPH_FORWARD, TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_MAP(KV(a, b2)), "case..of"); |
385 | 1 | ccv_nnc_symbolic_graph_set_case_of_expr(symbolic_graph, case_of, case_of_0, 0); |
386 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph_0 = ccv_nnc_symbolic_graph_new(); |
387 | 1 | ccv_nnc_tensor_symbol_t const b20 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b20"); |
388 | 1 | ccv_nnc_symbolic_graph_set_case_of(symbolic_graph, case_of, symbolic_graph_0, 0, TENSOR_SYMBOL_MAP(KV(b20, b2))); |
389 | 1 | ccv_nnc_tensor_symbol_t const w2 = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w2"); |
390 | 1 | ccv_nnc_tensor_symbol_t const bias2 = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2), "bias2"); |
391 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph_0, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w2, bias2), TENSOR_SYMBOL_LIST(b20), "mul2"); |
392 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph_0, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
393 | 1 | ccv_nnc_tensor_symbol_t const w3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w3"); |
394 | 1 | ccv_nnc_tensor_symbol_t const bias3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias3"); |
395 | 1 | ccv_nnc_tensor_symbol_t const b3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b3"); |
396 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b2, w3, bias3), TENSOR_SYMBOL_LIST(b3), "mul3"); |
397 | 1 | ccv_nnc_tensor_symbol_t const w4 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w4"); |
398 | 1 | ccv_nnc_tensor_symbol_t const bias4 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias4"); |
399 | 1 | ccv_nnc_tensor_symbol_t const b4 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b4"); |
400 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b1, w4, bias4), TENSOR_SYMBOL_LIST(b4), "mul4"); |
401 | 1 | ccv_nnc_tensor_symbol_t const biasc = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasc"); |
402 | 1 | ccv_nnc_tensor_symbol_t const c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c"); |
403 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b3, b4, biasc), TENSOR_SYMBOL_LIST(c), "mulc"); |
404 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
405 | 1 | ccv_nnc_graph_t* graph; |
406 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
407 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
408 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
409 | 1 | 0, 0, |
410 | 1 | TENSOR_SYMBOL_LIST(c), |
411 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
412 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
413 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
414 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
415 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
416 | 1 | ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
417 | 1 | ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0); |
418 | 1 | ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0); |
419 | 1 | ccv_nnc_tensor_pin_memory(ha); |
420 | 1 | ccv_nnc_tensor_pin_memory(hw1); |
421 | 1 | ccv_nnc_tensor_pin_memory(hbias1); |
422 | 1 | ha->data.f32[0] = 1.4; |
423 | 1 | ha->data.f32[1] = 0.2; |
424 | 1 | hw1->data.f32[0] = 1.1; |
425 | 1 | hw1->data.f32[1] = 2.2; |
426 | 1 | hw1->data.f32[2] = 1; |
427 | 1 | hw1->data.f32[3] = 2; |
428 | 1 | hbias1->data.f32[0] = 0; |
429 | 1 | hbias1->data.f32[1] = 0; |
430 | 1 | ccv_nnc_tensor_t* const hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0); |
431 | 1 | ccv_nnc_tensor_t* const hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0); |
432 | 1 | ccv_nnc_tensor_pin_memory(hw2); |
433 | 1 | ccv_nnc_tensor_pin_memory(hbias2); |
434 | 1 | hw2->data.f32[0] = 0.1; |
435 | 1 | hw2->data.f32[1] = 0.2; |
436 | 1 | hw2->data.f32[2] = 1.2; |
437 | 1 | hw2->data.f32[3] = 1.1; |
438 | 1 | hbias2->data.f32[0] = 1; |
439 | 1 | hbias2->data.f32[1] = 0; |
440 | 1 | ccv_nnc_tensor_t* const hw3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
441 | 1 | ccv_nnc_tensor_t* const hbias3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
442 | 1 | ccv_nnc_tensor_pin_memory(hw3); |
443 | 1 | ccv_nnc_tensor_pin_memory(hbias3); |
444 | 1 | hw3->data.f32[0] = 0.6; |
445 | 1 | hw3->data.f32[1] = 3; |
446 | 1 | hbias3->data.f32[0] = 0.4; |
447 | 1 | ccv_nnc_tensor_t* const hw4 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
448 | 1 | ccv_nnc_tensor_t* const hbias4 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
449 | 1 | ccv_nnc_tensor_pin_memory(hw4); |
450 | 1 | ccv_nnc_tensor_pin_memory(hbias4); |
451 | 1 | hw4->data.f32[0] = 0.2; |
452 | 1 | hw4->data.f32[1] = 0.3; |
453 | 1 | hbias4->data.f32[0] = 1; |
454 | 1 | ccv_nnc_tensor_t* const hbiasc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
455 | 1 | ccv_nnc_tensor_pin_memory(hbiasc); |
456 | 1 | hbiasc->data.f32[0] = 0.5; |
457 | 1 | ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a); |
458 | 1 | ccv_nnc_tensor_t* const w1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w1); |
459 | 1 | ccv_nnc_tensor_t* const bias1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias1); |
460 | 1 | ccv_nnc_tensor_t* const w2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w2); |
461 | 1 | ccv_nnc_tensor_t* const bias2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias2); |
462 | 1 | ccv_nnc_tensor_t* const w3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w3); |
463 | 1 | ccv_nnc_tensor_t* const bias3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias3); |
464 | 1 | ccv_nnc_tensor_t* const w4_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w4); |
465 | 1 | ccv_nnc_tensor_t* const bias4_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias4); |
466 | 1 | ccv_nnc_tensor_t* const biasc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasc); |
467 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw1, hbias1, hw2, hbias2, hw3, hbias3, hw4, hbias4, hbiasc), TENSOR_LIST(a_tensor, w1_tensor, bias1_tensor, w2_tensor, bias2_tensor, w3_tensor, bias3_tensor, w4_tensor, bias4_tensor, biasc_tensor), 0); |
468 | 1 | ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
469 | 1 | ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context); |
470 | 1 | ccv_nnc_stream_context_wait(stream_context); |
471 | 1 | ccv_nnc_stream_context_free(stream_context); |
472 | 1 | ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
473 | 1 | ccv_nnc_tensor_pin_memory(hc); |
474 | 1 | ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c); |
475 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(hc), 0); |
476 | 1 | float av0 = 1.4; |
477 | 1 | float av1 = 0.2; |
478 | 1 | int i; |
479 | 6 | for (i = 0; i < 5; i++)
480 | 5 | { |
481 | 5 | const float b0 = av0 * 1.1 + av1 * 2.2; |
482 | 5 | const float b1 = av0 * 1 + av1 * 2; |
483 | 5 | av0 = b0; |
484 | 5 | av1 = b1; |
485 | 5 | } |
486 | 1 | const float b2v0 = 1.4 * 0.1 + 0.2 * 0.2 + 1; |
487 | 1 | const float b2v1 = 1.4 * 1.2 + 0.2 * 1.1; |
488 | 1 | const float b3v = b2v0 * 0.6 + b2v1 * 3 + 0.4; |
489 | 1 | const float b4v = av0 * 0.2 + av1 * 0.3 + 1; |
490 | 1 | const float cv = b3v * b4v + 0.5; |
491 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hc->data.f32[0], cv, 1e-2, "should match simple algebra"); |
492 | 1 | ccv_nnc_tensor_free(ha); |
493 | 1 | ccv_nnc_tensor_free(hw1); |
494 | 1 | ccv_nnc_tensor_free(hbias1); |
495 | 1 | ccv_nnc_tensor_free(hw2); |
496 | 1 | ccv_nnc_tensor_free(hbias2); |
497 | 1 | ccv_nnc_tensor_free(hw3); |
498 | 1 | ccv_nnc_tensor_free(hbias3); |
499 | 1 | ccv_nnc_tensor_free(hw4); |
500 | 1 | ccv_nnc_tensor_free(hbias4); |
501 | 1 | ccv_nnc_tensor_free(hbiasc); |
502 | 1 | ccv_nnc_tensor_free(hc); |
503 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
504 | 1 | ccv_nnc_graph_free(graph); |
505 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
506 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
507 | 1 | } |
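For reference, the case..of branch gives b2v0 = 1.18 and b2v1 = 1.9, so b3v = 1.18 * 0.6 + 1.9 * 3 + 0.4 = 6.808; the while carry-over again reaches av0 ≈ 182.86 and av1 ≈ 166.23, so b4v ≈ 87.44 and the expected c = b3v * b4v + 0.5 ≈ 595.8.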
508 | | |
509 | | TEST_CASE("partial schedule work, one for device 0 and one for device 1") |
510 | 1 | { |
511 | 1 | GUARD_ELSE_RETURN(ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU) > 1 && |
512 | 1 | ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) && |
513 | 1 | ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
514 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
515 | 1 | ccv_nnc_tensor_symbol_t const a0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a0"); |
516 | 1 | ccv_nnc_tensor_symbol_t const w0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w0"); |
517 | 1 | ccv_nnc_tensor_symbol_t const bias0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias0"); |
518 | 1 | ccv_nnc_tensor_symbol_t const b0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b0"); |
519 | 1 | ccv_nnc_graph_exec_symbol_t const src0 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a0, w0, bias0), TENSOR_SYMBOL_LIST(b0), "mul0"); |
520 | 1 | ccv_nnc_tensor_symbol_t const c0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c0"); |
521 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(1.1), TENSOR_SYMBOL_LIST(b0), TENSOR_SYMBOL_LIST(c0), "scale00"); |
522 | 1 | ccv_nnc_tensor_symbol_t const d0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "d0"); |
523 | 1 | ccv_nnc_graph_exec_symbol_t const dest0 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(0.9), TENSOR_SYMBOL_LIST(c0), TENSOR_SYMBOL_LIST(d0), "scale01"); |
524 | 1 | ccv_nnc_tensor_symbol_t const a1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "a1"); |
525 | 1 | ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "w1"); |
526 | 1 | ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1), "bias1"); |
527 | 1 | ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "b1"); |
528 | 1 | ccv_nnc_graph_exec_symbol_t const src1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a1, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1"); |
529 | 1 | ccv_nnc_tensor_symbol_t const c1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "c1"); |
530 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(1.2), TENSOR_SYMBOL_LIST(b1), TENSOR_SYMBOL_LIST(c1), "scale10"); |
531 | 1 | ccv_nnc_tensor_symbol_t const d1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "d1"); |
532 | 1 | ccv_nnc_graph_exec_symbol_t const dest1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(0.8), TENSOR_SYMBOL_LIST(c1), TENSOR_SYMBOL_LIST(d1), "scale11"); |
533 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
534 | 1 | ccv_nnc_graph_t* graph; |
535 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
536 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
537 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
538 | 1 | 0, 0, |
539 | 1 | TENSOR_SYMBOL_LIST(d0, d1), |
540 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
541 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
542 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
543 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
544 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
545 | 1 | ccv_nnc_tensor_t* const ha0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
546 | 1 | ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
547 | 1 | ccv_nnc_tensor_t* const hw0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
548 | 1 | ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
549 | 1 | ccv_nnc_tensor_t* const hbias0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
550 | 1 | ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
551 | 1 | ccv_nnc_tensor_t* const hd0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
552 | 1 | ccv_nnc_tensor_t* const hd1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
553 | 1 | ha0->data.f32[0] = 0.4; |
554 | 1 | ha0->data.f32[1] = 1.2; |
555 | 1 | hw0->data.f32[0] = 3; |
556 | 1 | hw0->data.f32[1] = 2; |
557 | 1 | hbias0->data.f32[0] = -1; |
558 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
559 | 1 | TENSOR_LIST(ha0, hw0, hbias0), |
560 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a0), ccv_nnc_tensor_from_symbol(tensor_arena, w0), ccv_nnc_tensor_from_symbol(tensor_arena, bias0)), |
561 | 1 | 0); |
562 | 1 | ha1->data.f32[0] = 1.3; |
563 | 1 | ha1->data.f32[1] = 0.5; |
564 | 1 | hw1->data.f32[0] = -3; |
565 | 1 | hw1->data.f32[1] = 5; |
566 | 1 | hbias1->data.f32[0] = 0.2; |
567 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
568 | 1 | TENSOR_LIST(ha1, hw1, hbias1), |
569 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a1), ccv_nnc_tensor_from_symbol(tensor_arena, w1), ccv_nnc_tensor_from_symbol(tensor_arena, bias1)), |
570 | 1 | 0); |
571 | 1 | ccv_nnc_stream_context_t* const stream = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
572 | 1 | ccv_nnc_graph_run_with_schedule(graph, 0, 0, 0, stream); |
573 | 1 | ccv_nnc_stream_context_wait(stream); |
574 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
575 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)), |
576 | 1 | TENSOR_LIST(hd0), |
577 | 1 | 0); |
578 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
579 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)), |
580 | 1 | TENSOR_LIST(hd1), |
581 | 1 | 0); |
582 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd0->data.f32[0], (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9, 1e-5, "result should be equal"); |
583 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal"); |
584 | 1 | hd0->data.f32[0] = 0; |
585 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
586 | 1 | TENSOR_LIST(hd0), |
587 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)), |
588 | 1 | 0); |
589 | 1 | hd1->data.f32[0] = 0; |
590 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
591 | 1 | TENSOR_LIST(hd1), |
592 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)), |
593 | 1 | 0); |
594 | | // schedule device 0 |
595 | 1 | ccv_nnc_graph_static_schedule_t* const schedule0 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0, |
596 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src0)), |
597 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest0))); |
598 | 1 | ccv_nnc_graph_run_with_schedule(graph, 0, schedule0, 0, stream); |
599 | 1 | ccv_nnc_stream_context_wait(stream); |
600 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
601 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)), |
602 | 1 | TENSOR_LIST(hd0), |
603 | 1 | 0); |
604 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd0->data.f32[0], (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9, 1e-5, "result should be equal"); |
605 | | // schedule device 1 |
606 | 1 | ccv_nnc_graph_static_schedule_t* const schedule1 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0, |
607 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src1)), |
608 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest1))); |
609 | 1 | ccv_nnc_graph_run_with_schedule(graph, 0, schedule1, 0, stream); |
610 | 1 | ccv_nnc_stream_context_wait(stream); |
611 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
612 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)), |
613 | 1 | TENSOR_LIST(hd1), |
614 | 1 | 0); |
615 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal"); |
616 | 1 | hd0->data.f32[0] = 0; |
617 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
618 | 1 | TENSOR_LIST(hd0), |
619 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)), |
620 | 1 | 0); |
621 | 1 | hd1->data.f32[0] = 0; |
622 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
623 | 1 | TENSOR_LIST(hd1), |
624 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)), |
625 | 1 | 0); |
626 | | // custom schedule again with both device 0 and device 1. |
627 | 1 | ccv_nnc_graph_static_schedule_t* const schedule01 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0, |
628 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src1)), |
629 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest1))); |
630 | 1 | ccv_nnc_graph_run_with_schedule(graph, 0, schedule01, 0, stream); |
631 | 1 | ccv_nnc_stream_context_wait(stream); |
632 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
633 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)), |
634 | 1 | TENSOR_LIST(hd0), |
635 | 1 | 0); |
636 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
637 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)), |
638 | 1 | TENSOR_LIST(hd1), |
639 | 1 | 0); |
640 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd0->data.f32[0], (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9, 1e-5, "result should be equal"); |
641 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal"); |
642 | 1 | ccv_nnc_graph_static_schedule_free(schedule0); |
643 | 1 | ccv_nnc_graph_static_schedule_free(schedule1); |
644 | 1 | ccv_nnc_graph_static_schedule_free(schedule01); |
645 | 1 | ccv_nnc_stream_context_free(stream); |
646 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
647 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
648 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
649 | 1 | ccv_nnc_graph_free(graph); |
650 | 1 | ccv_nnc_tensor_free(ha0); |
651 | 1 | ccv_nnc_tensor_free(ha1); |
652 | 1 | ccv_nnc_tensor_free(hw0); |
653 | 1 | ccv_nnc_tensor_free(hw1); |
654 | 1 | ccv_nnc_tensor_free(hbias0); |
655 | 1 | ccv_nnc_tensor_free(hbias1); |
656 | 1 | ccv_nnc_tensor_free(hd0); |
657 | 1 | ccv_nnc_tensor_free(hd1); |
658 | 1 | } |
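For reference, the device-0 chain evaluates to (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9 = 2.6 * 0.99 = 2.574 and the device-1 chain to (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8 = -1.2 * 0.96 = -1.152. d0 and d1 are zeroed between runs so that each partial schedule has to recompute its own chain from scratch.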
659 | | TEST_CASE("partial schedule on both device 0 and then join device 1") |
660 | 1 | { |
661 | 1 | GUARD_ELSE_RETURN(ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU) > 1 && |
662 | 1 | ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) && |
663 | 1 | ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
664 | 1 | ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); |
665 | 1 | ccv_nnc_tensor_symbol_t const a0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a0"); |
666 | 1 | ccv_nnc_tensor_symbol_t const w0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w0"); |
667 | 1 | ccv_nnc_tensor_symbol_t const bias0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias0"); |
668 | 1 | ccv_nnc_tensor_symbol_t const b0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b0"); |
669 | 1 | ccv_nnc_graph_exec_symbol_t const src0 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a0, w0, bias0), TENSOR_SYMBOL_LIST(b0), "mul0"); |
670 | 1 | ccv_nnc_graph_exec_symbol_t const dest0 = src0; |
671 | 1 | ccv_nnc_tensor_symbol_t const a1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "a1"); |
672 | 1 | ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "w1"); |
673 | 1 | ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1), "bias1"); |
674 | 1 | ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "b1"); |
675 | 1 | ccv_nnc_graph_exec_symbol_t const src1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a1, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1"); |
676 | 1 | ccv_nnc_tensor_symbol_t const c1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "c1"); |
677 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(1.2), TENSOR_SYMBOL_LIST(b1), TENSOR_SYMBOL_LIST(c1), "scale10"); |
678 | 1 | ccv_nnc_tensor_symbol_t const d1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "d1"); |
679 | 1 | ccv_nnc_graph_exec_symbol_t const dest1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(0.8), TENSOR_SYMBOL_LIST(c1), TENSOR_SYMBOL_LIST(d1), "scale11"); |
680 | 1 | ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_NOOP(), TENSOR_SYMBOL_LIST(b0, d1), TENSOR_SYMBOL_LIST(), "noop"); |
681 | 1 | ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
682 | 1 | ccv_nnc_graph_t* graph; |
683 | 1 | ccv_nnc_tensor_arena_t* tensor_arena; |
684 | 1 | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
685 | 1 | ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, |
686 | 1 | 0, 0, |
687 | 1 | TENSOR_SYMBOL_LIST(b0, d1), |
688 | 1 | SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), |
689 | 1 | &graph, &tensor_arena, &graph_exec_arena); |
690 | 1 | SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH); |
691 | 1 | ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0); |
692 | 1 | GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH); |
693 | 1 | ccv_nnc_tensor_t* const ha0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
694 | 1 | ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
695 | 1 | ccv_nnc_tensor_t* const hw0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
696 | 1 | ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0); |
697 | 1 | ccv_nnc_tensor_t* const hbias0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
698 | 1 | ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0); |
699 | 1 | ccv_nnc_tensor_t* const hb0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
700 | 1 | ccv_nnc_tensor_t* const hd1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0); |
701 | 1 | ha0->data.f32[0] = 0.4; |
702 | 1 | ha0->data.f32[1] = 1.2; |
703 | 1 | hw0->data.f32[0] = 3; |
704 | 1 | hw0->data.f32[1] = 2; |
705 | 1 | hbias0->data.f32[0] = -1; |
706 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
707 | 1 | TENSOR_LIST(ha0, hw0, hbias0), |
708 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a0), ccv_nnc_tensor_from_symbol(tensor_arena, w0), ccv_nnc_tensor_from_symbol(tensor_arena, bias0)), |
709 | 1 | 0); |
710 | 1 | ha1->data.f32[0] = 1.3; |
711 | 1 | ha1->data.f32[1] = 0.5; |
712 | 1 | hw1->data.f32[0] = -3; |
713 | 1 | hw1->data.f32[1] = 5; |
714 | 1 | hbias1->data.f32[0] = 0.2; |
715 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
716 | 1 | TENSOR_LIST(ha1, hw1, hbias1), |
717 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a1), ccv_nnc_tensor_from_symbol(tensor_arena, w1), ccv_nnc_tensor_from_symbol(tensor_arena, bias1)), |
718 | 1 | 0); |
719 | 1 | ccv_nnc_stream_context_t* const stream0 = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_000); |
720 | 1 | ccv_nnc_stream_signal_t* const signal0 = ccv_nnc_stream_signal_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_000); |
721 | 1 | ccv_nnc_stream_context_t* const stream1 = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_001); |
722 | | // custom schedule again with both device 0 and device 1. |
723 | 1 | ccv_nnc_graph_static_schedule_t* const schedule01 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0, |
724 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src1)), |
725 | 1 | GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest1))); |
726 | 1 | ccv_nnc_graph_run_with_schedule(graph, 0, schedule01, 0, stream0); |
727 | 1 | ccv_nnc_stream_context_emit_signal(stream0, signal0); |
728 | 1 | ccv_nnc_stream_context_wait_signal(stream1, signal0); |
729 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
730 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)), |
731 | 1 | TENSOR_LIST(hd1), |
732 | 1 | stream1); |
733 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, |
734 | 1 | TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, b0)), |
735 | 1 | TENSOR_LIST(hb0), |
736 | 1 | stream0); |
737 | 1 | ccv_nnc_stream_context_wait(stream1); |
738 | 1 | ccv_nnc_stream_context_wait(stream0); |
739 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hb0->data.f32[0], 0.4 * 3 + 1.2 * 2 - 1, 1e-5, "result should be equal"); |
740 | 1 | REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal"); |
741 | 1 | ccv_nnc_graph_static_schedule_free(schedule01); |
742 | 1 | ccv_nnc_stream_context_free(stream0); |
743 | 1 | ccv_nnc_stream_signal_free(signal0); |
744 | 1 | ccv_nnc_stream_context_free(stream1); |
745 | 1 | ccv_nnc_symbolic_graph_free(symbolic_graph); |
746 | 1 | ccv_nnc_graph_exec_arena_free(graph_exec_arena); |
747 | 1 | ccv_nnc_tensor_arena_free(tensor_arena); |
748 | 1 | ccv_nnc_graph_free(graph); |
749 | 1 | ccv_nnc_tensor_free(ha0); |
750 | 1 | ccv_nnc_tensor_free(ha1); |
751 | 1 | ccv_nnc_tensor_free(hw0); |
752 | 1 | ccv_nnc_tensor_free(hw1); |
753 | 1 | ccv_nnc_tensor_free(hbias0); |
754 | 1 | ccv_nnc_tensor_free(hbias1); |
755 | 1 | ccv_nnc_tensor_free(hb0); |
756 | 1 | ccv_nnc_tensor_free(hd1); |
757 | 1 | } |
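For reference, only stream0 drives the graph here; signal0 is emitted on stream0 after the run and waited on by stream1, so the d1 copy on stream1 and the b0 copy on stream0 both start only once the graph has finished. The expected values are hb0 = 0.4 * 3 + 1.2 * 2 - 1 = 2.6 and hd1 = (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8 = -1.152.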
758 | | |
759 | | #include "case_main.h" |