Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/schedule.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
TEST_CASE("schedule GPU work on one stream")
15
1
{
16
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
17
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
18
1
  ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a");
19
1
  ccv_nnc_tensor_symbol_t const w = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w");
20
1
  ccv_nnc_tensor_symbol_t const bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias");
21
1
  ccv_nnc_tensor_symbol_t const b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b");
22
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w, bias), TENSOR_SYMBOL_LIST(b), "mul");
23
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
24
1
  ccv_nnc_graph_t* graph;
25
1
  ccv_nnc_tensor_arena_t* tensor_arena;
26
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
27
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
28
1
    0, 0,
29
1
    TENSOR_SYMBOL_LIST(b),
30
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
31
1
    &graph, &tensor_arena, &graph_exec_arena);
32
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
33
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
34
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
35
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
36
1
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
37
1
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
38
1
  ccv_nnc_tensor_pin_memory(ha);
39
1
  ccv_nnc_tensor_pin_memory(hw);
40
1
  ccv_nnc_tensor_pin_memory(hbias);
41
1
  ha->data.f32[0] = 1.4;
42
1
  ha->data.f32[1] = 0.2;
43
1
  hw->data.f32[0] = 2;
44
1
  hw->data.f32[1] = 11;
45
1
  hbias->data.f32[0] = 0;
46
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
47
1
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w);
48
1
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias);
49
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a_tensor, w_tensor, bias_tensor), 0);
50
1
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
51
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
52
1
  ccv_nnc_stream_context_wait(stream_context);
53
1
  ccv_nnc_stream_context_free(stream_context);
54
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
55
1
  ccv_nnc_tensor_pin_memory(hb);
56
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
57
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(hb), 0);
58
1
  REQUIRE_EQ_WITH_TOLERANCE(hb->data.f32[0], 1.4 * 2 + 0.2 * 11, 1e-5, "should match simple algebra");
59
1
  ccv_nnc_tensor_free(ha);
60
1
  ccv_nnc_tensor_free(hw);
61
1
  ccv_nnc_tensor_free(hbias);
62
1
  ccv_nnc_tensor_free(hb);
63
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
64
1
  ccv_nnc_graph_free(graph);
65
1
  ccv_nnc_tensor_arena_free(tensor_arena);
66
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
67
1
}
68
69
TEST_CASE("schedule GPU work on multiple streams")
70
1
{
71
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
72
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
73
1
  ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a");
74
1
  ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w1");
75
1
  ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias1");
76
1
  ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b1");
77
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1");
78
1
  ccv_nnc_tensor_symbol_t const w2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w2");
79
1
  ccv_nnc_tensor_symbol_t const bias2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias2");
80
1
  ccv_nnc_tensor_symbol_t const b2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b2");
81
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w2, bias2), TENSOR_SYMBOL_LIST(b2), "mul2");
82
1
  ccv_nnc_tensor_symbol_t const w3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w3");
83
1
  ccv_nnc_tensor_symbol_t const bias3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias3");
84
1
  ccv_nnc_tensor_symbol_t const b3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b3");
85
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w3, bias3), TENSOR_SYMBOL_LIST(b3), "mul3");
86
1
  ccv_nnc_tensor_symbol_t const biasc = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasc");
87
1
  ccv_nnc_tensor_symbol_t const c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c");
88
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b1, b2, biasc), TENSOR_SYMBOL_LIST(c), "mulc");
89
1
  ccv_nnc_tensor_symbol_t const biasd = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasd");
90
1
  ccv_nnc_tensor_symbol_t const d = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "d");
91
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(c, b3, biasd), TENSOR_SYMBOL_LIST(d), "muld");
92
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
93
1
  ccv_nnc_graph_t* graph;
94
1
  ccv_nnc_tensor_arena_t* tensor_arena;
95
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
96
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
97
1
    0, 0,
98
1
    TENSOR_SYMBOL_LIST(d),
99
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
100
1
    &graph, &tensor_arena, &graph_exec_arena);
101
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
102
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
103
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
104
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
105
1
  ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
106
1
  ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
107
1
  ccv_nnc_tensor_pin_memory(ha);
108
1
  ccv_nnc_tensor_pin_memory(hw1);
109
1
  ccv_nnc_tensor_pin_memory(hbias1);
110
1
  ha->data.f32[0] = 1.4;
111
1
  ha->data.f32[1] = 0.2;
112
1
  hw1->data.f32[0] = 2;
113
1
  hw1->data.f32[1] = 11;
114
1
  hbias1->data.f32[0] = 0;
115
1
  ccv_nnc_tensor_t* const hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
116
1
  ccv_nnc_tensor_t* const hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
117
1
  ccv_nnc_tensor_pin_memory(hw2);
118
1
  ccv_nnc_tensor_pin_memory(hbias2);
119
1
  hw2->data.f32[0] = 1;
120
1
  hw2->data.f32[1] = 2.2;
121
1
  hbias2->data.f32[0] = 1;
122
1
  ccv_nnc_tensor_t* const hw3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
123
1
  ccv_nnc_tensor_t* const hbias3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
124
1
  ccv_nnc_tensor_pin_memory(hw3);
125
1
  ccv_nnc_tensor_pin_memory(hbias3);
126
1
  hw3->data.f32[0] = 0.5;
127
1
  hw3->data.f32[1] = 1.5;
128
1
  hbias3->data.f32[0] = 0.5;
129
1
  ccv_nnc_tensor_t* const hbiasc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
130
1
  ccv_nnc_tensor_pin_memory(hbiasc);
131
1
  hbiasc->data.f32[0] = 0.2;
132
1
  ccv_nnc_tensor_t* const hbiasd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
133
1
  ccv_nnc_tensor_pin_memory(hbiasd);
134
1
  hbiasd->data.f32[0] = 0.3;
135
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
136
1
  ccv_nnc_tensor_t* const w1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w1);
137
1
  ccv_nnc_tensor_t* const bias1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias1);
138
1
  ccv_nnc_tensor_t* const w2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w2);
139
1
  ccv_nnc_tensor_t* const bias2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias2);
140
1
  ccv_nnc_tensor_t* const w3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w3);
141
1
  ccv_nnc_tensor_t* const bias3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias3);
142
1
  ccv_nnc_tensor_t* const biasc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasc);
143
1
  ccv_nnc_tensor_t* const biasd_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasd);
144
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw1, hbias1, hw2, hbias2, hw3, hbias3, hbiasc, hbiasd), TENSOR_LIST(a_tensor, w1_tensor, bias1_tensor, w2_tensor, bias2_tensor, w3_tensor, bias3_tensor, biasc_tensor, biasd_tensor), 0);
145
1
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
146
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
147
1
  ccv_nnc_stream_context_wait(stream_context);
148
1
  ccv_nnc_stream_context_free(stream_context);
149
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
150
1
  ccv_nnc_tensor_pin_memory(hd);
151
1
  ccv_nnc_tensor_t* const d_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, d);
152
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d_tensor), TENSOR_LIST(hd), 0);
153
1
  const float b1v = 1.4 * 2 + 0.2 * 11;
154
1
  const float b2v = 1.4 * 1 + 0.2 * 2.2 + 1;
155
1
  const float b3v = 1.4 * 0.5 + 0.2 * 1.5 + 0.5;
156
1
  const float cv = b1v * b2v + 0.2;
157
1
  const float dv = cv * b3v + 0.3;
158
1
  REQUIRE_EQ_WITH_TOLERANCE(hd->data.f32[0], dv, 1e-5, "should match simple algebra");
159
1
  ccv_nnc_tensor_free(ha);
160
1
  ccv_nnc_tensor_free(hw1);
161
1
  ccv_nnc_tensor_free(hbias1);
162
1
  ccv_nnc_tensor_free(hw2);
163
1
  ccv_nnc_tensor_free(hbias2);
164
1
  ccv_nnc_tensor_free(hw3);
165
1
  ccv_nnc_tensor_free(hbias3);
166
1
  ccv_nnc_tensor_free(hbiasc);
167
1
  ccv_nnc_tensor_free(hbiasd);
168
1
  ccv_nnc_tensor_free(hd);
169
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
170
1
  ccv_nnc_graph_free(graph);
171
1
  ccv_nnc_tensor_arena_free(tensor_arena);
172
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
173
1
}
174
175
static int while_5(ccv_nnc_tensor_t* const* const inputs, const int input_size, const void* const data)
176
12
{
177
12
  return inputs[0]->data.i64[0] < 5;
178
12
}
179
180
TEST_CASE("schedule GPU work with while loop")
181
1
{
182
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
183
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
184
1
  ccv_nnc_symbolic_graph_t* const while_graph = ccv_nnc_symbolic_graph_new();
185
1
  ccv_nnc_symbolic_graph_while(symbolic_graph, CCV_NNC_GRAPH_FORWARD, while_graph, "while 1..5");
186
1
  ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a");
187
1
  ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w1");
188
1
  ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2), "bias1");
189
1
  ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b1");
190
1
  ccv_nnc_graph_exec_symbol_t const noop = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_NOOP(), 0, 0, 0, 0, "noop");
191
1
  ccv_nnc_graph_exec_symbol_t const mul1 = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1");
192
1
  ccv_nnc_graph_exec_symbol_concat(while_graph, noop, mul1);
193
1
  ccv_nnc_symbolic_graph_set_while_expr(while_graph, while_5, 0, TENSOR_SYMBOL_LIST(ccv_nnc_tensor_symbol_for_while_count(while_graph)), GRAPH_EXEC_SYMBOL_LIST(noop));
194
1
  ccv_nnc_symbolic_graph_set_carry_overs(while_graph, TENSOR_SYMBOL_MAP(KV(b1, a)));
195
1
  ccv_nnc_symbolic_graph_set_sources(while_graph, GRAPH_EXEC_SYMBOL_LIST(noop));
196
1
  ccv_nnc_symbolic_graph_set_destinations(while_graph, GRAPH_EXEC_SYMBOL_LIST(mul1));
197
1
  ccv_nnc_tensor_symbol_t const w2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w2");
198
1
  ccv_nnc_tensor_symbol_t const bias2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias2");
199
1
  ccv_nnc_tensor_symbol_t const b2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b2");
200
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w2, bias2), TENSOR_SYMBOL_LIST(b2), "mul2");
201
1
  ccv_nnc_tensor_symbol_t const w3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w3");
202
1
  ccv_nnc_tensor_symbol_t const bias3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias3");
203
1
  ccv_nnc_tensor_symbol_t const b3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b3");
204
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b1, w3, bias3), TENSOR_SYMBOL_LIST(b3), "mul3");
205
1
  ccv_nnc_tensor_symbol_t const biasc = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasc");
206
1
  ccv_nnc_tensor_symbol_t const c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c");
207
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b2, b3, biasc), TENSOR_SYMBOL_LIST(c), "mulc");
208
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
209
1
  ccv_nnc_graph_t* graph;
210
1
  ccv_nnc_tensor_arena_t* tensor_arena;
211
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
212
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
213
1
    0, 0,
214
1
    TENSOR_SYMBOL_LIST(c),
215
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
216
1
    &graph, &tensor_arena, &graph_exec_arena);
217
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
218
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
219
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
220
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
221
1
  ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
222
1
  ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
223
1
  ccv_nnc_tensor_pin_memory(ha);
224
1
  ccv_nnc_tensor_pin_memory(hw1);
225
1
  ccv_nnc_tensor_pin_memory(hbias1);
226
1
  ha->data.f32[0] = 1.4;
227
1
  ha->data.f32[1] = 0.2;
228
1
  hw1->data.f32[0] = 1.1;
229
1
  hw1->data.f32[1] = 2.2;
230
1
  hw1->data.f32[2] = 1;
231
1
  hw1->data.f32[3] = 2;
232
1
  hbias1->data.f32[0] = 0;
233
1
  hbias1->data.f32[1] = 0;
234
1
  ccv_nnc_tensor_t* const hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
235
1
  ccv_nnc_tensor_t* const hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
236
1
  ccv_nnc_tensor_pin_memory(hw2);
237
1
  ccv_nnc_tensor_pin_memory(hbias2);
238
1
  hw2->data.f32[0] = 0.6;
239
1
  hw2->data.f32[1] = 3;
240
1
  hbias2->data.f32[0] = 0.4;
241
1
  ccv_nnc_tensor_t* const hw3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
242
1
  ccv_nnc_tensor_t* const hbias3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
243
1
  ccv_nnc_tensor_pin_memory(hw3);
244
1
  ccv_nnc_tensor_pin_memory(hbias3);
245
1
  hw3->data.f32[0] = 0.2;
246
1
  hw3->data.f32[1] = 0.3;
247
1
  hbias3->data.f32[0] = 1;
248
1
  ccv_nnc_tensor_t* const hbiasc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
249
1
  ccv_nnc_tensor_pin_memory(hbiasc);
250
1
  hbiasc->data.f32[0] = 0.5;
251
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
252
1
  ccv_nnc_tensor_t* const w1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w1);
253
1
  ccv_nnc_tensor_t* const bias1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias1);
254
1
  ccv_nnc_tensor_t* const w2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w2);
255
1
  ccv_nnc_tensor_t* const bias2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias2);
256
1
  ccv_nnc_tensor_t* const w3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w3);
257
1
  ccv_nnc_tensor_t* const bias3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias3);
258
1
  ccv_nnc_tensor_t* const biasc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasc);
259
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw1, hbias1, hw2, hbias2, hw3, hbias3, hbiasc), TENSOR_LIST(a_tensor, w1_tensor, bias1_tensor, w2_tensor, bias2_tensor, w3_tensor, bias3_tensor, biasc_tensor), 0);
260
1
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_graph_default_stream(graph);
261
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
262
1
  ccv_nnc_stream_context_wait(stream_context);
263
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
264
1
  ccv_nnc_tensor_pin_memory(hc);
265
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
266
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(hc), 0);
267
1
  float av0 = 1.4;
268
1
  float av1 = 0.2;
269
1
  int i;
270
6
  for (i = 0; i < 5; i++)
271
5
  {
272
5
    const float b0 = av0 * 1.1 + av1 * 2.2;
273
5
    const float b1 = av0 * 1 + av1 * 2;
274
5
    av0 = b0;
275
5
    av1 = b1;
276
5
  }
277
1
  const float b2v = 1.4 * 0.6 + 0.2 * 3 + 0.4;
278
1
  const float b3v = av0 * 0.2 + av1 * 0.3 + 1;
279
1
  const float cv = b2v * b3v + 0.5;
280
1
  REQUIRE_EQ_WITH_TOLERANCE(hc->data.f32[0], cv, 1e-2, "should match simple algebra");
281
1
  ccv_nnc_tensor_free(ha);
282
1
  ccv_nnc_tensor_free(hw1);
283
1
  ccv_nnc_tensor_free(hbias1);
284
1
  ccv_nnc_tensor_free(hw2);
285
1
  ccv_nnc_tensor_free(hbias2);
286
1
  ccv_nnc_tensor_free(hw3);
287
1
  ccv_nnc_tensor_free(hbias3);
288
1
  ccv_nnc_tensor_free(hbiasc);
289
1
  ccv_nnc_tensor_free(hc);
290
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
291
1
  ccv_nnc_graph_free(graph);
292
1
  ccv_nnc_tensor_arena_free(tensor_arena);
293
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
294
1
}
295
296
static int case_of_0(ccv_nnc_tensor_t* const *const inputs, const int input_size, const void* const data)
297
2
{
298
2
  return 0;
299
2
}
300
301
TEST_CASE("schedule GPU work with case..of")
302
1
{
303
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
304
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
305
1
  ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a");
306
1
  ccv_nnc_tensor_symbol_t const b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b");
307
1
  ccv_nnc_graph_exec_symbol_t const case_of = ccv_nnc_symbolic_graph_case_of_new(symbolic_graph, CCV_NNC_GRAPH_FORWARD, TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_MAP(KV(a, b)), "case..of");
308
1
  ccv_nnc_symbolic_graph_set_case_of_expr(symbolic_graph, case_of, case_of_0, 0);
309
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph_0 = ccv_nnc_symbolic_graph_new();
310
1
  ccv_nnc_tensor_symbol_t const b0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b0");
311
1
  ccv_nnc_symbolic_graph_set_case_of(symbolic_graph, case_of, symbolic_graph_0, 0, TENSOR_SYMBOL_MAP(KV(b0, b)));
312
1
  ccv_nnc_tensor_symbol_t const w = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w");
313
1
  ccv_nnc_tensor_symbol_t const bias = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2), "bias");
314
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph_0, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w, bias), TENSOR_SYMBOL_LIST(b0), "mul");
315
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph_0, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
316
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
317
1
  ccv_nnc_graph_t* graph;
318
1
  ccv_nnc_tensor_arena_t* tensor_arena;
319
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
320
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
321
1
    0, 0,
322
1
    TENSOR_SYMBOL_LIST(b),
323
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
324
1
    &graph, &tensor_arena, &graph_exec_arena);
325
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
326
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
327
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
328
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
329
1
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
330
1
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
331
1
  ccv_nnc_tensor_pin_memory(ha);
332
1
  ccv_nnc_tensor_pin_memory(hw);
333
1
  ccv_nnc_tensor_pin_memory(hbias);
334
1
  ha->data.f32[0] = 1.4;
335
1
  ha->data.f32[1] = 0.2;
336
1
  hw->data.f32[0] = 2;
337
1
  hw->data.f32[1] = 11;
338
1
  hw->data.f32[2] = 1;
339
1
  hw->data.f32[3] = 2;
340
1
  hbias->data.f32[0] = 0;
341
1
  hbias->data.f32[1] = 0;
342
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
343
1
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w);
344
1
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias);
345
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a_tensor, w_tensor, bias_tensor), 0);
346
1
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
347
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
348
1
  ccv_nnc_stream_context_wait(stream_context);
349
1
  ccv_nnc_stream_context_free(stream_context);
350
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
351
1
  ccv_nnc_tensor_pin_memory(hb);
352
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
353
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(hb), 0);
354
1
  REQUIRE_EQ_WITH_TOLERANCE(hb->data.f32[0], 1.4 * 2 + 0.2 * 11, 1e-5, "should match simple algebra");
355
1
  REQUIRE_EQ_WITH_TOLERANCE(hb->data.f32[1], 1.4 + 0.2 * 2, 1e-5, "should match simple algebra");
356
1
  ccv_nnc_graph_free(graph);
357
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
358
1
  ccv_nnc_tensor_arena_free(tensor_arena);
359
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
360
1
  ccv_nnc_tensor_free(ha);
361
1
  ccv_nnc_tensor_free(hw);
362
1
  ccv_nnc_tensor_free(hbias);
363
1
  ccv_nnc_tensor_free(hb);
364
1
}
365
366
TEST_CASE("schedule GPU work with both while loop and case..of")
367
1
{
368
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
369
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
370
1
  ccv_nnc_symbolic_graph_t* const while_graph = ccv_nnc_symbolic_graph_new();
371
1
  ccv_nnc_symbolic_graph_while(symbolic_graph, CCV_NNC_GRAPH_FORWARD, while_graph, "while 1..5");
372
1
  ccv_nnc_tensor_symbol_t const a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a");
373
1
  ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w1");
374
1
  ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(while_graph, GPU_TENSOR_NHWC(000, 32F, 2), "bias1");
375
1
  ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b1");
376
1
  ccv_nnc_graph_exec_symbol_t const noop = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_NOOP(), 0, 0, 0, 0, "noop");
377
1
  ccv_nnc_graph_exec_symbol_t const mul1 = ccv_nnc_graph_exec_symbol_new(while_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1");
378
1
  ccv_nnc_graph_exec_symbol_concat(while_graph, noop, mul1);
379
1
  ccv_nnc_symbolic_graph_set_while_expr(while_graph, while_5, 0, TENSOR_SYMBOL_LIST(ccv_nnc_tensor_symbol_for_while_count(while_graph)), GRAPH_EXEC_SYMBOL_LIST(noop));
380
1
  ccv_nnc_symbolic_graph_set_carry_overs(while_graph, TENSOR_SYMBOL_MAP(KV(b1, a)));
381
1
  ccv_nnc_symbolic_graph_set_sources(while_graph, GRAPH_EXEC_SYMBOL_LIST(noop));
382
1
  ccv_nnc_symbolic_graph_set_destinations(while_graph, GRAPH_EXEC_SYMBOL_LIST(mul1));
383
1
  ccv_nnc_tensor_symbol_t const b2 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b2");
384
1
  ccv_nnc_graph_exec_symbol_t const case_of = ccv_nnc_symbolic_graph_case_of_new(symbolic_graph, CCV_NNC_GRAPH_FORWARD, TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_MAP(KV(a, b2)), "case..of");
385
1
  ccv_nnc_symbolic_graph_set_case_of_expr(symbolic_graph, case_of, case_of_0, 0);
386
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph_0 = ccv_nnc_symbolic_graph_new();
387
1
  ccv_nnc_tensor_symbol_t const b20 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "b20");
388
1
  ccv_nnc_symbolic_graph_set_case_of(symbolic_graph, case_of, symbolic_graph_0, 0, TENSOR_SYMBOL_MAP(KV(b20, b2)));
389
1
  ccv_nnc_tensor_symbol_t const w2 = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2, 2), "w2");
390
1
  ccv_nnc_tensor_symbol_t const bias2 = ccv_nnc_tensor_symbol_new(symbolic_graph_0, GPU_TENSOR_NHWC(000, 32F, 2), "bias2");
391
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph_0, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a, w2, bias2), TENSOR_SYMBOL_LIST(b20), "mul2");
392
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph_0, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
393
1
  ccv_nnc_tensor_symbol_t const w3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w3");
394
1
  ccv_nnc_tensor_symbol_t const bias3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias3");
395
1
  ccv_nnc_tensor_symbol_t const b3 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b3");
396
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b2, w3, bias3), TENSOR_SYMBOL_LIST(b3), "mul3");
397
1
  ccv_nnc_tensor_symbol_t const w4 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w4");
398
1
  ccv_nnc_tensor_symbol_t const bias4 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias4");
399
1
  ccv_nnc_tensor_symbol_t const b4 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b4");
400
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b1, w4, bias4), TENSOR_SYMBOL_LIST(b4), "mul4");
401
1
  ccv_nnc_tensor_symbol_t const biasc = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "biasc");
402
1
  ccv_nnc_tensor_symbol_t const c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c");
403
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(b3, b4, biasc), TENSOR_SYMBOL_LIST(c), "mulc");
404
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
405
1
  ccv_nnc_graph_t* graph;
406
1
  ccv_nnc_tensor_arena_t* tensor_arena;
407
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
408
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
409
1
    0, 0,
410
1
    TENSOR_SYMBOL_LIST(c),
411
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
412
1
    &graph, &tensor_arena, &graph_exec_arena);
413
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
414
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
415
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
416
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
417
1
  ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
418
1
  ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
419
1
  ccv_nnc_tensor_pin_memory(ha);
420
1
  ccv_nnc_tensor_pin_memory(hw1);
421
1
  ccv_nnc_tensor_pin_memory(hbias1);
422
1
  ha->data.f32[0] = 1.4;
423
1
  ha->data.f32[1] = 0.2;
424
1
  hw1->data.f32[0] = 1.1;
425
1
  hw1->data.f32[1] = 2.2;
426
1
  hw1->data.f32[2] = 1;
427
1
  hw1->data.f32[3] = 2;
428
1
  hbias1->data.f32[0] = 0;
429
1
  hbias1->data.f32[1] = 0;
430
1
  ccv_nnc_tensor_t* const hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
431
1
  ccv_nnc_tensor_t* const hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
432
1
  ccv_nnc_tensor_pin_memory(hw2);
433
1
  ccv_nnc_tensor_pin_memory(hbias2);
434
1
  hw2->data.f32[0] = 0.1;
435
1
  hw2->data.f32[1] = 0.2;
436
1
  hw2->data.f32[2] = 1.2;
437
1
  hw2->data.f32[3] = 1.1;
438
1
  hbias2->data.f32[0] = 1;
439
1
  hbias2->data.f32[1] = 0;
440
1
  ccv_nnc_tensor_t* const hw3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
441
1
  ccv_nnc_tensor_t* const hbias3 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
442
1
  ccv_nnc_tensor_pin_memory(hw3);
443
1
  ccv_nnc_tensor_pin_memory(hbias3);
444
1
  hw3->data.f32[0] = 0.6;
445
1
  hw3->data.f32[1] = 3;
446
1
  hbias3->data.f32[0] = 0.4;
447
1
  ccv_nnc_tensor_t* const hw4 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
448
1
  ccv_nnc_tensor_t* const hbias4 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
449
1
  ccv_nnc_tensor_pin_memory(hw4);
450
1
  ccv_nnc_tensor_pin_memory(hbias4);
451
1
  hw4->data.f32[0] = 0.2;
452
1
  hw4->data.f32[1] = 0.3;
453
1
  hbias4->data.f32[0] = 1;
454
1
  ccv_nnc_tensor_t* const hbiasc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
455
1
  ccv_nnc_tensor_pin_memory(hbiasc);
456
1
  hbiasc->data.f32[0] = 0.5;
457
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
458
1
  ccv_nnc_tensor_t* const w1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w1);
459
1
  ccv_nnc_tensor_t* const bias1_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias1);
460
1
  ccv_nnc_tensor_t* const w2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w2);
461
1
  ccv_nnc_tensor_t* const bias2_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias2);
462
1
  ccv_nnc_tensor_t* const w3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w3);
463
1
  ccv_nnc_tensor_t* const bias3_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias3);
464
1
  ccv_nnc_tensor_t* const w4_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, w4);
465
1
  ccv_nnc_tensor_t* const bias4_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bias4);
466
1
  ccv_nnc_tensor_t* const biasc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, biasc);
467
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw1, hbias1, hw2, hbias2, hw3, hbias3, hw4, hbias4, hbiasc), TENSOR_LIST(a_tensor, w1_tensor, bias1_tensor, w2_tensor, bias2_tensor, w3_tensor, bias3_tensor, w4_tensor, bias4_tensor, biasc_tensor), 0);
468
1
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
469
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
470
1
  ccv_nnc_stream_context_wait(stream_context);
471
1
  ccv_nnc_stream_context_free(stream_context);
472
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
473
1
  ccv_nnc_tensor_pin_memory(hc);
474
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
475
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(hc), 0);
476
1
  float av0 = 1.4;
477
1
  float av1 = 0.2;
478
1
  int i;
479
6
  for (i = 0; i < 5; i++)
480
5
  {
481
5
    const float b0 = av0 * 1.1 + av1 * 2.2;
482
5
    const float b1 = av0 * 1 + av1 * 2;
483
5
    av0 = b0;
484
5
    av1 = b1;
485
5
  }
486
1
  const float b2v0 = 1.4 * 0.1 + 0.2 * 0.2 + 1;
487
1
  const float b2v1 = 1.4 * 1.2 + 0.2 * 1.1;
488
1
  const float b3v = b2v0 * 0.6 + b2v1 * 3 + 0.4;
489
1
  const float b4v = av0 * 0.2 + av1 * 0.3 + 1;
490
1
  const float cv = b3v * b4v + 0.5;
491
1
  REQUIRE_EQ_WITH_TOLERANCE(hc->data.f32[0], cv, 1e-2, "should match simple algebra");
492
1
  ccv_nnc_tensor_free(ha);
493
1
  ccv_nnc_tensor_free(hw1);
494
1
  ccv_nnc_tensor_free(hbias1);
495
1
  ccv_nnc_tensor_free(hw2);
496
1
  ccv_nnc_tensor_free(hbias2);
497
1
  ccv_nnc_tensor_free(hw3);
498
1
  ccv_nnc_tensor_free(hbias3);
499
1
  ccv_nnc_tensor_free(hw4);
500
1
  ccv_nnc_tensor_free(hbias4);
501
1
  ccv_nnc_tensor_free(hbiasc);
502
1
  ccv_nnc_tensor_free(hc);
503
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
504
1
  ccv_nnc_graph_free(graph);
505
1
  ccv_nnc_tensor_arena_free(tensor_arena);
506
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
507
1
}
508
509
TEST_CASE("partial schedule work, one for device 0 and one for device 1")
510
1
{
511
1
  GUARD_ELSE_RETURN(ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU) > 1 &&
512
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
513
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
514
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
515
1
  ccv_nnc_tensor_symbol_t const a0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a0");
516
1
  ccv_nnc_tensor_symbol_t const w0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w0");
517
1
  ccv_nnc_tensor_symbol_t const bias0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias0");
518
1
  ccv_nnc_tensor_symbol_t const b0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b0");
519
1
  ccv_nnc_graph_exec_symbol_t const src0 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a0, w0, bias0), TENSOR_SYMBOL_LIST(b0), "mul0");
520
1
  ccv_nnc_tensor_symbol_t const c0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "c0");
521
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(1.1), TENSOR_SYMBOL_LIST(b0), TENSOR_SYMBOL_LIST(c0), "scale00");
522
1
  ccv_nnc_tensor_symbol_t const d0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "d0");
523
1
  ccv_nnc_graph_exec_symbol_t const dest0 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(0.9), TENSOR_SYMBOL_LIST(c0), TENSOR_SYMBOL_LIST(d0), "scale01");
524
1
  ccv_nnc_tensor_symbol_t const a1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "a1");
525
1
  ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "w1");
526
1
  ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1), "bias1");
527
1
  ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "b1");
528
1
  ccv_nnc_graph_exec_symbol_t const src1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a1, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1");
529
1
  ccv_nnc_tensor_symbol_t const c1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "c1");
530
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(1.2), TENSOR_SYMBOL_LIST(b1), TENSOR_SYMBOL_LIST(c1), "scale10");
531
1
  ccv_nnc_tensor_symbol_t const d1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "d1");
532
1
  ccv_nnc_graph_exec_symbol_t const dest1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(0.8), TENSOR_SYMBOL_LIST(c1), TENSOR_SYMBOL_LIST(d1), "scale11");
533
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
534
1
  ccv_nnc_graph_t* graph;
535
1
  ccv_nnc_tensor_arena_t* tensor_arena;
536
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
537
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
538
1
    0, 0,
539
1
    TENSOR_SYMBOL_LIST(d0, d1),
540
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
541
1
    &graph, &tensor_arena, &graph_exec_arena);
542
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
543
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
544
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
545
1
  ccv_nnc_tensor_t* const ha0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
546
1
  ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
547
1
  ccv_nnc_tensor_t* const hw0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
548
1
  ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
549
1
  ccv_nnc_tensor_t* const hbias0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
550
1
  ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
551
1
  ccv_nnc_tensor_t* const hd0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
552
1
  ccv_nnc_tensor_t* const hd1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
553
1
  ha0->data.f32[0] = 0.4;
554
1
  ha0->data.f32[1] = 1.2;
555
1
  hw0->data.f32[0] = 3;
556
1
  hw0->data.f32[1] = 2;
557
1
  hbias0->data.f32[0] = -1;
558
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
559
1
    TENSOR_LIST(ha0, hw0, hbias0),
560
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a0), ccv_nnc_tensor_from_symbol(tensor_arena, w0), ccv_nnc_tensor_from_symbol(tensor_arena, bias0)),
561
1
    0);
562
1
  ha1->data.f32[0] = 1.3;
563
1
  ha1->data.f32[1] = 0.5;
564
1
  hw1->data.f32[0] = -3;
565
1
  hw1->data.f32[1] = 5;
566
1
  hbias1->data.f32[0] = 0.2;
567
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
568
1
    TENSOR_LIST(ha1, hw1, hbias1),
569
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a1), ccv_nnc_tensor_from_symbol(tensor_arena, w1), ccv_nnc_tensor_from_symbol(tensor_arena, bias1)),
570
1
    0);
571
1
  ccv_nnc_stream_context_t* const stream = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
572
1
  ccv_nnc_graph_run_with_schedule(graph, 0, 0, 0, stream);
573
1
  ccv_nnc_stream_context_wait(stream);
574
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
575
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)),
576
1
    TENSOR_LIST(hd0),
577
1
    0);
578
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
579
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)),
580
1
    TENSOR_LIST(hd1),
581
1
    0);
582
1
  REQUIRE_EQ_WITH_TOLERANCE(hd0->data.f32[0], (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9, 1e-5, "result should be equal");
583
1
  REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal");
584
1
  hd0->data.f32[0] = 0;
585
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
586
1
    TENSOR_LIST(hd0),
587
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)),
588
1
    0);
589
1
  hd1->data.f32[0] = 0;
590
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
591
1
    TENSOR_LIST(hd1),
592
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)),
593
1
    0);
594
  // schedule device 0
595
1
  ccv_nnc_graph_static_schedule_t* const schedule0 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0,
596
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src0)),
597
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest0)));
598
1
  ccv_nnc_graph_run_with_schedule(graph, 0, schedule0, 0, stream);
599
1
  ccv_nnc_stream_context_wait(stream);
600
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
601
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)),
602
1
    TENSOR_LIST(hd0),
603
1
    0);
604
1
  REQUIRE_EQ_WITH_TOLERANCE(hd0->data.f32[0], (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9, 1e-5, "result should be equal");
605
  // schedule device 1
606
1
  ccv_nnc_graph_static_schedule_t* const schedule1 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0,
607
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src1)),
608
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest1)));
609
1
  ccv_nnc_graph_run_with_schedule(graph, 0, schedule1, 0, stream);
610
1
  ccv_nnc_stream_context_wait(stream);
611
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
612
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)),
613
1
    TENSOR_LIST(hd1),
614
1
    0);
615
1
  REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal");
616
1
  hd0->data.f32[0] = 0;
617
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
618
1
    TENSOR_LIST(hd0),
619
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)),
620
1
    0);
621
1
  hd1->data.f32[0] = 0;
622
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
623
1
    TENSOR_LIST(hd1),
624
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)),
625
1
    0);
626
  // custom schedule again with both device 0 and device 1.
627
1
  ccv_nnc_graph_static_schedule_t* const schedule01 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0,
628
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src1)),
629
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest1)));
630
1
  ccv_nnc_graph_run_with_schedule(graph, 0, schedule01, 0, stream);
631
1
  ccv_nnc_stream_context_wait(stream);
632
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
633
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d0)),
634
1
    TENSOR_LIST(hd0),
635
1
    0);
636
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
637
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)),
638
1
    TENSOR_LIST(hd1),
639
1
    0);
640
1
  REQUIRE_EQ_WITH_TOLERANCE(hd0->data.f32[0], (0.4 * 3 + 1.2 * 2 - 1) * 1.1 * 0.9, 1e-5, "result should be equal");
641
1
  REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal");
642
1
  ccv_nnc_graph_static_schedule_free(schedule0);
643
1
  ccv_nnc_graph_static_schedule_free(schedule1);
644
1
  ccv_nnc_graph_static_schedule_free(schedule01);
645
1
  ccv_nnc_stream_context_free(stream);
646
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
647
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
648
1
  ccv_nnc_tensor_arena_free(tensor_arena);
649
1
  ccv_nnc_graph_free(graph);
650
1
  ccv_nnc_tensor_free(ha0);
651
1
  ccv_nnc_tensor_free(ha1);
652
1
  ccv_nnc_tensor_free(hw0);
653
1
  ccv_nnc_tensor_free(hw1);
654
1
  ccv_nnc_tensor_free(hbias0);
655
1
  ccv_nnc_tensor_free(hbias1);
656
1
  ccv_nnc_tensor_free(hd0);
657
1
  ccv_nnc_tensor_free(hd1);
658
1
}
659
TEST_CASE("partial schedule on both device 0 and then join device 1")
660
1
{
661
1
  GUARD_ELSE_RETURN(ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU) > 1 &&
662
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
663
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
664
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
665
1
  ccv_nnc_tensor_symbol_t const a0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "a0");
666
1
  ccv_nnc_tensor_symbol_t const w0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2), "w0");
667
1
  ccv_nnc_tensor_symbol_t const bias0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1), "bias0");
668
1
  ccv_nnc_tensor_symbol_t const b0 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 1), "b0");
669
1
  ccv_nnc_graph_exec_symbol_t const src0 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a0, w0, bias0), TENSOR_SYMBOL_LIST(b0), "mul0");
670
1
  ccv_nnc_graph_exec_symbol_t const dest0 = src0;
671
1
  ccv_nnc_tensor_symbol_t const a1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "a1");
672
1
  ccv_nnc_tensor_symbol_t const w1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 2), "w1");
673
1
  ccv_nnc_tensor_symbol_t const bias1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1), "bias1");
674
1
  ccv_nnc_tensor_symbol_t const b1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "b1");
675
1
  ccv_nnc_graph_exec_symbol_t const src1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), TENSOR_SYMBOL_LIST(a1, w1, bias1), TENSOR_SYMBOL_LIST(b1), "mul1");
676
1
  ccv_nnc_tensor_symbol_t const c1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "c1");
677
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(1.2), TENSOR_SYMBOL_LIST(b1), TENSOR_SYMBOL_LIST(c1), "scale10");
678
1
  ccv_nnc_tensor_symbol_t const d1 = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(001, 32F, 1, 1), "d1");
679
1
  ccv_nnc_graph_exec_symbol_t const dest1 = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SCALAR_MUL_FORWARD(0.8), TENSOR_SYMBOL_LIST(c1), TENSOR_SYMBOL_LIST(d1), "scale11");
680
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_NOOP(), TENSOR_SYMBOL_LIST(b0, d1), TENSOR_SYMBOL_LIST(), "noop");
681
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
682
1
  ccv_nnc_graph_t* graph;
683
1
  ccv_nnc_tensor_arena_t* tensor_arena;
684
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
685
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
686
1
    0, 0,
687
1
    TENSOR_SYMBOL_LIST(b0, d1),
688
1
    SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
689
1
    &graph, &tensor_arena, &graph_exec_arena);
690
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
691
1
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
692
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
693
1
  ccv_nnc_tensor_t* const ha0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
694
1
  ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
695
1
  ccv_nnc_tensor_t* const hw0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
696
1
  ccv_nnc_tensor_t* const hw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2), 0);
697
1
  ccv_nnc_tensor_t* const hbias0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
698
1
  ccv_nnc_tensor_t* const hbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
699
1
  ccv_nnc_tensor_t* const hb0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
700
1
  ccv_nnc_tensor_t* const hd1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
701
1
  ha0->data.f32[0] = 0.4;
702
1
  ha0->data.f32[1] = 1.2;
703
1
  hw0->data.f32[0] = 3;
704
1
  hw0->data.f32[1] = 2;
705
1
  hbias0->data.f32[0] = -1;
706
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
707
1
    TENSOR_LIST(ha0, hw0, hbias0),
708
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a0), ccv_nnc_tensor_from_symbol(tensor_arena, w0), ccv_nnc_tensor_from_symbol(tensor_arena, bias0)),
709
1
    0);
710
1
  ha1->data.f32[0] = 1.3;
711
1
  ha1->data.f32[1] = 0.5;
712
1
  hw1->data.f32[0] = -3;
713
1
  hw1->data.f32[1] = 5;
714
1
  hbias1->data.f32[0] = 0.2;
715
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
716
1
    TENSOR_LIST(ha1, hw1, hbias1),
717
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, a1), ccv_nnc_tensor_from_symbol(tensor_arena, w1), ccv_nnc_tensor_from_symbol(tensor_arena, bias1)),
718
1
    0);
719
1
  ccv_nnc_stream_context_t* const stream0 = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_000);
720
1
  ccv_nnc_stream_signal_t* const signal0 = ccv_nnc_stream_signal_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_000);
721
1
  ccv_nnc_stream_context_t* const stream1 = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU | CCV_COMPUTE_DEVICE_001);
722
  // custom schedule again with both device 0 and device 1.
723
1
  ccv_nnc_graph_static_schedule_t* const schedule01 = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, 0,
724
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, src1)),
725
1
    GRAPH_EXEC_LIST(ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest0), ccv_nnc_graph_exec_from_symbol(graph_exec_arena, dest1)));
726
1
  ccv_nnc_graph_run_with_schedule(graph, 0, schedule01, 0, stream0);
727
1
  ccv_nnc_stream_context_emit_signal(stream0, signal0);
728
1
  ccv_nnc_stream_context_wait_signal(stream1, signal0);
729
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
730
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, d1)),
731
1
    TENSOR_LIST(hd1),
732
1
    stream1);
733
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
734
1
    TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, b0)),
735
1
    TENSOR_LIST(hb0),
736
1
    stream0);
737
1
  ccv_nnc_stream_context_wait(stream1);
738
1
  ccv_nnc_stream_context_wait(stream0);
739
1
  REQUIRE_EQ_WITH_TOLERANCE(hb0->data.f32[0], 0.4 * 3 + 1.2 * 2 - 1, 1e-5, "result should be equal");
740
1
  REQUIRE_EQ_WITH_TOLERANCE(hd1->data.f32[0], (-1.3 * 3 + 0.5 * 5 + 0.2) * 1.2 * 0.8, 1e-5, "result should be equal");
741
1
  ccv_nnc_graph_static_schedule_free(schedule01);
742
1
  ccv_nnc_stream_context_free(stream0);
743
1
  ccv_nnc_stream_signal_free(signal0);
744
1
  ccv_nnc_stream_context_free(stream1);
745
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
746
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
747
1
  ccv_nnc_tensor_arena_free(tensor_arena);
748
1
  ccv_nnc_graph_free(graph);
749
1
  ccv_nnc_tensor_free(ha0);
750
1
  ccv_nnc_tensor_free(ha1);
751
1
  ccv_nnc_tensor_free(hw0);
752
1
  ccv_nnc_tensor_free(hw1);
753
1
  ccv_nnc_tensor_free(hbias0);
754
1
  ccv_nnc_tensor_free(hbias1);
755
1
  ccv_nnc_tensor_free(hb0);
756
1
  ccv_nnc_tensor_free(hd1);
757
1
}
758
759
#include "case_main.h"