Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/cudnn.tests.c
Line | Count | Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
46.2M
#define INPUT_DIM (3)
15
231M
#define OUTPUT_DIM (96)
16
17
91.5M
#define INPUT_SIZE (224)
18
308M
#define OUTPUT_SIZE (112)
19
20
1.07M
#define KERNEL_SIZE (7)
21
22
#define BATCH_SIZE (16)
23
24
TEST_CASE("cudnn forward convolution")
25
1
{
26
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
27
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
28
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
29
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
30
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
31
1
  assert(cmd.backend >= 0);
32
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
33
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
34
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
35
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
36
  // configure the inlets.
37
1
  dsfmt_t dsfmt;
38
1
  dsfmt_init_gen_rand(&dsfmt, 0);
39
1
  int i;
40
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
41
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
42
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
43
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
44
97
  for (i = 0; i < OUTPUT_DIM; i++)
45
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
46
  // Copy generated matrix values over to GPU.
47
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
48
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
49
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
50
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
51
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
52
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
53
1
  assert(move.backend >= 0);
54
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
55
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
56
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
57
58
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
59
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
60
1
  assert(transform.backend >= 0);
61
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
62
1
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
63
1
  ccv_nnc_stream_context_wait(stream_context);
64
1
  ccv_nnc_tensor_free(gw);
65
66
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
67
1
  assert(cmd.backend >= 0);
68
1
  cmd.algorithm = -1;
69
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
70
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
71
1
  ccv_nnc_stream_context_wait(stream_context);
72
1
  ccv_nnc_stream_context_free(stream_context);
73
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
74
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
75
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
76
1
  ccv_nnc_tensor_free(c);
77
1
  ccv_nnc_tensor_free(gc);
78
1
  ccv_nnc_tensor_free(bias);
79
1
  ccv_nnc_tensor_free(w);
80
1
  ccv_nnc_tensor_free(b);
81
1
  ccv_nnc_tensor_free(a);
82
1
  ccv_nnc_tensor_free(gbias);
83
1
  ccv_nnc_tensor_free(gwo);
84
1
  ccv_nnc_tensor_free(ga);
85
1
}
86
87
TEST_CASE("cudnn forward convolution in nchw format")
88
1
{
89
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
90
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
91
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
92
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
93
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
94
1
  assert(cmd.backend >= 0);
95
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
96
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
97
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
98
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM), 0);
99
  // configure the inlets.
100
1
  dsfmt_t dsfmt;
101
1
  dsfmt_init_gen_rand(&dsfmt, 0);
102
1
  int i;
103
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
104
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
105
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
106
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
107
97
  for (i = 0; i < OUTPUT_DIM; i++)
108
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
109
  // Copy generated matrix values over to GPU.
110
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
111
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
112
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
113
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
114
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
115
1
  assert(move.backend >= 0);
116
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
117
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
118
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
119
120
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
121
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
122
1
  assert(transform.backend >= 0);
123
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
124
1
  assert(cmd.backend >= 0);
125
1
  cmd.algorithm = -1;
126
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
127
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
128
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
129
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
130
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from cudnn should match from CPU");
131
1
  ccv_nnc_tensor_free(c);
132
1
  ccv_nnc_tensor_free(gc);
133
1
  ccv_nnc_tensor_free(bias);
134
1
  ccv_nnc_tensor_free(w);
135
1
  ccv_nnc_tensor_free(b);
136
1
  ccv_nnc_tensor_free(a);
137
1
  ccv_nnc_tensor_free(gbias);
138
1
  ccv_nnc_tensor_free(gw);
139
1
  ccv_nnc_tensor_free(ga);
140
1
}
141
142
TEST_CASE("cudnn forward convolution in half precision")
143
1
{
144
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
145
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
146
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
147
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
148
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
149
1
  assert(cmd.backend >= 0);
150
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
151
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
152
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
153
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
154
  // configure the inlets.
155
1
  dsfmt_t dsfmt;
156
1
  dsfmt_init_gen_rand(&dsfmt, 0);
157
1
  int i;
158
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
159
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
160
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
161
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
162
97
  for (i = 0; i < OUTPUT_DIM; i++)
163
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
164
1
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
165
1
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
166
1
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
167
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
168
  // Copy generated matrix values over to GPU.
169
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
170
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
171
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
172
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
173
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
174
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
175
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
176
177
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
178
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
179
1
  assert(transform.backend >= 0);
180
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
181
1
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
182
1
  ccv_nnc_stream_context_wait(stream_context);
183
1
  ccv_nnc_tensor_free(gw);
184
185
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
186
1
  assert(cmd.backend >= 0);
187
1
  cmd.algorithm = -1;
188
1
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
189
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
190
1
  ccv_nnc_stream_context_wait(stream_context);
191
1
  ccv_nnc_stream_context_free(stream_context);
192
1
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
193
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
194
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
195
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
196
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from cudnn should match from CPU");
197
1
  ccv_nnc_tensor_free(c);
198
1
  ccv_nnc_tensor_free(gc);
199
1
  ccv_nnc_tensor_free(bias);
200
1
  ccv_nnc_tensor_free(w);
201
1
  ccv_nnc_tensor_free(b);
202
1
  ccv_nnc_tensor_free(a);
203
1
  ccv_nnc_tensor_free(c1);
204
1
  ccv_nnc_tensor_free(bias1);
205
1
  ccv_nnc_tensor_free(w1);
206
1
  ccv_nnc_tensor_free(a1);
207
1
  ccv_nnc_tensor_free(gbias);
208
1
  ccv_nnc_tensor_free(gwo);
209
1
  ccv_nnc_tensor_free(ga);
210
1
}
211
212
TEST_CASE("cudnn forward convolution in half precision with palettize weights")
213
1
{
214
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
215
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
216
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
217
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
218
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
219
1
  assert(cmd.backend >= 0);
220
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
221
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
222
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
223
1
  ccv_nnc_tensor_t* wo = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
224
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
225
  // configure the inlets.
226
1
  dsfmt_t dsfmt;
227
1
  dsfmt_init_gen_rand(&dsfmt, 0);
228
1
  int i;
229
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
230
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
231
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
232
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
233
97
  for (i = 0; i < OUTPUT_DIM; i++)
234
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
235
1
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wo), 0);
236
1
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
237
1
  ccv_nnc_tensor_t* w1o = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
238
1
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
239
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wo, bias), TENSOR_LIST(a1, w1o, bias1), 0);
240
1
  ccv_nnc_tensor_t* pw1o = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NCHW(16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 6, 1280), 0);
241
1
  (void)ccv_nnc_palettize(w1o->data.u8, CCV_16F, CCV_TENSOR_CPU_MEMORY, ccv_nnc_tensor_count(w1o->info), 6, 1280, pw1o->data.u8, ccv_nnc_tensor_data_size_without_padding(pw1o->info));
242
  // Copy generated matrix values over to GPU.
243
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
244
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 6, 1280), 0);
245
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
246
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, pw1o, bias1), TENSOR_LIST(ga, gwo, gbias), 0);
247
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
248
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
249
250
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
251
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
252
1
  assert(cmd.backend >= 0);
253
1
  cmd.algorithm = -1;
254
1
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
255
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
256
1
  ccv_nnc_stream_context_wait(stream_context);
257
1
  ccv_nnc_stream_context_free(stream_context);
258
1
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
259
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
260
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
261
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
262
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from cudnn should match from CPU");
263
1
  ccv_nnc_tensor_free(c);
264
1
  ccv_nnc_tensor_free(gc);
265
1
  ccv_nnc_tensor_free(bias);
266
1
  ccv_nnc_tensor_free(w);
267
1
  ccv_nnc_tensor_free(wo);
268
1
  ccv_nnc_tensor_free(b);
269
1
  ccv_nnc_tensor_free(a);
270
1
  ccv_nnc_tensor_free(c1);
271
1
  ccv_nnc_tensor_free(bias1);
272
1
  ccv_nnc_tensor_free(w1o);
273
1
  ccv_nnc_tensor_free(pw1o);
274
1
  ccv_nnc_tensor_free(a1);
275
1
  ccv_nnc_tensor_free(gbias);
276
1
  ccv_nnc_tensor_free(gwo);
277
1
  ccv_nnc_tensor_free(ga);
278
1
}
279
280
TEST_CASE("cudnn forward convolution with dilation 2, 3")
281
1
{
282
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
283
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
284
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
285
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
286
1
  cmd.info.convolution.dilation[0] = 2;
287
1
  cmd.info.convolution.dilation[1] = 3;
288
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
289
1
  assert(cmd.backend >= 0);
290
1
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
291
1
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
292
1
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
293
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, b->info);
294
1
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, b->info) == 0);
295
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
296
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
297
  // configure the inlets.
298
1
  dsfmt_t dsfmt;
299
1
  dsfmt_init_gen_rand(&dsfmt, 0);
300
1
  int i;
301
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
302
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
303
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
304
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
305
97
  for (i = 0; i < OUTPUT_DIM; i++)
306
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
307
  // Copy generated matrix values over to GPU.
308
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
309
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
310
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
311
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
312
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
313
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
314
1
  assert(move.backend >= 0);
315
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
316
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
317
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
318
319
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
320
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
321
1
  assert(transform.backend >= 0);
322
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
323
1
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
324
1
  ccv_nnc_stream_context_wait(stream_context);
325
1
  ccv_nnc_tensor_free(gw);
326
327
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
328
1
  assert(cmd.backend >= 0);
329
1
  cmd.algorithm = -1;
330
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
331
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
332
1
  ccv_nnc_stream_context_wait(stream_context);
333
1
  ccv_nnc_stream_context_free(stream_context);
334
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
335
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
336
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
337
1
  ccv_nnc_tensor_free(c);
338
1
  ccv_nnc_tensor_free(gc);
339
1
  ccv_nnc_tensor_free(bias);
340
1
  ccv_nnc_tensor_free(w);
341
1
  ccv_nnc_tensor_free(b);
342
1
  ccv_nnc_tensor_free(a);
343
1
  ccv_nnc_tensor_free(gbias);
344
1
  ccv_nnc_tensor_free(gwo);
345
1
  ccv_nnc_tensor_free(ga);
346
1
}
347
348
TEST_CASE("cudnn forward convolution 3d")
349
1
{
350
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
351
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
352
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
353
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
354
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
355
1
  hint.stride.dim[0] = 2;
356
1
  hint.border.begin[0] = 1;
357
1
  hint.border.end[0] = 1;
358
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
359
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
360
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
361
  // configure the inlets.
362
1
  dsfmt_t dsfmt;
363
1
  dsfmt_init_gen_rand(&dsfmt, 0);
364
1
  int i;
365
42.3k
  for (i = 0; i < INPUT_DIM * 3 * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
366
42.3k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
367
12.0M
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
368
12.0M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
369
97
  for (i = 0; i < OUTPUT_DIM; i++)
370
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
371
  // Copy generated matrix values over to GPU.
372
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
373
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
374
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
375
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
376
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
377
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
378
1
  assert(move.backend >= 0);
379
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
380
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
381
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
382
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
383
1
  assert(transform.backend >= 0);
384
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
385
1
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
386
1
  ccv_nnc_stream_context_wait(stream_context);
387
1
  ccv_nnc_tensor_free(gw);
388
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
389
1
  assert(cmd.backend >= 0);
390
1
  cmd.algorithm = -1;
391
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
392
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
393
1
  ccv_nnc_stream_context_wait(stream_context);
394
1
  ccv_nnc_stream_context_free(stream_context);
395
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
396
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
397
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
398
1
  assert(cmd.backend >= 0);
399
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
400
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
401
1
  ccv_nnc_tensor_free(c);
402
1
  ccv_nnc_tensor_free(gc);
403
1
  ccv_nnc_tensor_free(bias);
404
1
  ccv_nnc_tensor_free(w);
405
1
  ccv_nnc_tensor_free(b);
406
1
  ccv_nnc_tensor_free(a);
407
1
  ccv_nnc_tensor_free(gbias);
408
1
  ccv_nnc_tensor_free(gwo);
409
1
  ccv_nnc_tensor_free(ga);
410
1
}
411
412
TEST_CASE("cudnn forward convolution 3d in nchw format")
413
1
{
414
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
415
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
416
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
417
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
418
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
419
1
  hint.stride.dim[0] = 2;
420
1
  hint.border.begin[0] = 1;
421
1
  hint.border.end[0] = 1;
422
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
423
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
424
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM), 0);
425
  // configure the inlets.
426
1
  dsfmt_t dsfmt;
427
1
  dsfmt_init_gen_rand(&dsfmt, 0);
428
1
  int i;
429
42.3k
  for (i = 0; i < 3 * INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
430
42.3k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
431
12.0M
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
432
12.0M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
433
97
  for (i = 0; i < OUTPUT_DIM; i++)
434
96
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
435
  // Copy generated matrix values over to GPU.
436
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
437
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
438
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
439
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
440
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
441
1
  assert(move.backend >= 0);
442
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
443
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
444
445
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
446
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
447
1
  assert(transform.backend >= 0);
448
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
449
1
  assert(cmd.backend >= 0);
450
1
  cmd.algorithm = -1;
451
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
452
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
453
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
454
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
455
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
456
1
  assert(cmd.backend >= 0);
457
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
458
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
459
1
  ccv_nnc_tensor_free(c);
460
1
  ccv_nnc_tensor_free(gc);
461
1
  ccv_nnc_tensor_free(bias);
462
1
  ccv_nnc_tensor_free(w);
463
1
  ccv_nnc_tensor_free(b);
464
1
  ccv_nnc_tensor_free(a);
465
1
  ccv_nnc_tensor_free(gbias);
466
1
  ccv_nnc_tensor_free(gw);
467
1
  ccv_nnc_tensor_free(ga);
468
1
}
469
470
TEST_CASE("cudnn backward convolution")
471
1
{
472
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
473
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
474
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
475
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
476
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
477
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
478
1
  assert(cmd.backend >= 0);
479
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
480
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
481
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
482
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
483
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
484
  // configure the inlets.
485
1
  dsfmt_t dsfmt;
486
1
  dsfmt_init_gen_rand(&dsfmt, 0);
487
1
  int i;
488
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
489
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
490
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
491
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
492
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
493
19.2M
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
494
  // Copy generated matrix values over to GPU.
495
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
496
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
497
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
498
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
499
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
500
1
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
501
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
502
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
503
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
504
1
  assert(move.backend >= 0);
505
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
506
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
507
508
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
509
1
  assert(cmd.backend >= 0);
510
1
  cmd.algorithm = -1;
511
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
512
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(gg, ga, gw), TENSOR_LIST(gh, gdw, gdbias), stream_context);
513
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(gg, ga, gw), TENSOR_LIST(gh, gdw, gdbias), stream_context));
514
1
  ccv_nnc_stream_context_wait(stream_context);
515
1
  ccv_nnc_stream_context_free(stream_context);
516
1
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
517
1
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
518
1
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
519
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
520
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from cudnn should match from CPU");
521
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from cudnn should match from CPU");
522
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
523
1
  ccv_nnc_tensor_free(h);
524
1
  ccv_nnc_tensor_free(gh);
525
1
  ccv_nnc_tensor_free(w);
526
1
  ccv_nnc_tensor_free(g);
527
1
  ccv_nnc_tensor_free(a);
528
1
  ccv_nnc_tensor_free(gbias);
529
1
  ccv_nnc_tensor_free(gdbias);
530
1
  ccv_nnc_tensor_free(gdw);
531
1
  ccv_nnc_tensor_free(gw);
532
1
  ccv_nnc_tensor_free(gg);
533
1
  ccv_nnc_tensor_free(ga);
534
1
  ccv_nnc_tensor_free(ch);
535
1
  ccv_nnc_tensor_free(cdw);
536
1
  ccv_nnc_tensor_free(cdbias);
537
1
}
538
539
TEST_CASE("cudnn backward convolution in nchw format")
540
1
{
541
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
542
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
543
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
544
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
545
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
546
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
547
1
  assert(cmd.backend >= 0);
548
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
549
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
550
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
551
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
552
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
553
  // configure the inlets.
554
1
  dsfmt_t dsfmt;
555
1
  dsfmt_init_gen_rand(&dsfmt, 0);
556
1
  int i;
557
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
558
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
559
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
560
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
561
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
562
19.2M
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
563
  // Copy generated matrix values over to GPU.
564
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
565
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
566
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
567
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
568
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
569
1
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
570
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
571
1
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
572
1
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
573
1
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
574
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
575
1
  ccv_nnc_tensor_t* gbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
576
1
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
577
1
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
578
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
579
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
580
1
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
581
582
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
583
1
  assert(cmd.backend >= 0);
584
1
  cmd.algorithm = -1;
585
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
586
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
587
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context));
588
1
  ccv_nnc_stream_context_wait(stream_context);
589
1
  ccv_nnc_stream_context_free(stream_context);
590
1
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
591
1
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
592
1
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
593
1
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
594
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
595
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from cudnn should match from CPU");
596
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from cudnn should match from CPU");
597
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
598
1
  ccv_nnc_tensor_free(gao);
599
1
  ccv_nnc_tensor_free(ggo);
600
1
  ccv_nnc_tensor_free(gho);
601
1
  ccv_nnc_tensor_free(gwo);
602
1
  ccv_nnc_tensor_free(gbiaso);
603
1
  ccv_nnc_tensor_free(gdwo);
604
1
  ccv_nnc_tensor_free(gdbiaso);
605
1
  ccv_nnc_tensor_free(h);
606
1
  ccv_nnc_tensor_free(gh);
607
1
  ccv_nnc_tensor_free(w);
608
1
  ccv_nnc_tensor_free(g);
609
1
  ccv_nnc_tensor_free(a);
610
1
  ccv_nnc_tensor_free(gbias);
611
1
  ccv_nnc_tensor_free(gdbias);
612
1
  ccv_nnc_tensor_free(gdw);
613
1
  ccv_nnc_tensor_free(gw);
614
1
  ccv_nnc_tensor_free(gg);
615
1
  ccv_nnc_tensor_free(ga);
616
1
  ccv_nnc_tensor_free(ch);
617
1
  ccv_nnc_tensor_free(cdw);
618
1
  ccv_nnc_tensor_free(cdbias);
619
1
}
620
621
TEST_CASE("cudnn backward convolution in half precision")
622
1
{
623
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
624
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
625
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
626
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
627
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
628
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
629
1
  assert(cmd.backend >= 0);
630
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
631
1
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
632
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
633
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
634
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
635
  // configure the inlets.
636
1
  dsfmt_t dsfmt;
637
1
  dsfmt_init_gen_rand(&dsfmt, 0);
638
1
  int i;
639
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
640
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
641
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
642
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
643
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
644
19.2M
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
645
  // Copy generated matrix values over to GPU.
646
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
647
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
648
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
649
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
650
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
651
1
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
652
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
653
1
  ccv_nnc_tensor_t* a16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
654
1
  ccv_nnc_tensor_t* g16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
655
1
  ccv_nnc_tensor_t* w16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
656
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(a16, w16, g16), 0);
657
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a16, w16, g16), TENSOR_LIST(ga, gw, gg), 0);
658
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
659
660
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
661
1
  assert(cmd.backend >= 0);
662
1
  cmd.algorithm = -1;
663
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
664
1
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(gg, ga, gw), TENSOR_LIST(gh, gdw, gdbias), stream_context);
665
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(gg, ga, gw), TENSOR_LIST(gh, gdw, gdbias), stream_context));
666
1
  ccv_nnc_stream_context_wait(stream_context);
667
1
  ccv_nnc_stream_context_free(stream_context);
668
1
  ccv_nnc_tensor_t* ch16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
669
1
  ccv_nnc_tensor_t* cdw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
670
1
  ccv_nnc_tensor_t* cdbias16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
671
1
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
672
1
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
673
1
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
674
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch16, cdw16, cdbias16), 0);
675
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ch16, cdw16, cdbias16), TENSOR_LIST(ch, cdw, cdbias), 0);
676
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 12, "output from cudnn should match from CPU");
677
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5, "output from cudnn should match from CPU");
678
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
679
1
  ccv_nnc_tensor_free(h);
680
1
  ccv_nnc_tensor_free(gh);
681
1
  ccv_nnc_tensor_free(w);
682
1
  ccv_nnc_tensor_free(g);
683
1
  ccv_nnc_tensor_free(a);
684
1
  ccv_nnc_tensor_free(w16);
685
1
  ccv_nnc_tensor_free(g16);
686
1
  ccv_nnc_tensor_free(a16);
687
1
  ccv_nnc_tensor_free(gbias);
688
1
  ccv_nnc_tensor_free(gdbias);
689
1
  ccv_nnc_tensor_free(gdw);
690
1
  ccv_nnc_tensor_free(gw);
691
1
  ccv_nnc_tensor_free(gg);
692
1
  ccv_nnc_tensor_free(ga);
693
1
  ccv_nnc_tensor_free(ch);
694
1
  ccv_nnc_tensor_free(cdw);
695
1
  ccv_nnc_tensor_free(cdbias);
696
1
  ccv_nnc_tensor_free(ch16);
697
1
  ccv_nnc_tensor_free(cdw16);
698
1
  ccv_nnc_tensor_free(cdbias16);
699
1
}
700
701
TEST_CASE("cudnn backward convolution with dilation 2, 3")
702
1
{
703
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
704
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
705
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
706
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
707
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
708
1
  cmd.info.convolution.dilation[0] = 2;
709
1
  cmd.info.convolution.dilation[1] = 3;
710
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
711
1
  assert(cmd.backend >= 0);
712
1
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
713
1
  modified_cmd.size.dim[0] = (modified_cmd.size.dim[0] - 1) * cmd.info.convolution.dilation[0] + 1;
714
1
  modified_cmd.size.dim[1] = (modified_cmd.size.dim[1] - 1) * cmd.info.convolution.dilation[1] + 1;
715
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, g->info);
716
1
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, g->info) == 0);
717
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
718
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
719
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
720
  // configure the inlets.
721
1
  dsfmt_t dsfmt;
722
1
  dsfmt_init_gen_rand(&dsfmt, 0);
723
1
  int i;
724
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
725
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
726
2.40M
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
727
2.40M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
728
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
729
19.2M
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
730
  // Copy generated matrix values over to GPU.
731
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
732
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
733
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
734
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
735
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
736
1
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
737
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
738
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
739
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
740
1
  assert(move.backend >= 0);
741
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
742
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
743
744
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
745
1
  assert(cmd.backend >= 0);
746
1
  cmd.algorithm = -1;
747
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
748
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(gg, ga, gw), TENSOR_LIST(gh, gdw, gdbias), stream_context);
749
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(gg, ga, gw), TENSOR_LIST(gh, gdw, gdbias), stream_context));
750
1
  ccv_nnc_stream_context_wait(stream_context);
751
1
  ccv_nnc_stream_context_free(stream_context);
752
1
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
753
1
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
754
1
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
755
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
756
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from cudnn should match from CPU");
757
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from cudnn should match from CPU");
758
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from cudnn should match from CPU");
759
1
  ccv_nnc_tensor_free(h);
760
1
  ccv_nnc_tensor_free(gh);
761
1
  ccv_nnc_tensor_free(w);
762
1
  ccv_nnc_tensor_free(g);
763
1
  ccv_nnc_tensor_free(a);
764
1
  ccv_nnc_tensor_free(gbias);
765
1
  ccv_nnc_tensor_free(gdbias);
766
1
  ccv_nnc_tensor_free(gdw);
767
1
  ccv_nnc_tensor_free(gw);
768
1
  ccv_nnc_tensor_free(gg);
769
1
  ccv_nnc_tensor_free(ga);
770
1
  ccv_nnc_tensor_free(ch);
771
1
  ccv_nnc_tensor_free(cdw);
772
1
  ccv_nnc_tensor_free(cdbias);
773
1
}
774
775
TEST_CASE("compare batch norm with cudnn")
776
1
{
777
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
778
1
    ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
779
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
780
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
781
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
782
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
783
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
784
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
785
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "scale");
786
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "bias");
787
1
  ccv_nnc_tensor_symbol_t bmean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
788
1
  ccv_nnc_tensor_symbol_t bvar = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
789
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
790
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
791
1
  ccv_nnc_tensor_symbol_t bmean_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
792
1
  ccv_nnc_tensor_symbol_t bvar_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
793
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_mean");
794
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_inv_std");
795
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
796
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(scale), "set_scale");
797
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(bias), "set_bias");
798
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(bx, scale, bias, bmean, bvar), TENSOR_SYMBOL_LIST(by, bmean_out, bvar_out, saved_mean, saved_inv_std), "batch_norm");
799
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
800
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
801
1
  ccv_nnc_graph_t* graph = 0;
802
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
803
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
804
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
805
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
806
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
807
1
  dsfmt_t dsfmt;
808
1
  float xdata[2 * 2 * 2 * 10];
809
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
810
1
  int i;
811
1
  dsfmt_init_gen_rand(&dsfmt, 1);
812
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
813
80
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
814
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
815
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
816
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
817
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
818
1
  ccv_nnc_graph_free(graph);
819
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
820
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
821
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
822
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "scale");
823
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "bias");
824
1
  ccv_nnc_tensor_symbol_t cmean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
825
1
  ccv_nnc_tensor_symbol_t cvar = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
826
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
827
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
828
1
  ccv_nnc_tensor_symbol_t cmean_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
829
1
  ccv_nnc_tensor_symbol_t cvar_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
830
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_mean");
831
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_inv_std");
832
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(cscale), "set_scale");
833
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(cbias), "set_bias");
834
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(cx, cscale, cbias, cmean, cvar), TENSOR_SYMBOL_LIST(cy, cmean_out, cvar_out, csaved_mean, csaved_inv_std), "batch_norm");
835
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
836
1
  ccv_nnc_graph_t* cpu_graph = 0;
837
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
838
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
839
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
840
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
841
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
842
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
843
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
844
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-5, "batch norm result from cudnn should match the one from reference implementation");
845
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
846
1
  ccv_nnc_tensor_arena_free(tensor_arena);
847
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
848
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
849
1
  ccv_nnc_graph_free(cpu_graph);
850
1
}
851
852
TEST_CASE("compare batch norm with cudnn in half precision")
853
1
{
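  // Same comparison as the previous test, except x is converted to half precision before the
  // GPU transfer and y is converted back to 32-bit floats afterwards, hence the looser 1e-3
  // tolerance on the final comparison.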
854
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
855
1
    ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
856
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
857
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
858
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
859
1
  ccv_nnc_tensor_symbol_t x16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 2, 2, 2, 10), "x in half precision");
860
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 2, 2, 2, 10), "x");
861
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 2, 2, 2, 10), "y");
862
1
  ccv_nnc_tensor_symbol_t y16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 2, 2, 2, 10), "y in half precision");
863
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
864
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "scale");
865
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "bias");
866
1
  ccv_nnc_tensor_symbol_t bmean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
867
1
  ccv_nnc_tensor_symbol_t bvar = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
868
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
869
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
870
1
  ccv_nnc_tensor_symbol_t bmean_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
871
1
  ccv_nnc_tensor_symbol_t bvar_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
872
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_mean");
873
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_inv_std");
874
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(x16), "convert x");
875
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x16), TENSOR_SYMBOL_LIST(bx), "transfer x");
876
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(scale), "set_scale");
877
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(bias), "set_bias");
878
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(bx, scale, bias, bmean, bvar), TENSOR_SYMBOL_LIST(by, bmean_out, bvar_out, saved_mean, saved_inv_std), "batch_norm");
879
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y16), "transfer y");
880
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(y16), TENSOR_SYMBOL_LIST(y), "convert y");
881
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
882
1
  ccv_nnc_graph_t* graph = 0;
883
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
884
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
885
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
886
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
887
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
888
1
  dsfmt_t dsfmt;
889
1
  float xdata[2 * 2 * 2 * 10];
890
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
891
1
  int i;
892
1
  dsfmt_init_gen_rand(&dsfmt, 1);
893
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
894
80
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
895
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
896
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
897
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
898
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
899
1
  ccv_nnc_graph_free(graph);
900
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
901
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
902
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
903
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "scale");
904
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "bias");
905
1
  ccv_nnc_tensor_symbol_t cmean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
906
1
  ccv_nnc_tensor_symbol_t cvar = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
907
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
908
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
909
1
  ccv_nnc_tensor_symbol_t cmean_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
910
1
  ccv_nnc_tensor_symbol_t cvar_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
911
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_mean");
912
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_inv_std");
913
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(cscale), "set_scale");
914
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(cbias), "set_bias");
915
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(cx, cscale, cbias, cmean, cvar), TENSOR_SYMBOL_LIST(cy, cmean_out, cvar_out, csaved_mean, csaved_inv_std), "batch_norm");
916
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
917
1
  ccv_nnc_graph_t* cpu_graph = 0;
918
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
919
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
920
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
921
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
922
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
923
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
924
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
925
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-3, "batch norm result from cudnn should match the one from reference implementation");
926
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
927
1
  ccv_nnc_tensor_arena_free(tensor_arena);
928
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
929
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
930
1
  ccv_nnc_graph_free(cpu_graph);
931
1
}
932
933
TEST_CASE("compare batch norm gradient with cudnn")
934
1
{
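  // The forward batch norm is declared symbolically, then ccv_nnc_symbolic_graph_backward
  // derives gradients with respect to x, scale and bias; dx computed by the cudnn backward
  // pass is compared against the CPU reference implementation at the end of the test.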
935
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
936
1
    ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
937
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
938
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
939
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
940
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
941
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "scale");
942
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "bias");
943
1
  ccv_nnc_tensor_symbol_t bmean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
944
1
  ccv_nnc_tensor_symbol_t bvar = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
945
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
946
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
947
1
  ccv_nnc_tensor_symbol_t bmean_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
948
1
  ccv_nnc_tensor_symbol_t bvar_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
949
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_mean");
950
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_inv_std");
951
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(scale), "set_scale");
952
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(bias), "set_bias");
953
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(bx, scale, bias, bmean, bvar), TENSOR_SYMBOL_LIST(by, bmean_out, bvar_out, saved_mean, saved_inv_std), "batch_norm");
954
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
955
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
956
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
957
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
958
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
959
1
  ccv_nnc_graph_t* graph = 0;
960
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
961
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
962
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
963
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
964
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
965
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
966
1
  dsfmt_t dsfmt;
967
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
968
1
  int i;
969
1
  dsfmt_init_gen_rand(&dsfmt, 1);
970
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
971
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
972
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
973
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
974
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
975
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
976
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
977
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
978
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
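  // Run the whole graph a second time now that dby holds the seeded gradient, so the backward
  // pass produces the dx that is copied out and compared below.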
979
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
980
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
981
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
982
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
983
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
984
1
  ccv_nnc_tensor_arena_free(tensor_arena);
985
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
986
1
  ccv_nnc_graph_free(graph);
987
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
988
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
989
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
990
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "scale");
991
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "bias");
992
1
  ccv_nnc_tensor_symbol_t cmean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
993
1
  ccv_nnc_tensor_symbol_t cvar = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
994
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
995
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
996
1
  ccv_nnc_tensor_symbol_t cmean_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
997
1
  ccv_nnc_tensor_symbol_t cvar_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
998
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_mean");
999
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_inv_std");
1000
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(cscale), "set_scale");
1001
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(cbias), "set_bias");
1002
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(cx, cscale, cbias, cmean, cvar), TENSOR_SYMBOL_LIST(cy, cmean_out, cvar_out, csaved_mean, csaved_inv_std), "batch_norm");
1003
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1004
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1005
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1006
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1007
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1008
1
  ccv_nnc_graph_t* cpu_graph = 0;
1009
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1010
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1011
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1012
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1013
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1014
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1015
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1016
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1017
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1018
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "batch norm gradient result from cudnn should match the one from reference implementation");
1019
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1020
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1021
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1022
1
  ccv_nnc_graph_free(cpu_graph);
1023
1
  ccv_nnc_tensor_free(x_tensor);
1024
1
  ccv_nnc_tensor_free(dy_tensor);
1025
1
  ccv_nnc_tensor_free(dx_tensor);
1026
1
}
1027
1028
TEST_CASE("compare batch norm gradient with cudnn in half precision")
1029
1
{
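  // Half-precision variant of the gradient comparison: x and dy are converted to 16-bit floats
  // on the host before the transfer, and dx is converted back to 32-bit floats for the check.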
1030
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1031
1
    ccv_nnc_cmd_ok(CCV_NNC_BATCH_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1032
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1033
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1034
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 2, 2, 2, 10), "x");
1035
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 2, 2, 2, 10), "y");
1036
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "scale");
1037
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "bias");
1038
1
  ccv_nnc_tensor_symbol_t bmean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
1039
1
  ccv_nnc_tensor_symbol_t bvar = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
1040
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
1041
1
  ccv_nnc_tensor_symbol_set_flags(symbolic_graph, bvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
1042
1
  ccv_nnc_tensor_symbol_t bmean_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "mean");
1043
1
  ccv_nnc_tensor_symbol_t bvar_out = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "var");
1044
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_mean");
1045
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10), "saved_inv_std");
1046
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(scale), "set_scale");
1047
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(bias), "set_bias");
1048
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(bx, scale, bias, bmean, bvar), TENSOR_SYMBOL_LIST(by, bmean_out, bvar_out, saved_mean, saved_inv_std), "batch_norm");
1049
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1050
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1051
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1052
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1053
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1054
1
  ccv_nnc_graph_t* graph = 0;
1055
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1056
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1057
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1058
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1059
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1060
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1061
1
  dsfmt_t dsfmt;
1062
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1063
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 2, 2, 10), 0);
1064
1
  int i;
1065
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1066
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1067
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1068
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
1069
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(bx_tensor), 0);
1070
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1071
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1072
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 2, 2, 10), 0);
1073
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1074
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1075
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1076
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
1077
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dby_tensor), 0);
1078
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1079
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1080
1
  ccv_nnc_tensor_t* const dx16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 2, 2, 10), 0);
1081
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1082
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx16_tensor), 0);
1083
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dx16_tensor), TENSOR_LIST(dx_tensor), 0);
1084
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1085
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1086
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1087
1
  ccv_nnc_graph_free(graph);
1088
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1089
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1090
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1091
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "scale");
1092
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "bias");
1093
1
  ccv_nnc_tensor_symbol_t cmean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
1094
1
  ccv_nnc_tensor_symbol_t cvar = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
1095
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cmean, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
1096
1
  ccv_nnc_tensor_symbol_set_flags(cpu_symbolic_graph, cvar, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS);
1097
1
  ccv_nnc_tensor_symbol_t cmean_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "mean");
1098
1
  ccv_nnc_tensor_symbol_t cvar_out = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "var");
1099
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_mean");
1100
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 10), "saved_inv_std");
1101
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(1), 0, 0, TENSOR_SYMBOL_LIST(cscale), "set_scale");
1102
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SET_FORWARD(0), 0, 0, TENSOR_SYMBOL_LIST(cbias), "set_bias");
1103
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), TENSOR_SYMBOL_LIST(cx, cscale, cbias, cmean, cvar), TENSOR_SYMBOL_LIST(cy, cmean_out, cvar_out, csaved_mean, csaved_inv_std), "batch_norm");
1104
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1105
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1106
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1107
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1108
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1109
1
  ccv_nnc_graph_t* cpu_graph = 0;
1110
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1111
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1112
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1113
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1114
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1115
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1116
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1117
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1118
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1119
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * 2 * 2 * 10, 2e-3, "batch norm gradient result from cudnn should match the one from reference implementation");
1120
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1121
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1122
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1123
1
  ccv_nnc_graph_free(cpu_graph);
1124
1
  ccv_nnc_tensor_free(x_tensor);
1125
1
  ccv_nnc_tensor_free(x16_tensor);
1126
1
  ccv_nnc_tensor_free(dy_tensor);
1127
1
  ccv_nnc_tensor_free(dy16_tensor);
1128
1
  ccv_nnc_tensor_free(dx_tensor);
1129
1
  ccv_nnc_tensor_free(dx16_tensor);
1130
1
}
1131
1132
TEST_CASE("compare layer norm with cudnn")
1133
1
{
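  // Layer norm with per-element scale and bias of shape (1, 2, 2, 10); the saved mean and
  // inv_std have shape (2, 1, 1, 1), i.e. one statistic per batch element. The cudnn-backed
  // GPU result is compared against the CPU reference backend.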
1134
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1135
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1136
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1137
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1138
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
1139
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1140
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1141
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
1142
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
1143
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "bias");
1144
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
1145
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1146
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1147
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
1148
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1149
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1150
1
  ccv_nnc_graph_t* graph = 0;
1151
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1152
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1153
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1154
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1155
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1156
1
  dsfmt_t dsfmt;
1157
1
  float xdata[2 * 2 * 2 * 10];
1158
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1159
1
  int i;
1160
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1161
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1162
80
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1163
1
  float scaledata[1 * 2 * 2 * 10];
1164
1
  float biasdata[1 * 2 * 2 * 10];
1165
41
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
1166
40
  {
1167
40
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1168
40
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1169
40
  }
1170
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1171
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1172
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1173
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1174
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1175
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1176
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1177
1
  ccv_nnc_graph_free(graph);
1178
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1179
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1180
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1181
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
1182
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "bias");
1183
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1184
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1185
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1186
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1187
1
  ccv_nnc_graph_t* cpu_graph = 0;
1188
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1189
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1190
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1191
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1192
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
1193
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1194
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
1195
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1196
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * 10);
1197
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1198
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1199
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-5, "layer norm result from cudnn should match the one from reference implementation");
1200
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1201
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1202
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1203
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1204
1
  ccv_nnc_graph_free(cpu_graph);
1205
1
}
1206
1207
TEST_CASE("compare layer norm gradient with cudnn")
1208
1
{
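  // Gradients with respect to x, scale and bias are all requested from the backward graph,
  // copied back to the host and checked against the CPU reference results.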
1209
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1210
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1211
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1212
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1213
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1214
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1215
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
1216
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "bias");
1217
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
1218
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1219
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
1220
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1221
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1222
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1223
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1224
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1225
1
  ccv_nnc_graph_t* graph = 0;
1226
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1227
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1228
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1229
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1230
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1231
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1232
1
  dsfmt_t dsfmt;
1233
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1234
1
  int i;
1235
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1236
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1237
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1238
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1239
1
  float scaledata[1 * 2 * 2 * 10];
1240
1
  float biasdata[1 * 2 * 2 * 10];
1241
41
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
1242
40
  {
1243
40
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1244
40
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1245
40
  }
1246
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1247
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1248
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1249
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1250
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1251
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1252
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1253
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1254
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
1255
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1256
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1257
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1258
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
1259
1
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
1260
1
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
1261
1
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1262
1
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1263
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
1264
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1265
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1266
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1267
1
  ccv_nnc_graph_free(graph);
1268
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1269
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1270
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1271
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
1272
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "bias");
1273
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1274
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1275
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1276
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1277
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1278
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1279
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1280
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1281
1
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
1282
1
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
1283
1
  ccv_nnc_graph_t* cpu_graph = 0;
1284
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1285
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1286
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1287
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1288
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1289
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1290
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1291
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1292
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
1293
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1294
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * 10);
1295
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1296
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1297
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from cudnn should match the one from reference implementation");
1298
1
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
1299
1
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from cudnn should match the one from reference implementation");
1300
1
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
1301
1
  REQUIRE_TENSOR_EQ(dbias_tensor, dcbias_tensor, "layer norm bias gradient result from cudnn should match the one from reference implementation");
1302
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1303
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1304
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1305
1
  ccv_nnc_graph_free(cpu_graph);
1306
1
  ccv_nnc_tensor_free(x_tensor);
1307
1
  ccv_nnc_tensor_free(dy_tensor);
1308
1
  ccv_nnc_tensor_free(dx_tensor);
1309
1
  ccv_nnc_tensor_free(dscale_tensor);
1310
1
  ccv_nnc_tensor_free(dbias_tensor);
1311
1
}
1312
1313
TEST_CASE("compare layer norm only gradient with cudnn")
1314
1
{
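  // Same setup as the previous test, but only the gradient with respect to x is requested from
  // ccv_nnc_symbolic_graph_backward, presumably to exercise the path where dscale / dbias are
  // not needed.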
1315
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1316
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1317
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1318
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1319
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1320
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1321
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
1322
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "bias");
1323
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
1324
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1325
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
1326
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1327
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1328
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1329
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1330
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1331
1
  ccv_nnc_graph_t* graph = 0;
1332
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1333
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1334
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1335
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1336
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1337
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1338
1
  dsfmt_t dsfmt;
1339
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1340
1
  int i;
1341
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1342
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1343
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1344
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1345
1
  float scaledata[1 * 2 * 2 * 10];
1346
1
  float biasdata[1 * 2 * 2 * 10];
1347
41
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
1348
40
  {
1349
40
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1350
40
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1351
40
  }
1352
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1353
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1354
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1355
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1356
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1357
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1358
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1359
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1360
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
1361
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1362
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1363
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1364
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
1365
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1366
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1367
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1368
1
  ccv_nnc_graph_free(graph);
1369
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1370
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1371
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1372
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
1373
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "bias");
1374
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1375
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1376
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1377
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1378
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1379
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1380
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1381
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1382
1
  ccv_nnc_graph_t* cpu_graph = 0;
1383
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1384
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1385
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1386
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1387
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1388
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1389
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1390
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1391
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
1392
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1393
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * 10);
1394
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1395
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1396
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from cudnn should match the one from reference implementation");
1397
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1398
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1399
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1400
1
  ccv_nnc_graph_free(cpu_graph);
1401
1
  ccv_nnc_tensor_free(x_tensor);
1402
1
  ccv_nnc_tensor_free(dy_tensor);
1403
1
  ccv_nnc_tensor_free(dx_tensor);
1404
1
}
1405
1406
TEST_CASE("compare layer norm with cudnn without scale / bias")
1407
1
{
1408
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1409
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1410
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1411
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1412
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
1413
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1414
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1415
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
1416
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
1417
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1418
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1419
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
1420
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1421
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1422
1
  ccv_nnc_graph_t* graph = 0;
1423
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1424
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1425
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1426
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1427
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1428
1
  dsfmt_t dsfmt;
1429
1
  float xdata[2 * 2 * 2 * 10];
1430
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1431
1
  int i;
1432
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1433
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1434
80
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
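  // Run the compiled graph: transfer x to the GPU, apply layer norm without scale / bias, transfer y back.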
1435
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1436
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1437
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1438
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1439
1
  ccv_nnc_graph_free(graph);
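  // Reference computation: the same layer norm (no scale / bias) evaluated entirely on the CPU.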
1440
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1441
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1442
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1443
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1444
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1445
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1446
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1447
1
  ccv_nnc_graph_t* cpu_graph = 0;
1448
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1449
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1450
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1451
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1452
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
1453
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1454
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1455
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-5, "layer norm result from cudnn should match the one from reference implementation");
1456
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1457
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1458
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1459
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1460
1
  ccv_nnc_graph_free(cpu_graph);
1461
1
}
1462
1463
TEST_CASE("compare layer norm gradient with cudnn without scale / bias")
1464
1
{
1465
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1466
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1467
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1468
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1469
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1470
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1471
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
1472
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1473
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
1474
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1475
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1476
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1477
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1478
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1479
1
  ccv_nnc_graph_t* graph = 0;
1480
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1481
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1482
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1483
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1484
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1485
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1486
1
  dsfmt_t dsfmt;
1487
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1488
1
  int i;
1489
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1490
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1491
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1492
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1493
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1494
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1495
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1496
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1497
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1498
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
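  // With dy seeded on the GPU, the next run yields the gradient with respect to x.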
1499
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1500
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1501
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1502
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
1503
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1504
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1505
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1506
1
  ccv_nnc_graph_free(graph);
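  // CPU reference graph for the same forward / backward computation.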
1507
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1508
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1509
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1510
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1511
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1512
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1513
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1514
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1515
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1516
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1517
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1518
1
  ccv_nnc_graph_t* cpu_graph = 0;
1519
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1520
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1521
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1522
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1523
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1524
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1525
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1526
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1527
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1528
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from cudnn should match the one from reference implementation");
1529
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1530
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1531
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1532
1
  ccv_nnc_graph_free(cpu_graph);
1533
1
  ccv_nnc_tensor_free(x_tensor);
1534
1
  ccv_nnc_tensor_free(dy_tensor);
1535
1
  ccv_nnc_tensor_free(dx_tensor);
1536
1
}
1537
1538
TEST_CASE("compare layer norm only gradient with cudnn without scale / bias")
1539
1
{
1540
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1541
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1542
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1543
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1544
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1545
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1546
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
1547
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1548
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
1549
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1550
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1551
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1552
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1553
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1554
1
  ccv_nnc_graph_t* graph = 0;
1555
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1556
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1557
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1558
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1559
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1560
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1561
1
  dsfmt_t dsfmt;
1562
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1563
1
  int i;
1564
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1565
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1566
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1567
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1568
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1569
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1570
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1571
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1572
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1573
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
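  // Re-run the graph with the transferred dy to compute the gradient on the GPU.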
1574
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1575
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1576
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
1577
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
1578
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1579
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1580
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1581
1
  ccv_nnc_graph_free(graph);
1582
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1583
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1584
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1585
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1586
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1587
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1588
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1589
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1590
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1591
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1592
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1593
1
  ccv_nnc_graph_t* cpu_graph = 0;
1594
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1595
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1596
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1597
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1598
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1599
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1600
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
1601
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1602
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1603
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from cudnn should match the one from reference implementation");
1604
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1605
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1606
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1607
1
  ccv_nnc_graph_free(cpu_graph);
1608
1
  ccv_nnc_tensor_free(x_tensor);
1609
1
  ccv_nnc_tensor_free(dy_tensor);
1610
1
  ccv_nnc_tensor_free(dx_tensor);
1611
1
}
1612
1613
TEST_CASE("compare group norm with cudnn")
1614
1
{
1615
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1616
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1617
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1618
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1619
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
1620
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1621
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1622
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
1623
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "scale");
1624
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "bias");
1625
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1626
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1627
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1628
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1629
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1630
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1631
1
  ccv_nnc_graph_t* graph = 0;
1632
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1633
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1634
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1635
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1636
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1637
1
  dsfmt_t dsfmt;
1638
1
  float xdata[2 * 16 * 2 * 10];
1639
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1640
1
  int i;
1641
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1642
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1643
640
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1644
1
  float scaledata[1 * 16 * 2 * 10];
1645
1
  float biasdata[1 * 16 * 2 * 10];
1646
321
  for (i = 0; i < 1 * 16 * 2 * 10; i++)
1647
320
  {
1648
320
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1649
320
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1650
320
  }
1651
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1652
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1653
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
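  // Scale and bias are on the GPU; run the group norm graph and read y back for comparison.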
1654
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1655
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1656
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1657
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1658
1
  ccv_nnc_graph_free(graph);
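  // Reference group norm on the CPU using identical input, scale and bias.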
1659
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1660
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1661
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1662
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "scale");
1663
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "bias");
1664
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1665
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1666
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1667
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1668
1
  ccv_nnc_graph_t* cpu_graph = 0;
1669
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1670
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1671
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1672
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1673
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
1674
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1675
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16 * 2 * 10);
1676
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1677
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16 * 2 * 10);
1678
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1679
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1680
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-5, "group norm result from cudnn should match the one from reference implementation");
1681
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1682
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1683
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1684
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1685
1
  ccv_nnc_graph_free(cpu_graph);
1686
1
}
1687
1688
TEST_CASE("compare group norm gradient with cudnn")
1689
1
{
1690
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1691
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1692
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1693
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1694
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1695
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1696
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "scale");
1697
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "bias");
1698
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1699
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1700
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1701
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1702
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1703
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1704
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1705
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1706
1
  ccv_nnc_graph_t* graph = 0;
1707
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1708
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1709
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1710
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1711
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1712
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1713
1
  dsfmt_t dsfmt;
1714
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1715
1
  int i;
1716
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1717
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1718
640
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1719
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1720
1
  float scaledata[1 * 16 * 2 * 10];
1721
1
  float biasdata[1 * 16 * 2 * 10];
1722
321
  for (i = 0; i < 1 * 16 * 2 * 10; i++)
1723
320
  {
1724
320
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1725
320
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1726
320
  }
1727
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1728
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1729
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1730
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1731
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1732
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1733
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1734
640
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1735
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
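  // Second run computes the gradients with respect to x, scale and bias on the GPU.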
1736
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1737
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1738
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1739
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
1740
1
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
1741
1
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
1742
1
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1743
1
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1744
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
1745
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1746
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1747
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1748
1
  ccv_nnc_graph_free(graph);
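  // CPU reference graph producing the expected gradients for x, scale and bias.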
1749
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1750
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1751
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1752
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "scale");
1753
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "bias");
1754
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1755
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1756
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1757
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1758
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1759
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1760
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1761
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1762
1
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
1763
1
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
1764
1
  ccv_nnc_graph_t* cpu_graph = 0;
1765
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1766
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1767
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1768
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1769
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
1770
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1771
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
1772
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1773
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16 * 2 * 10);
1774
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1775
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16 * 2 * 10);
1776
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1777
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1778
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "group norm gradient result from cudnn should match the one from reference implementation");
1779
1
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
1780
1
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "group norm scale gradient result from cudnn should match the one from reference implementation");
1781
1
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
1782
1
  REQUIRE_TENSOR_EQ(dbias_tensor, dcbias_tensor, "group norm bias gradient result from cudnn should match the one from reference implementation");
1783
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1784
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1785
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1786
1
  ccv_nnc_graph_free(cpu_graph);
1787
1
  ccv_nnc_tensor_free(x_tensor);
1788
1
  ccv_nnc_tensor_free(dy_tensor);
1789
1
  ccv_nnc_tensor_free(dx_tensor);
1790
1
  ccv_nnc_tensor_free(dscale_tensor);
1791
1
  ccv_nnc_tensor_free(dbias_tensor);
1792
1
}
1793
1794
TEST_CASE("compare group norm only gradient with cudnn")
1795
1
{
1796
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1797
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1798
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1799
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1800
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1801
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1802
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "scale");
1803
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "bias");
1804
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1805
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1806
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1807
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1808
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1809
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1810
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1811
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1812
1
  ccv_nnc_graph_t* graph = 0;
1813
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1814
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1815
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1816
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1817
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1818
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1819
1
  dsfmt_t dsfmt;
1820
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1821
1
  int i;
1822
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1823
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1824
640
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1825
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1826
1
  float scaledata[1 * 16 * 2 * 10];
1827
1
  float biasdata[1 * 16 * 2 * 10];
1828
321
  for (i = 0; i < 1 * 16 * 2 * 10; i++)
1829
320
  {
1830
320
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1831
320
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1832
320
  }
1833
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1834
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1835
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1836
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1837
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1838
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
1839
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1840
640
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
1841
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
1842
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1843
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
1844
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1845
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
1846
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1847
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1848
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1849
1
  ccv_nnc_graph_free(graph);
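  // CPU reference for the x gradient only; scale and bias gradients are not requested in this test.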
1850
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1851
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1852
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1853
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "scale");
1854
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "bias");
1855
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1856
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1857
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1858
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1859
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
1860
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1861
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
1862
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
1863
1
  ccv_nnc_graph_t* cpu_graph = 0;
1864
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1865
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1866
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1867
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1868
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
1869
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
1870
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
1871
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1872
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16 * 2 * 10);
1873
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1874
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16 * 2 * 10);
1875
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1876
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
1877
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "group norm gradient result from cudnn should match the one from reference implementation");
1878
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1879
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1880
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1881
1
  ccv_nnc_graph_free(cpu_graph);
1882
1
  ccv_nnc_tensor_free(x_tensor);
1883
1
  ccv_nnc_tensor_free(dy_tensor);
1884
1
  ccv_nnc_tensor_free(dx_tensor);
1885
1
}
1886
1887
TEST_CASE("compare group norm and reduce HW with cudnn")
1888
1
{
1889
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1890
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1891
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1892
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1893
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
1894
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1895
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1896
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
1897
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 1, 1), "scale");
1898
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 1, 1), "bias");
1899
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_mean");
1900
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_inv_std");
1901
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1902
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1903
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1904
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1905
1
  ccv_nnc_graph_t* graph = 0;
1906
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1907
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1908
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1909
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1910
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1911
1
  dsfmt_t dsfmt;
1912
1
  float xdata[2 * 16 * 2 * 10];
1913
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1914
1
  int i;
1915
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1916
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1917
640
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1918
1
  float scaledata[1 * 16];
1919
1
  float biasdata[1 * 16];
1920
17
  for (i = 0; i < 1 * 16; i++)
1921
16
  {
1922
16
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1923
16
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1924
16
  }
1925
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), 0);
1926
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), 0);
1927
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
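  // Run the GPU graph with per-channel scale and bias (H and W reduced) and read y back.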
1928
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1929
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1930
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1931
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1932
1
  ccv_nnc_graph_free(graph);
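  // Reference group norm with H and W reduction on the CPU over the same input.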
1933
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1934
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1935
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1936
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), "scale");
1937
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), "bias");
1938
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_mean");
1939
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_inv_std");
1940
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1941
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1942
1
  ccv_nnc_graph_t* cpu_graph = 0;
1943
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1944
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1945
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1946
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1947
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
1948
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1949
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16);
1950
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1951
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16);
1952
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1953
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1954
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-5, "group norm result from cudnn should match the one from reference implementation");
1955
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1956
1
  ccv_nnc_tensor_arena_free(tensor_arena);
1957
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1958
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1959
1
  ccv_nnc_graph_free(cpu_graph);
1960
1
}
1961
1962
TEST_CASE("compare group norm gradient and reduce HW with cudnn")
1963
1
{
1964
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1965
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
1966
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
1967
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1968
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1969
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1970
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 1, 1), "scale");
1971
1
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 1, 1), "bias");
1972
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_mean");
1973
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_inv_std");
1974
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1975
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1976
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1977
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1978
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
1979
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
1980
1
  ccv_nnc_graph_t* graph = 0;
1981
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1982
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1983
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1984
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1985
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1986
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
1987
1
  dsfmt_t dsfmt;
1988
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
1989
1
  int i;
1990
1
  dsfmt_init_gen_rand(&dsfmt, 1);
1991
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1992
640
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
1993
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
1994
1
  float scaledata[1 * 16];
1995
1
  float biasdata[1 * 16];
1996
17
  for (i = 0; i < 1 * 16; i++)
1997
16
  {
1998
16
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1999
16
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2000
16
  }
2001
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), 0);
2002
1
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), 0);
2003
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
2004
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2005
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2006
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2007
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2008
640
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2009
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
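  // The graph is run a second time below, now that dby holds the intended upstream
  // gradient, so the backward pass produces the gradients that are compared later.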
2010
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2011
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2012
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2013
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2014
1
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2015
1
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
2016
1
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), 0);
2017
1
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), 0);
2018
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
2019
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2020
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2021
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2022
1
  ccv_nnc_graph_free(graph);
2023
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2024
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
2025
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
2026
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), "scale");
2027
1
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 1, 1), "bias");
2028
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_mean");
2029
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_inv_std");
2030
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
2031
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2032
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2033
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2034
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2035
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2036
1
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2037
1
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
2038
1
  ccv_nnc_graph_t* cpu_graph = 0;
2039
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2040
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2041
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2042
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2043
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2044
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2045
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2046
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2047
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16);
2048
1
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
2049
1
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16);
2050
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2051
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2052
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "group norm gradient result from cudnn should match the one from reference implementation");
2053
1
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2054
1
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "group norm scale gradient result from cudnn should match the one from reference implementation");
2055
1
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
2056
1
  REQUIRE_TENSOR_EQ(dbias_tensor, dcbias_tensor, "group norm bias gradient result from cudnn should match the one from reference implementation");
2057
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2058
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2059
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2060
1
  ccv_nnc_graph_free(cpu_graph);
2061
1
  ccv_nnc_tensor_free(x_tensor);
2062
1
  ccv_nnc_tensor_free(dy_tensor);
2063
1
  ccv_nnc_tensor_free(dx_tensor);
2064
1
  ccv_nnc_tensor_free(dscale_tensor);
2065
1
  ccv_nnc_tensor_free(dbias_tensor);
2066
1
}
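// Reading aid for the group norm comparisons above and below: both the cuDNN backend
// and the CPU reference are expected to compute standard group normalization, with
// saved_mean and saved_inv_std caching the per-group statistics for the backward pass.
// The formula below is the usual textbook definition; the mapping of the
// CMD_GROUP_NORM_FORWARD arguments to group axis, group count, epsilon, scale/bias flag
// and extra reduce axes is inferred from the tensor shapes in these tests, not taken
// from the command's documentation.
//
//   \mu_{n,g} = \frac{1}{m} \sum_{i \in g} x_{n,i}, \qquad
//   \sigma^2_{n,g} = \frac{1}{m} \sum_{i \in g} (x_{n,i} - \mu_{n,g})^2, \qquad
//   y_{n,i} = \gamma_i \frac{x_{n,i} - \mu_{n,g}}{\sqrt{\sigma^2_{n,g} + \epsilon}} + \beta_i
//
// Here g ranges over the 4 groups of the 16-channel axis, m is the number of elements
// reduced per group, and \gamma, \beta are dropped in the "without scale / bias" variants.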
2067
2068
TEST_CASE("compare group norm with cudnn without scale / bias")
2069
1
{
2070
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2071
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2072
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2073
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2074
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
2075
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
2076
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
2077
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
2078
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
2079
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
2080
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
2081
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
2082
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
2083
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2084
1
  ccv_nnc_graph_t* graph = 0;
2085
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2086
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2087
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2088
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2089
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2090
1
  dsfmt_t dsfmt;
2091
1
  float xdata[2 * 16 * 2 * 10];
2092
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2093
1
  int i;
2094
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2095
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2096
640
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
2097
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2098
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2099
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2100
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2101
1
  ccv_nnc_graph_free(graph);
2102
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2103
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
2104
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
2105
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
2106
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
2107
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
2108
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2109
1
  ccv_nnc_graph_t* cpu_graph = 0;
2110
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2111
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2112
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2113
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2114
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
2115
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2116
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
2117
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-5, "group norm result from cudnn should match the one from reference implementation");
2118
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2119
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2120
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2121
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2122
1
  ccv_nnc_graph_free(cpu_graph);
2123
1
}
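// The graph-based setup above can be condensed, for illustration, into a single direct
// command invocation on CPU tensors. This is a minimal sketch and not part of the test
// suite: it assumes ccv_nnc_cmd_exec auto-selects a suitable CPU backend for CPU tensors
// (as it does for the CMD_DATA_TRANSFER_FORWARD calls throughout this file) and reuses
// the shapes and command arguments of the "without scale / bias" test above.
//
//   static void group_norm_forward_sketch(void)
//   {
//     ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
//     ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
//     ccv_nnc_tensor_t* const saved_mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), 0);
//     ccv_nnc_tensor_t* const saved_inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), 0);
//     // ... fill x->data.f32 here, e.g. with dsfmt_genrand_open_close ...
//     ccv_nnc_cmd_exec(CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), ccv_nnc_no_hint, 0,
//       TENSOR_LIST(x), TENSOR_LIST(y, saved_mean, saved_inv_std), 0);
//     ccv_nnc_tensor_free(x);
//     ccv_nnc_tensor_free(y);
//     ccv_nnc_tensor_free(saved_mean);
//     ccv_nnc_tensor_free(saved_inv_std);
//   }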
2124
2125
TEST_CASE("compare group norm gradient with cudnn without scale / bias")
2126
1
{
2127
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2128
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2129
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2130
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2131
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
2132
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
2133
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
2134
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
2135
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
2136
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2137
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2138
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2139
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2140
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2141
1
  ccv_nnc_graph_t* graph = 0;
2142
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2143
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2144
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2145
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2146
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2147
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2148
1
  dsfmt_t dsfmt;
2149
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2150
1
  int i;
2151
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2152
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2153
640
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2154
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2155
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2156
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2157
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2158
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2159
640
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2160
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2161
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2162
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2163
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2164
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2165
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2166
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2167
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2168
1
  ccv_nnc_graph_free(graph);
2169
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2170
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
2171
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
2172
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
2173
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
2174
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
2175
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2176
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2177
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2178
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2179
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2180
1
  ccv_nnc_graph_t* cpu_graph = 0;
2181
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2182
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2183
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2184
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2185
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2186
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2187
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2188
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2189
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2190
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "group norm gradient result from cudnn should match the one from reference implementation");
2191
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2192
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2193
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2194
1
  ccv_nnc_graph_free(cpu_graph);
2195
1
  ccv_nnc_tensor_free(x_tensor);
2196
1
  ccv_nnc_tensor_free(dy_tensor);
2197
1
  ccv_nnc_tensor_free(dx_tensor);
2198
1
}
2199
2200
TEST_CASE("compare group norm only gradient with cudnn without scale / bias")
2201
1
{
2202
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2203
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2204
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2205
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2206
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
2207
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
2208
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
2209
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
2210
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
2211
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2212
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2213
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2214
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2215
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2216
1
  ccv_nnc_graph_t* graph = 0;
2217
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2218
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2219
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2220
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2221
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2222
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2223
1
  dsfmt_t dsfmt;
2224
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2225
1
  int i;
2226
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2227
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2228
640
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2229
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2230
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2231
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2232
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2233
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2234
640
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2235
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2236
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2237
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2238
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2239
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2240
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2241
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2242
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2243
1
  ccv_nnc_graph_free(graph);
2244
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2245
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
2246
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
2247
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
2248
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
2249
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
2250
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2251
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2252
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2253
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2254
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2255
1
  ccv_nnc_graph_t* cpu_graph = 0;
2256
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2257
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2258
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2259
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2260
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2261
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2262
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2263
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2264
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2265
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "group norm gradient result from cudnn should match the one from reference implementation");
2266
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2267
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2268
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2269
1
  ccv_nnc_graph_free(cpu_graph);
2270
1
  ccv_nnc_tensor_free(x_tensor);
2271
1
  ccv_nnc_tensor_free(dy_tensor);
2272
1
  ccv_nnc_tensor_free(dx_tensor);
2273
1
}
2274
2275
TEST_CASE("compare group norm and reduce HW with cudnn without scale / bias")
2276
1
{
2277
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2278
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2279
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2280
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2281
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
2282
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
2283
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
2284
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
2285
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_mean");
2286
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_inv_std");
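  // Note: saved_mean / saved_inv_std are (2, 4, 1, 1) here rather than the (2, 4, 2, 10)
  // used in the non-"reduce HW" variants, consistent with the extra `2, 3` arguments to
  // CMD_GROUP_NORM_FORWARD also reducing the two spatial axes (inferred from the shapes).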
2287
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
2288
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
2289
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
2290
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2291
1
  ccv_nnc_graph_t* graph = 0;
2292
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2293
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2294
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2295
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2296
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2297
1
  dsfmt_t dsfmt;
2298
1
  float xdata[2 * 16 * 2 * 10];
2299
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2300
1
  int i;
2301
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2302
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2303
640
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
2304
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2305
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2306
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2307
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2308
1
  ccv_nnc_graph_free(graph);
2309
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2310
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
2311
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
2312
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_mean");
2313
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_inv_std");
2314
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
2315
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2316
1
  ccv_nnc_graph_t* cpu_graph = 0;
2317
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2318
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2319
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2320
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2321
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
2322
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2323
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
2324
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-5, "group norm result from cudnn should match the one from reference implementation");
2325
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2326
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2327
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2328
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2329
1
  ccv_nnc_graph_free(cpu_graph);
2330
1
}
2331
2332
TEST_CASE("compare group norm gradient and reduce HW with cudnn without scale / bias")
2333
1
{
2334
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2335
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2336
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2337
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2338
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
2339
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
2340
1
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_mean");
2341
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 1, 1), "saved_inv_std");
2342
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
2343
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2344
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2345
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2346
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2347
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2348
1
  ccv_nnc_graph_t* graph = 0;
2349
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2350
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2351
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2352
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2353
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2354
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2355
1
  dsfmt_t dsfmt;
2356
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2357
1
  int i;
2358
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2359
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2360
640
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2361
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2362
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2363
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2364
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2365
641
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2366
640
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2367
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2368
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2369
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2370
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), 0);
2371
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2372
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2373
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2374
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2375
1
  ccv_nnc_graph_free(graph);
2376
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2377
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
2378
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
2379
1
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_mean");
2380
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 1, 1), "saved_inv_std");
2381
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
2382
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2383
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2384
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2385
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2386
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2387
1
  ccv_nnc_graph_t* cpu_graph = 0;
2388
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2389
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2390
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2391
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2392
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2393
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2394
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 16 * 2 * 10);
2395
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2396
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2397
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "group norm gradient result from cudnn should match the one from reference implementation");
2398
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2399
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2400
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2401
1
  ccv_nnc_graph_free(cpu_graph);
2402
1
  ccv_nnc_tensor_free(x_tensor);
2403
1
  ccv_nnc_tensor_free(dy_tensor);
2404
1
  ccv_nnc_tensor_free(dx_tensor);
2405
1
}
2406
2407
TEST_CASE("compare rmsnorm with cudnn")
2408
1
{
2409
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2410
1
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2411
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2412
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2413
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
2414
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
2415
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
2416
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
2417
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
2418
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2419
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
2420
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
2421
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
2422
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2423
1
  ccv_nnc_graph_t* graph = 0;
2424
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2425
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2426
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2427
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2428
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2429
1
  dsfmt_t dsfmt;
2430
1
  float xdata[2 * 2 * 2 * 10];
2431
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2432
1
  int i;
2433
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2434
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2435
80
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
2436
1
  float scaledata[1 * 2 * 2 * 10];
2437
41
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
2438
40
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2439
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
2440
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
2441
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2442
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2443
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2444
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2445
1
  ccv_nnc_graph_free(graph);
2446
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2447
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
2448
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
2449
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
2450
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2451
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
2452
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2453
1
  ccv_nnc_graph_t* cpu_graph = 0;
2454
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2455
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2456
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2457
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2458
1
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
2459
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2460
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
2461
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2462
1
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
2463
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-5, "rmsnorm result from cudnn should match the one from reference implementation");
2464
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2465
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2466
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2467
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2468
1
  ccv_nnc_graph_free(cpu_graph);
2469
1
}
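// Reading aid for the rmsnorm comparisons above and below: RMS normalization omits the
// mean subtraction and rescales by the root-mean-square of the input, with saved_inv_std
// caching the reciprocal for the backward pass. The formula below is the usual
// definition; the assumption that the `1, 2, 3` arguments of CMD_RMSNORM_FORWARD name
// the reduced axes is inferred from the tensor shapes in these tests, not from documentation.
//
//   \mathrm{rms}(x_n) = \sqrt{\tfrac{1}{m} \sum_{i} x_{n,i}^2 + \epsilon}, \qquad
//   y_{n,i} = \gamma_i \frac{x_{n,i}}{\mathrm{rms}(x_n)}
//
// with \epsilon = 1e-4 as in these tests and \gamma given by the (1, 2, 2, 10) scale tensor.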
2470
2471
TEST_CASE("compare rmsnorm gradient with cudnn")
2472
1
{
2473
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2474
1
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2475
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2476
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2477
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
2478
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
2479
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
2480
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2481
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
2482
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2483
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2484
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2485
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2486
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2487
1
  ccv_nnc_graph_t* graph = 0;
2488
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2489
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2490
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2491
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2492
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2493
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2494
1
  dsfmt_t dsfmt;
2495
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
2496
1
  int i;
2497
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2498
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2499
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2500
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2501
1
  float scaledata[1 * 2 * 2 * 10];
2502
41
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
2503
40
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2504
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
2505
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
2506
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2507
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
2508
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2509
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2510
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2511
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2512
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2513
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2514
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
2515
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2516
1
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2517
1
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
2518
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
2519
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2520
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2521
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2522
1
  ccv_nnc_graph_free(graph);
2523
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2524
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
2525
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
2526
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
2527
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2528
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
2529
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2530
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2531
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2532
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2533
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2534
1
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2535
1
  ccv_nnc_graph_t* cpu_graph = 0;
2536
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2537
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2538
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2539
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2540
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
2541
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2542
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
2543
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2544
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
2545
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2546
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2547
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "rmsnorm gradient result from cudnn should match the one from reference implementation");
2548
1
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2549
1
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "rmsnorm scale gradient result from cudnn should match the one from reference implementation");
2550
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2551
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2552
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2553
1
  ccv_nnc_graph_free(cpu_graph);
2554
1
  ccv_nnc_tensor_free(x_tensor);
2555
1
  ccv_nnc_tensor_free(dy_tensor);
2556
1
  ccv_nnc_tensor_free(dx_tensor);
2557
1
  ccv_nnc_tensor_free(dscale_tensor);
2558
1
}
2559
2560
TEST_CASE("compare rmsnorm only gradient with cudnn")
2561
1
{
2562
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2563
1
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2564
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)));
2565
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2566
1
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
2567
1
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
2568
1
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
2569
1
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2570
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
2571
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2572
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2573
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2574
1
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2575
1
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2576
1
  ccv_nnc_graph_t* graph = 0;
2577
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2578
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2579
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2580
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2581
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2582
1
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2583
1
  dsfmt_t dsfmt;
2584
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
2585
1
  int i;
2586
1
  dsfmt_init_gen_rand(&dsfmt, 1);
2587
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2588
80
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2589
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2590
1
  float scaledata[1 * 2 * 2 * 10];
2591
41
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
2592
40
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2593
1
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
2594
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
2595
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2596
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
2597
1
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2598
81
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2599
80
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2600
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2601
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2602
1
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2603
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
2604
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2605
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2606
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2607
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2608
1
  ccv_nnc_graph_free(graph);
2609
1
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2610
1
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
2611
1
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
2612
1
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
2613
1
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2614
1
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
2615
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2616
1
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2617
1
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2618
1
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2619
1
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2620
1
  ccv_nnc_graph_t* cpu_graph = 0;
2621
1
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2622
1
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2623
1
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2624
1
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2625
1
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
2626
1
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2627
1
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * 10);
2628
1
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2629
1
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
2630
1
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2631
1
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2632
1
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "rmsnorm gradient result from cudnn should match the one from reference implementation");
2633
1
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2634
1
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2635
1
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2636
1
  ccv_nnc_graph_free(cpu_graph);
2637
1
  ccv_nnc_tensor_free(x_tensor);
2638
1
  ccv_nnc_tensor_free(dy_tensor);
2639
1
  ccv_nnc_tensor_free(dx_tensor);
2640
1
}
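// For reference, the RMSNorm forward pass both backends above are expected to
// agree on normalizes each sample by the root mean square over the grouped
// axes (here axes 1, 2, 3 with epsilon 1e-4) and then applies the elementwise
// scale. A minimal CPU sketch under that assumption -- rmsnorm_forward_ref is
// an illustrative helper, not part of the ccv_nnc API:
#include <math.h>

static void rmsnorm_forward_ref(const float* x, const float* scale, float* y, int n, int hwc, float epsilon)
{
  int i, j;
  for (i = 0; i < n; i++)
  {
    const float* const xi = x + i * hwc;
    float* const yi = y + i * hwc;
    float sum_sq = 0;
    for (j = 0; j < hwc; j++)
      sum_sq += xi[j] * xi[j];
    // one inv_std per sample, which is what the (2, 1, 1, 1) saved_inv_std tensor above holds
    const float inv_std = 1.f / sqrtf(sum_sq / hwc + epsilon);
    for (j = 0; j < hwc; j++)
      yi[j] = xi[j] * inv_std * scale[j];
  }
}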
2641
2642
TEST_CASE("compare average pooling with cudnn")
2643
1
{
2644
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2645
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2646
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
2647
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
2648
1
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "avg_pool");
2649
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
2650
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2651
1
  ccv_nnc_graph_t* graph = 0;
2652
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2653
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2654
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2655
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2656
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2657
1
  dsfmt_t dsfmt;
2658
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2659
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2660
1
  int i;
2661
491
  for (i = 0; i < 7 * 7 * 10; i++)
2662
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2663
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2664
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2665
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2666
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2667
1
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2668
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2669
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2670
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
2671
1
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "cudnn result should equal to cpu result");
2672
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2673
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2674
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2675
1
  ccv_nnc_graph_free(graph);
2676
1
  ccv_nnc_tensor_free(x_tensor);
2677
1
  ccv_nnc_tensor_free(y_tensor);
2678
1
  ccv_nnc_tensor_free(cpu_y);
2679
1
}
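// Note on the shapes above: CMD_AVERAGE_POOL_FORWARD(5, 5) with
// HINT((2, 2), (1, 1)) means a 5x5 window, stride 2 and a 1-pixel border, so
// the 7x7 input maps to floor((7 + 2*1 - 5) / 2) + 1 = 3 outputs per side.
// A naive single-channel sketch of that arithmetic (avg_pool_ref is
// illustrative only; whether padded cells count toward the divisor is a
// backend detail, this version divides by the in-bounds samples):
static void avg_pool_ref(const float* in, float* out, int in_dim, int out_dim, int ksize, int stride, int border)
{
  int oy, ox, ky, kx;
  for (oy = 0; oy < out_dim; oy++)
    for (ox = 0; ox < out_dim; ox++)
    {
      float sum = 0;
      int count = 0;
      for (ky = 0; ky < ksize; ky++)
        for (kx = 0; kx < ksize; kx++)
        {
          const int iy = oy * stride - border + ky;
          const int ix = ox * stride - border + kx;
          if (iy >= 0 && iy < in_dim && ix >= 0 && ix < in_dim)
          {
            sum += in[iy * in_dim + ix];
            count++;
          }
        }
      out[oy * out_dim + ox] = count > 0 ? sum / count : 0; // average over the covered cells
    }
}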
2680
2681
TEST_CASE("compare average pooling with cudnn in half precision")
2682
1
{
2683
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2684
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2685
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
2686
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
2687
1
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "avg_pool");
2688
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
2689
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2690
1
  ccv_nnc_graph_t* graph = 0;
2691
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2692
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2693
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2694
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2695
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2696
1
  dsfmt_t dsfmt;
2697
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2698
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2699
1
  int i;
2700
491
  for (i = 0; i < 7 * 7 * 10; i++)
2701
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2702
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
2703
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2704
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2705
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2706
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2707
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2708
1
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2709
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2710
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2711
1
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
2712
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2713
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2714
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "cudnn result should equal to cpu result");
2715
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2716
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2717
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2718
1
  ccv_nnc_graph_free(graph);
2719
1
  ccv_nnc_tensor_free(x_tensor);
2720
1
  ccv_nnc_tensor_free(x16_tensor);
2721
1
  ccv_nnc_tensor_free(y_tensor);
2722
1
  ccv_nnc_tensor_free(cpu_y);
2723
1
  ccv_nnc_tensor_free(cpu_y16);
2724
1
}
2725
2726
TEST_CASE("compare average pooling gradient with cudnn")
2727
1
{
2728
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2729
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2730
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "dx");
2731
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "dy");
2732
1
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_BACKWARD(5, 5), TENSOR_SYMBOL_LIST(dy), TENSOR_SYMBOL_LIST(dx), "avg_pool");
2733
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
2734
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2735
1
  ccv_nnc_graph_t* graph = 0;
2736
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2737
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2738
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2739
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2740
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2741
1
  dsfmt_t dsfmt;
2742
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2743
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2744
1
  int i;
2745
91
  for (i = 0; i < 3 * 3 * 10; i++)
2746
90
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2747
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
2748
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
2749
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2750
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2751
1
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_BACKWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dx_tensor), 0);
2752
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
2753
1
  ccv_nnc_tensor_t* const cpu_dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2754
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(cpu_dx), 0);
2755
1
  REQUIRE_TENSOR_EQ(dx_tensor, cpu_dx, "cudnn result should equal to cpu result");
2756
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2757
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2758
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2759
1
  ccv_nnc_graph_free(graph);
2760
1
  ccv_nnc_tensor_free(dy_tensor);
2761
1
  ccv_nnc_tensor_free(dx_tensor);
2762
1
  ccv_nnc_tensor_free(cpu_dx);
2763
1
}
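// The backward command exercised above spreads each output gradient evenly
// over the input cells its window covered. A single-channel sketch with the
// same 5x5 / stride-2 / border-1 configuration (illustrative helper, using
// the same in-bounds divisor convention as the forward sketch earlier):
static void avg_pool_backward_ref(const float* dy, float* dx, int in_dim, int out_dim, int ksize, int stride, int border)
{
  int i, oy, ox, ky, kx;
  for (i = 0; i < in_dim * in_dim; i++)
    dx[i] = 0; // gradients accumulate, so start from zero
  for (oy = 0; oy < out_dim; oy++)
    for (ox = 0; ox < out_dim; ox++)
    {
      int count = 0;
      for (ky = 0; ky < ksize; ky++) // first pass: count the in-bounds cells under this window
        for (kx = 0; kx < ksize; kx++)
        {
          const int iy = oy * stride - border + ky;
          const int ix = ox * stride - border + kx;
          if (iy >= 0 && iy < in_dim && ix >= 0 && ix < in_dim)
            count++;
        }
      for (ky = 0; ky < ksize; ky++) // second pass: distribute dy / count to each of them
        for (kx = 0; kx < ksize; kx++)
        {
          const int iy = oy * stride - border + ky;
          const int ix = ox * stride - border + kx;
          if (iy >= 0 && iy < in_dim && ix >= 0 && ix < in_dim)
            dx[iy * in_dim + ix] += dy[oy * out_dim + ox] / count;
        }
    }
}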
2764
2765
TEST_CASE("compare average pooling gradient with cudnn in half precision")
2766
1
{
2767
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2768
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2769
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "dx");
2770
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "dy");
2771
1
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_BACKWARD(5, 5), TENSOR_SYMBOL_LIST(dy), TENSOR_SYMBOL_LIST(dx), "avg_pool");
2772
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
2773
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2774
1
  ccv_nnc_graph_t* graph = 0;
2775
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2776
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2777
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2778
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2779
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2780
1
  dsfmt_t dsfmt;
2781
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2782
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2783
1
  int i;
2784
91
  for (i = 0; i < 3 * 3 * 10; i++)
2785
90
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2786
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
2787
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
2788
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
2789
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
2790
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2791
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2792
1
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_BACKWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dx_tensor), 0);
2793
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
2794
1
  ccv_nnc_tensor_t* const cpu_dx16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
2795
1
  ccv_nnc_tensor_t* const cpu_dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2796
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(cpu_dx16), 0);
2797
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_dx16), TENSOR_LIST(cpu_dx), 0);
2798
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, cpu_dx->data.f32, 7 * 7 * 10, 1e-3, "cudnn result should equal to cpu result");
2799
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2800
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2801
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2802
1
  ccv_nnc_graph_free(graph);
2803
1
  ccv_nnc_tensor_free(dy_tensor);
2804
1
  ccv_nnc_tensor_free(dy16_tensor);
2805
1
  ccv_nnc_tensor_free(dx_tensor);
2806
1
  ccv_nnc_tensor_free(cpu_dx);
2807
1
  ccv_nnc_tensor_free(cpu_dx16);
2808
1
}
2809
2810
TEST_CASE("compare max pooling with cudnn")
2811
1
{
2812
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2813
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2814
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
2815
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
2816
1
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2817
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
2818
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2819
1
  ccv_nnc_graph_t* graph = 0;
2820
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2821
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2822
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2823
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2824
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2825
1
  dsfmt_t dsfmt;
2826
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2827
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2828
1
  int i;
2829
491
  for (i = 0; i < 7 * 7 * 10; i++)
2830
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2831
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2832
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2833
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2834
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2835
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2836
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2837
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2838
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
2839
1
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "cudnn result should equal to cpu result");
2840
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2841
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2842
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2843
1
  ccv_nnc_graph_free(graph);
2844
1
  ccv_nnc_tensor_free(x_tensor);
2845
1
  ccv_nnc_tensor_free(y_tensor);
2846
1
  ccv_nnc_tensor_free(cpu_y);
2847
1
}
2848
2849
TEST_CASE("compare max pooling with cudnn in half precision")
2850
1
{
2851
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2852
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2853
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
2854
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
2855
1
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2856
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
2857
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2858
1
  ccv_nnc_graph_t* graph = 0;
2859
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2860
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2861
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2862
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2863
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2864
1
  dsfmt_t dsfmt;
2865
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2866
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2867
1
  int i;
2868
491
  for (i = 0; i < 7 * 7 * 10; i++)
2869
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2870
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2871
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
2872
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2873
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2874
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2875
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2876
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2877
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2878
1
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
2879
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2880
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2881
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2882
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "cudnn result should equal to cpu result");
2883
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2884
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2885
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2886
1
  ccv_nnc_graph_free(graph);
2887
1
  ccv_nnc_tensor_free(x_tensor);
2888
1
  ccv_nnc_tensor_free(x16_tensor);
2889
1
  ccv_nnc_tensor_free(y_tensor);
2890
1
  ccv_nnc_tensor_free(cpu_y);
2891
1
  ccv_nnc_tensor_free(cpu_y16);
2892
1
}
2893
2894
TEST_CASE("compare max pooling 2x2 with cudnn")
2895
1
{
2896
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2897
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2898
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 6, 6), "x");
2899
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 3, 3), "y");
2900
1
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2901
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
2902
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2903
1
  ccv_nnc_graph_t* graph = 0;
2904
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2905
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2906
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2907
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2908
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2909
1
  dsfmt_t dsfmt;
2910
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2911
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
2912
1
  int i, j;
2913
361
  for (i = 0; i < 6 * 6 * 10; i++)
2914
360
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2915
1
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
2916
11
  for (i = 0; i < 10; i++)
2917
370
    for (j = 0; j < 6 * 6; j++)
2918
360
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
2919
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2920
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2921
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2922
1
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2923
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
2924
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2925
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2926
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
2927
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2928
11
  for (i = 0; i < 10; i++)
2929
100
    for (j = 0; j < 3 * 3; j++)
2930
90
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
2931
1
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "cudnn result should equal to cpu result");
2932
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2933
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2934
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2935
1
  ccv_nnc_graph_free(graph);
2936
1
  ccv_nnc_tensor_free(x_tensor);
2937
1
  ccv_nnc_tensor_free(y_tensor);
2938
1
  ccv_nnc_tensor_free(cpu_y);
2939
1
}
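// The inline transposition loops above exist because the GPU tensors are
// declared NCHW while the CPU reference works on NHWC; for one image the
// mapping is hwc[(y * w + x) * c + ch] == chw[ch * h * w + y * w + x].
// The same mapping as a stand-alone helper (chw_to_hwc is an illustrative
// name, not a ccv_nnc API):
static void chw_to_hwc(const float* chw, float* hwc, int c, int h, int w)
{
  int ch, y, x;
  for (ch = 0; ch < c; ch++)
    for (y = 0; y < h; y++)
      for (x = 0; x < w; x++)
        hwc[(y * w + x) * c + ch] = chw[ch * h * w + y * w + x];
}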
2940
2941
TEST_CASE("compare max pooling 2x2 with cudnn in half precision")
2942
1
{
2943
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2944
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2945
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 6, 6), "x");
2946
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 3, 3), "y");
2947
1
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2948
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
2949
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2950
1
  ccv_nnc_graph_t* graph = 0;
2951
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2952
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2953
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2954
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2955
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2956
1
  dsfmt_t dsfmt;
2957
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2958
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
2959
1
  int i, j;
2960
361
  for (i = 0; i < 6 * 6 * 10; i++)
2961
360
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2962
1
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
2963
11
  for (i = 0; i < 10; i++)
2964
370
    for (j = 0; j < 6 * 6; j++)
2965
360
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
2966
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2967
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 6, 6), 0);
2968
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2969
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2970
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2971
1
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2972
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
2973
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2974
1
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 3, 3), 0);
2975
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2976
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2977
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2978
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2979
11
  for (i = 0; i < 10; i++)
2980
100
    for (j = 0; j < 3 * 3; j++)
2981
90
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
2982
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 10 * 3 * 3, 1e-3, "cudnn result should equal to cpu result");
2983
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2984
1
  ccv_nnc_tensor_arena_free(tensor_arena);
2985
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2986
1
  ccv_nnc_graph_free(graph);
2987
1
  ccv_nnc_tensor_free(x_tensor);
2988
1
  ccv_nnc_tensor_free(x16_tensor);
2989
1
  ccv_nnc_tensor_free(y_tensor);
2990
1
  ccv_nnc_tensor_free(cpu_y);
2991
1
  ccv_nnc_tensor_free(cpu_y16);
2992
1
}
2993
2994
TEST_CASE("compare max pooling gradient with cudnn")
2995
1
{
2996
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
2997
1
    ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
2998
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2999
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
3000
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
3001
1
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
3002
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
3003
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3004
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3005
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3006
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3007
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3008
1
  dsfmt_t dsfmt;
3009
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3010
1
  int i;
3011
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
3012
91
  for (i = 0; i < 3 * 3 * 10; i++)
3013
90
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3014
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), 0);
3015
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
3016
1
  ccv_nnc_graph_t* graph = 0;
3017
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3018
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3019
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3020
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3021
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3022
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3023
491
  for (i = 0; i < 7 * 7 * 10; i++)
3024
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3025
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3026
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3027
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3028
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
3029
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
3030
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3031
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_BACKWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(dy_tensor, x_tensor, y_tensor), TENSOR_LIST(dx_tensor), 0);
3032
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3033
1
  ccv_nnc_tensor_t* const cpu_dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3034
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(cpu_dx), 0);
3035
1
  REQUIRE_TENSOR_EQ(dx_tensor, cpu_dx, "cudnn result should equal to cpu result");
3036
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3037
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3038
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3039
1
  ccv_nnc_graph_free(graph);
3040
1
  ccv_nnc_tensor_free(x_tensor);
3041
1
  ccv_nnc_tensor_free(y_tensor);
3042
1
  ccv_nnc_tensor_free(dx_tensor);
3043
1
  ccv_nnc_tensor_free(dy_tensor);
3044
1
  ccv_nnc_tensor_free(cpu_dx);
3045
1
  ccv_nnc_tensor_free(dyt);
3046
1
}
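// Unlike average pooling, max pooling backward routes each output gradient
// only to the input cell that won the forward max, which is why the backward
// command above also takes x and y as inputs. A single-channel sketch
// (illustrative; how ties are broken can differ between backends):
static void max_pool_backward_ref(const float* dy, const float* x, const float* y, float* dx, int in_dim, int out_dim, int ksize, int stride, int border)
{
  int i, oy, ox, ky, kx;
  for (i = 0; i < in_dim * in_dim; i++)
    dx[i] = 0;
  for (oy = 0; oy < out_dim; oy++)
    for (ox = 0; ox < out_dim; ox++)
    {
      int routed = 0; // give the gradient to the first cell that matches the forward output
      for (ky = 0; ky < ksize && !routed; ky++)
        for (kx = 0; kx < ksize && !routed; kx++)
        {
          const int iy = oy * stride - border + ky;
          const int ix = ox * stride - border + kx;
          if (iy >= 0 && iy < in_dim && ix >= 0 && ix < in_dim && x[iy * in_dim + ix] == y[oy * out_dim + ox])
          {
            dx[iy * in_dim + ix] += dy[oy * out_dim + ox];
            routed = 1;
          }
        }
    }
}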
3047
3048
TEST_CASE("compare max pooling gradient with cudnn in half precision")
3049
1
{
3050
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3051
1
    ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3052
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
3053
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
3054
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
3055
1
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
3056
1
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
3057
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3058
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3059
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3060
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3061
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3062
1
  dsfmt_t dsfmt;
3063
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3064
1
  int i;
3065
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
3066
91
  for (i = 0; i < 3 * 3 * 10; i++)
3067
90
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3068
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
3069
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), 0);
3070
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
3071
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
3072
1
  ccv_nnc_graph_t* graph = 0;
3073
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3074
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3075
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3076
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3077
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3078
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3079
491
  for (i = 0; i < 7 * 7 * 10; i++)
3080
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3081
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
3082
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3083
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3084
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3085
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3086
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
3087
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
3088
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3089
1
  ccv_nnc_cmd_exec(CMD_MAX_POOL_BACKWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(dy_tensor, x_tensor, y_tensor), TENSOR_LIST(dx_tensor), 0);
3090
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3091
1
  ccv_nnc_tensor_t* const cpu_dx16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
3092
1
  ccv_nnc_tensor_t* const cpu_dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3093
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(cpu_dx16), 0);
3094
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_dx16), TENSOR_LIST(cpu_dx), 0);
3095
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, cpu_dx->data.f32, 7 * 7 * 10, 5e-3, "cudnn result should equal to cpu result");
3096
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3097
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3098
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3099
1
  ccv_nnc_graph_free(graph);
3100
1
  ccv_nnc_tensor_free(x_tensor);
3101
1
  ccv_nnc_tensor_free(x16_tensor);
3102
1
  ccv_nnc_tensor_free(y_tensor);
3103
1
  ccv_nnc_tensor_free(dx_tensor);
3104
1
  ccv_nnc_tensor_free(dy_tensor);
3105
1
  ccv_nnc_tensor_free(dy16_tensor);
3106
1
  ccv_nnc_tensor_free(cpu_dx);
3107
1
  ccv_nnc_tensor_free(cpu_dx16);
3108
1
  ccv_nnc_tensor_free(dyt);
3109
1
}
3110
3111
TEST_CASE("compare relu with cudnn")
3112
1
{
3113
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3114
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
3115
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
3116
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "y");
3117
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
3118
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3119
1
  ccv_nnc_graph_t* graph = 0;
3120
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3121
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3122
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3123
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3124
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3125
1
  dsfmt_t dsfmt;
3126
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3127
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3128
1
  int i;
3129
491
  for (i = 0; i < 7 * 7 * 10; i++)
3130
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3131
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3132
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3133
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3134
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3135
1
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
3136
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3137
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3138
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
3139
1
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "cudnn result should equal to cpu result");
3140
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3141
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3142
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3143
1
  ccv_nnc_graph_free(graph);
3144
1
  ccv_nnc_tensor_free(x_tensor);
3145
1
  ccv_nnc_tensor_free(y_tensor);
3146
1
  ccv_nnc_tensor_free(cpu_y);
3147
1
}
3148
3149
TEST_CASE("compare relu with cudnn in half precision")
3150
1
{
3151
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3152
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
3153
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
3154
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "y");
3155
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
3156
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3157
1
  ccv_nnc_graph_t* graph = 0;
3158
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3159
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3160
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3161
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3162
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3163
1
  dsfmt_t dsfmt;
3164
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3165
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3166
1
  int i;
3167
491
  for (i = 0; i < 7 * 7 * 10; i++)
3168
490
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3169
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
3170
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3171
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3172
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3173
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3174
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3175
1
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
3176
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3177
1
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
3178
1
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
3179
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
3180
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
3181
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 7 * 7 * 10, 1e-3, "cudnn result should equal to cpu result");
3182
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3183
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3184
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3185
1
  ccv_nnc_graph_free(graph);
3186
1
  ccv_nnc_tensor_free(x_tensor);
3187
1
  ccv_nnc_tensor_free(x16_tensor);
3188
1
  ccv_nnc_tensor_free(y_tensor);
3189
1
  ccv_nnc_tensor_free(cpu_y);
3190
1
  ccv_nnc_tensor_free(cpu_y16);
3191
1
}
3192
3193
TEST_CASE("compare relu gradient with cudnn")
3194
1
{
3195
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3196
1
    ccv_nnc_cmd_ok(CCV_NNC_RELU_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3197
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
3198
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 10, 7, 7), "x");
3199
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 10, 7, 7), "y");
3200
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
3201
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3202
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3203
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3204
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3205
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3206
1
  dsfmt_t dsfmt;
3207
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3208
1
  int i;
3209
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3210
4.90k
  for (i = 0; i < 10 * 7 * 7 * 10; i++)
3211
4.90k
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3212
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 10, 7, 7), 0);
3213
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
3214
1
  ccv_nnc_graph_t* graph = 0;
3215
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3216
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3217
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3218
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3219
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3220
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3221
4.90k
  for (i = 0; i < 10 * 7 * 7 * 10; i++)
3222
4.90k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3223
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3224
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3225
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3226
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3227
1
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
3228
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3229
1
  ccv_nnc_cmd_exec(CMD_RELU_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, x_tensor, y_tensor), TENSOR_LIST(dx_tensor), 0);
3230
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3231
1
  ccv_nnc_tensor_t* const cpu_dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3232
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(cpu_dx), 0);
3233
1
  REQUIRE_TENSOR_EQ(dx_tensor, cpu_dx, "cudnn result should equal to cpu result");
3234
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3235
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3236
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3237
1
  ccv_nnc_graph_free(graph);
3238
1
  ccv_nnc_tensor_free(x_tensor);
3239
1
  ccv_nnc_tensor_free(y_tensor);
3240
1
  ccv_nnc_tensor_free(dx_tensor);
3241
1
  ccv_nnc_tensor_free(dy_tensor);
3242
1
  ccv_nnc_tensor_free(dyt);
3243
1
  ccv_nnc_tensor_free(cpu_dx);
3244
1
}
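// The ReLU gradient is the simplest case here: dx is dy wherever the forward
// input was positive and zero elsewhere. A short reference sketch
// (relu_backward_ref is illustrative, not the ccv_nnc kernel):
static void relu_backward_ref(const float* dy, const float* x, float* dx, int count)
{
  int i;
  for (i = 0; i < count; i++)
    dx[i] = x[i] > 0 ? dy[i] : 0; // pass the gradient through only where the input was positive
}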
3245
3246
TEST_CASE("compare relu gradient with cudnn in half precision")
3247
1
{
3248
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3249
1
    ccv_nnc_cmd_ok(CCV_NNC_RELU_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3250
1
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
3251
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 10, 7, 7), "x");
3252
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 10, 7, 7), "y");
3253
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
3254
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3255
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3256
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3257
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3258
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3259
1
  dsfmt_t dsfmt;
3260
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3261
1
  int i;
3262
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3263
4.90k
  for (i = 0; i < 10 * 7 * 7 * 10; i++)
3264
4.90k
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3265
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 10, 10, 7, 7), 0);
3266
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 10, 7, 7), 0);
3267
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
3268
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
3269
1
  ccv_nnc_graph_t* graph = 0;
3270
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3271
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3272
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3273
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3274
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3275
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3276
4.90k
  for (i = 0; i < 10 * 7 * 7 * 10; i++)
3277
4.90k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3278
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3279
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 10, 7, 7), 0);
3280
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3281
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3282
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3283
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3284
1
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
3285
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3286
1
  ccv_nnc_cmd_exec(CMD_RELU_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, x_tensor, y_tensor), TENSOR_LIST(dx_tensor), 0);
3287
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3288
1
  ccv_nnc_tensor_t* const cpu_dx16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 10, 7, 7), 0);
3289
1
  ccv_nnc_tensor_t* const cpu_dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 10, 7, 7), 0);
3290
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(cpu_dx16), 0);
3291
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_dx16), TENSOR_LIST(cpu_dx), 0);
3292
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, cpu_dx->data.f32, 10 * 10 * 7 * 7, 1e-3, "cudnn result should equal to cpu result");
3293
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3294
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3295
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3296
1
  ccv_nnc_graph_free(graph);
3297
1
  ccv_nnc_tensor_free(x_tensor);
3298
1
  ccv_nnc_tensor_free(x16_tensor);
3299
1
  ccv_nnc_tensor_free(y_tensor);
3300
1
  ccv_nnc_tensor_free(dx_tensor);
3301
1
  ccv_nnc_tensor_free(dy_tensor);
3302
1
  ccv_nnc_tensor_free(dy16_tensor);
3303
1
  ccv_nnc_tensor_free(dyt);
3304
1
  ccv_nnc_tensor_free(cpu_dx);
3305
1
  ccv_nnc_tensor_free(cpu_dx16);
3306
1
}
3307
3308
TEST_CASE("compare dropout with cudnn")
3309
1
{
3310
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3311
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3312
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 20 * 50), "x");
3313
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 20 * 50), "y");
3314
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, ccv_nnc_tensor_auto, "c");
3315
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DROPOUT_FORWARD(0.4), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y, c), "dropout");
3316
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3317
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3318
1
  ccv_nnc_graph_t* graph = 0;
3319
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3320
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3321
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3322
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3323
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3324
1
  int i;
3325
1.00k
  for (i = 0; i < 20 * 50; i++)
3326
1.00k
    x_tensor->data.f32[i] = (i + 1) * 0.01;
3327
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3328
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3329
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3330
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3331
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3332
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
3333
1
  int zero_count = 0;
3334
1.00k
  for (i = 0; i < 20 * 50; i++)
3335
1.00k
    if (fabsf(y_tensor->data.f32[i]) < 1e-5)
3336
386
      ++zero_count;
3337
614
    else {
3338
614
      REQUIRE_EQ_WITH_TOLERANCE(x_tensor->data.f32[i] / 0.6, y_tensor->data.f32[i], 1e-5, "should be scaled up by 1 / 0.6");
3339
614
    }
3340
1
  REQUIRE_EQ_WITH_TOLERANCE((float)zero_count / (20 * 50), 0.4, 5 * 1e-2, "should be within 5%% of error");
3341
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3342
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3343
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3344
1
  ccv_nnc_graph_free(graph);
3345
1
  ccv_nnc_tensor_free(x_tensor);
3346
1
  ccv_nnc_tensor_free(y_tensor);
3347
1
}
3348
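The dropout tests assume inverted dropout: with drop probability p = 0.4, a kept activation is scaled by 1 / (1 - p) = 1 / 0.6 at training time, which is why the loop above accepts either an exact zero or x / 0.6 and then checks that the observed drop fraction lands near 0.4. A minimal sketch of that acceptance rule, assuming only the scaling convention visible in the assertions (check_inverted_dropout is a hypothetical helper):

#include <math.h>
#include <stddef.h>

// Verify an inverted-dropout output: every element is either dropped (zero)
// or the input rescaled by 1 / (1 - p). Returns the observed drop fraction,
// which should land near p for a large enough tensor, or -1 on a mismatch.
static float check_inverted_dropout(const float* x, const float* y, size_t n, float p)
{
  size_t i, zeros = 0;
  for (i = 0; i < n; i++)
  {
    if (fabsf(y[i]) < 1e-5f)
      ++zeros;
    else if (fabsf(y[i] - x[i] / (1 - p)) > 1e-5f)
      return -1; // neither dropped nor correctly rescaled
  }
  return (float)zeros / n;
}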
3349
TEST_CASE("compare dropout with cudnn in half precision")
3350
1
{
3351
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3352
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3353
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 20 * 50), "x");
3354
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 20 * 50), "y");
3355
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, ccv_nnc_tensor_auto, "c");
3356
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DROPOUT_FORWARD(0.4), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y, c), "dropout");
3357
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3358
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3359
1
  ccv_nnc_graph_t* graph = 0;
3360
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3361
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3362
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3363
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3364
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3365
1
  int i;
3366
1.00k
  for (i = 0; i < 20 * 50; i++)
3367
1.00k
    x_tensor->data.f32[i] = (i + 1) * 0.01;
3368
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 20 * 50), 0);
3369
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3370
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3371
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3372
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3373
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3374
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3375
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 20 * 50), 0);
3376
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y16_tensor), 0);
3377
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
3378
1
  int zero_count = 0;
3379
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3380
1.00k
    if (fabsf(y_tensor->data.f32[i]) < 1e-5)
3381
401
      ++zero_count;
3382
599
    else {
3383
599
      REQUIRE_EQ_WITH_TOLERANCE(x_tensor->data.f32[i] / 0.6, y_tensor->data.f32[i], x_tensor->data.f32[i] * 2e-3, "should be scaled up by 1 / 0.6");
3384
599
    }
3385
1
  REQUIRE_EQ_WITH_TOLERANCE((float)zero_count / (20 * 50), 0.4, 5 * 1e-2, "should be within 5%% of error");
3386
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3387
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3388
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3389
1
  ccv_nnc_graph_free(graph);
3390
1
  ccv_nnc_tensor_free(x_tensor);
3391
1
  ccv_nnc_tensor_free(x16_tensor);
3392
1
  ccv_nnc_tensor_free(y_tensor);
3393
1
  ccv_nnc_tensor_free(y16_tensor);
3394
1
}
3395
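Every half-precision variant in this file stages data the same way: convert 32F to 16F on the CPU, transfer to the 16F GPU tensor, run the graph, then transfer back into a 16F CPU tensor and convert to 32F before comparing with a loosened tolerance (1e-3 instead of exact equality). A minimal sketch of the upload half of that round trip, using the same CMD_DATATYPE_CONVERSION_FORWARD / CMD_DATA_TRANSFER_FORWARD commands seen above (upload_f32_as_f16 is a hypothetical helper):

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

// Stage a 32F CPU tensor into a 16F GPU tensor: convert on the CPU first,
// then copy the half-precision buffer to the device tensor.
static void upload_f32_as_f16(ccv_nnc_tensor_t* const cpu32, ccv_nnc_tensor_t* const cpu16, ccv_nnc_tensor_t* const gpu16)
{
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu32), TENSOR_LIST(cpu16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu16), TENSOR_LIST(gpu16), 0);
}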
3396
TEST_CASE("compare dropout gradient with cudnn")
3397
1
{
3398
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3399
1
    ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3400
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3401
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 20 * 50), "x");
3402
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 20 * 50), "y");
3403
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, ccv_nnc_tensor_auto, "c");
3404
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DROPOUT_FORWARD(0.4), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y, c), "dropout");
3405
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3406
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3407
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3408
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3409
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3410
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3411
1
  int i;
3412
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3413
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3414
1.00k
    dy_tensor->data.f32[i] = i + 1;
3415
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20 * 50), 0);
3416
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
3417
1
  ccv_nnc_graph_t* graph = 0;
3418
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3419
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3420
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3421
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3422
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3423
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3424
1.00k
    x_tensor->data.f32[i] = (i + 1) * 0.01;
3425
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3426
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3427
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3428
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3429
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3430
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
3431
1
  int zero_count = 0;
3432
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3433
1.00k
    if (fabsf(dx_tensor->data.f32[i]) < 1e-5)
3434
387
      ++zero_count;
3435
613
    else {
3436
613
      REQUIRE_EQ_WITH_TOLERANCE(dx_tensor->data.f32[i], dy_tensor->data.f32[i] / 0.6, 1e-3, "should match the gradient");
3437
613
    }
3438
1
  REQUIRE_EQ_WITH_TOLERANCE((float)zero_count / (20 * 50), 0.4, 5 * 1e-2, "should be within 5%% of error");
3439
1
  ccv_nnc_graph_free(graph);
3440
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3441
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3442
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3443
1
  ccv_nnc_tensor_free(x_tensor);
3444
1
  ccv_nnc_tensor_free(dy_tensor);
3445
1
  ccv_nnc_tensor_free(dyt);
3446
1
  ccv_nnc_tensor_free(dx_tensor);
3447
1
}
3448
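The backward pass reuses the auxiliary tensor c produced by the forward dropout, so the surviving positions of dx line up with the surviving positions of y and the test only has to check dx == dy / 0.6 wherever dx is non-zero. A minimal sketch of that relationship, assuming a plain 0/1 keep mask rather than cudnn's internal reserve-space format (dropout_backward_ref and its mask argument are ours):

#include <stddef.h>

// Reference dropout gradient with an explicit keep mask: dropped positions
// contribute nothing, kept positions pass the gradient scaled by 1 / (1 - p).
static void dropout_backward_ref(const float* dy, const unsigned char* keep, float* dx, size_t n, float p)
{
  size_t i;
  for (i = 0; i < n; i++)
    dx[i] = keep[i] ? dy[i] / (1 - p) : 0;
}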
3449
TEST_CASE("compare dropout gradient with cudnn in half precision")
3450
1
{
3451
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3452
1
    ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3453
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3454
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 20 * 50), "x");
3455
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 20 * 50), "y");
3456
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, ccv_nnc_tensor_auto, "c");
3457
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DROPOUT_FORWARD(0.4), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y, c), "dropout");
3458
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3459
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3460
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3461
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3462
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3463
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3464
1
  int i;
3465
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3466
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3467
1.00k
    dy_tensor->data.f32[i] = i + 1;
3468
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 20 * 50), 0);
3469
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 20 * 50), 0);
3470
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
3471
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
3472
1
  ccv_nnc_graph_t* graph = 0;
3473
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3474
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3475
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3476
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3477
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3478
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3479
1.00k
    x_tensor->data.f32[i] = (i + 1) * 0.01;
3480
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 20 * 50), 0);
3481
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3482
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3483
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3484
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3485
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3486
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20 * 50), 0);
3487
1
  ccv_nnc_tensor_t* const dx16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 20 * 50), 0);
3488
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx16_tensor), 0);
3489
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dx16_tensor), TENSOR_LIST(dx_tensor), 0);
3490
1
  int zero_count = 0;
3491
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3492
1.00k
    if (fabsf(dx_tensor->data.f32[i]) < 1e-5)
3493
401
      ++zero_count;
3494
599
    else {
3495
599
      REQUIRE_EQ_WITH_TOLERANCE(dx_tensor->data.f32[i], dy_tensor->data.f32[i] / 0.6, dx_tensor->data.f32[i] * 1e-3, "should match the gradient");
3496
599
    }
3497
1
  REQUIRE_EQ_WITH_TOLERANCE((float)zero_count / (20 * 50), 0.4, 5 * 1e-2, "should be within 5%% of error");
3498
1
  ccv_nnc_graph_free(graph);
3499
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3500
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3501
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3502
1
  ccv_nnc_tensor_free(x_tensor);
3503
1
  ccv_nnc_tensor_free(x16_tensor);
3504
1
  ccv_nnc_tensor_free(dy_tensor);
3505
1
  ccv_nnc_tensor_free(dy16_tensor);
3506
1
  ccv_nnc_tensor_free(dyt);
3507
1
  ccv_nnc_tensor_free(dx_tensor);
3508
1
  ccv_nnc_tensor_free(dx16_tensor);
3509
1
}
3510
3511
TEST_CASE("dropout entire matrix with 20% chance")
3512
1
{
3513
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3514
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3515
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3516
1
  int i;
3517
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3518
1.00k
    ha->data.f32[i] = (i + 1) * 0.01;
3519
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20, 50), 0);
3520
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
3521
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20, 50), 0);
3522
1
  ccv_nnc_tensor_param_t output_info[2];
3523
1
  ccv_nnc_hint_tensor_auto(CMD_DROPOUT_FORWARD(0.4), &a->info, 1, ccv_nnc_no_hint, output_info, 2);
3524
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, output_info[1], 0);
3525
1
  ccv_nnc_cmd_exec(CMD_DROPOUT_FORWARD(0.2, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b, c), 0);
3526
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
3527
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3528
1
  if (hb->data.f32[0] == 0)
3529
0
    for (i = 0; i < 20 * 50; i++)
3530
0
      d->data.f32[i] = 0;
3531
1
  else
3532
1.00k
    for (i = 0; i < 20 * 50; i++)
3533
1.00k
      d->data.f32[i] = ha->data.f32[i] / 0.8;
3534
1
  REQUIRE_TENSOR_EQ(hb, d, "dropout chance should be equal");
3535
1
  ccv_nnc_tensor_free(ha);
3536
1
  ccv_nnc_tensor_free(hb);
3537
1
  ccv_nnc_tensor_free(a);
3538
1
  ccv_nnc_tensor_free(b);
3539
1
  ccv_nnc_tensor_free(c);
3540
1
  ccv_nnc_tensor_free(d);
3541
1
}
3542
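The second argument to CMD_DROPOUT_FORWARD(0.2, 1) selects the entire-matrix mode exercised here: instead of dropping individual elements, one draw decides the whole tensor, which is either zeroed (with probability p) or uniformly rescaled by 1 / (1 - p). That is why the expected tensor d is built from a single branch on hb->data.f32[0]. A minimal sketch of the expected output under that convention (entire_dropout_expected is a hypothetical helper):

#include <stddef.h>

// Expected output of entire-matrix dropout: one Bernoulli draw decides the
// fate of the whole tensor, so the result is all zeros or a uniform rescale.
static void entire_dropout_expected(const float* x, float* d, size_t n, float p, int dropped)
{
  size_t i;
  for (i = 0; i < n; i++)
    d[i] = dropped ? 0 : x[i] / (1 - p);
}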
3543
TEST_CASE("dropout gradient entire matrix with 20% chance")
3544
1
{
3545
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3546
1
    ccv_nnc_cmd_ok(CCV_NNC_DROPOUT_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3547
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3548
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3549
1
  int i;
3550
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3551
1.00k
    ha->data.f32[i] = (i + 1) * 0.01;
3552
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20, 50), 0);
3553
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
3554
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20, 50), 0);
3555
1
  ccv_nnc_tensor_param_t output_info[2];
3556
1
  ccv_nnc_hint_tensor_auto(CMD_DROPOUT_FORWARD(0.4), &a->info, 1, ccv_nnc_no_hint, output_info, 2);
3557
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, output_info[1], 0);
3558
1
  ccv_nnc_cmd_exec(CMD_DROPOUT_FORWARD(0.2, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b, c), 0);
3559
1
  ccv_nnc_tensor_t* const hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3560
1.00k
  for (i = 0; i < 20 * 50; 
i++1.00k
)
3561
1.00k
    hg->data.f32[i] = i + 1;
3562
1
  ccv_nnc_tensor_t* const hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3563
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20, 50), 0);
3564
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg), TENSOR_LIST(g), 0);
3565
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 20, 50), 0);
3566
1
  ccv_nnc_cmd_exec(CMD_DROPOUT_BACKWARD(0.2, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, 0, 0, c), TENSOR_LIST(h), 0);
3567
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, h), TENSOR_LIST(hb, hh), 0);
3568
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
3569
1
  if (hb->data.f32[0] == 0)
3570
0
    for (i = 0; i < 20 * 50; i++)
3571
0
      d->data.f32[i] = 0;
3572
1
  else
3573
1.00k
    for (i = 0; i < 20 * 50; i++)
3574
1.00k
      d->data.f32[i] = hg->data.f32[i] / 0.8;
3575
1
  REQUIRE_TENSOR_EQ(hh, d, "dropout chance should be equal");
3576
1
  ccv_nnc_tensor_free(ha);
3577
1
  ccv_nnc_tensor_free(hb);
3578
1
  ccv_nnc_tensor_free(hg);
3579
1
  ccv_nnc_tensor_free(hh);
3580
1
  ccv_nnc_tensor_free(a);
3581
1
  ccv_nnc_tensor_free(b);
3582
1
  ccv_nnc_tensor_free(c);
3583
1
  ccv_nnc_tensor_free(g);
3584
1
  ccv_nnc_tensor_free(h);
3585
1
  ccv_nnc_tensor_free(d);
3586
1
}
3587
3588
TEST_CASE("compare softmax with cudnn")
3589
1
{
3590
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3591
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3592
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
3593
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
3594
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
3595
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3596
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3597
1
  ccv_nnc_graph_t* graph = 0;
3598
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3599
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3600
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3601
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3602
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3603
1
  dsfmt_t dsfmt;
3604
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3605
1
  int i;
3606
201
  for (i = 0; i < 20 * 10; 
i++200
)
3607
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3608
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3609
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
3610
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3611
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3612
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3613
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
3614
1
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3615
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
3616
1
  REQUIRE_TENSOR_EQ(ty, y_tensor, "softmax from cudnn should match from CPU");
3617
1
  ccv_nnc_tensor_free(x_tensor);
3618
1
  ccv_nnc_tensor_free(y_tensor);
3619
1
  ccv_nnc_tensor_free(ty);
3620
1
  ccv_nnc_graph_free(graph);
3621
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3622
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3623
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3624
1
}
3625
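The CPU reference CMD_SOFTMAX_FORWARD used above normalizes each row of the 20x10 input independently. A minimal sketch of that row-wise computation with the usual max subtraction for numerical stability (softmax_forward_ref is ours; the actual CPU backend may differ in detail):

#include <math.h>

// Row-wise softmax: subtract the row max before exponentiating so large
// inputs do not overflow, then normalize each row to sum to one.
static void softmax_forward_ref(const float* x, float* y, int rows, int cols)
{
  int i, j;
  for (i = 0; i < rows; i++)
  {
    float maxval = x[i * cols];
    for (j = 1; j < cols; j++)
      if (x[i * cols + j] > maxval)
        maxval = x[i * cols + j];
    float sum = 0;
    for (j = 0; j < cols; j++)
      sum += y[i * cols + j] = expf(x[i * cols + j] - maxval);
    for (j = 0; j < cols; j++)
      y[i * cols + j] /= sum;
  }
}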
3626
TEST_CASE("compare softmax with cudnn in half precision")
3627
1
{
3628
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3629
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3630
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
3631
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
3632
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
3633
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3634
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3635
1
  ccv_nnc_graph_t* graph = 0;
3636
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3637
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3638
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3639
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3640
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3641
1
  dsfmt_t dsfmt;
3642
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3643
1
  int i;
3644
201
  for (i = 0; i < 20 * 10; 
i++200
)
3645
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3646
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3647
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3648
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3649
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
3650
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3651
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3652
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3653
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3654
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
3655
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
3656
1
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3657
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
3658
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "softmax from cudnn should match from CPU");
3659
1
  ccv_nnc_tensor_free(x_tensor);
3660
1
  ccv_nnc_tensor_free(x16_tensor);
3661
1
  ccv_nnc_tensor_free(y16_tensor);
3662
1
  ccv_nnc_tensor_free(y_tensor);
3663
1
  ccv_nnc_tensor_free(ty);
3664
1
  ccv_nnc_graph_free(graph);
3665
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3666
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3667
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3668
1
}
3669
3670
TEST_CASE("compare softmax gradient with cudnn")
3671
1
{
3672
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3673
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3674
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3675
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
3676
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
3677
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "softmax");
3678
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3679
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3680
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3681
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3682
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3683
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3684
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3685
1
  dsfmt_t dsfmt;
3686
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3687
1
  int i;
3688
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3689
1.00k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3690
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3691
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3692
1.00k
    dy_tensor->data.f32[i] = 0;
3693
11
  for (i = 0; i < 10; 
i++10
)
3694
10
    dy_tensor->data.f32[i * 100 + i] = 1;
3695
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
3696
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
3697
1
  ccv_nnc_graph_t* graph = 0;
3698
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3699
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3700
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3701
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3702
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3703
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3704
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3705
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3706
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3707
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3708
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3709
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
3710
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
3711
1
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3712
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
3713
1
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
3714
1
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3715
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
3716
1
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
3717
1
  ccv_nnc_tensor_free(x_tensor);
3718
1
  ccv_nnc_tensor_free(y_tensor);
3719
1
  ccv_nnc_tensor_free(dx_tensor);
3720
1
  ccv_nnc_tensor_free(dy_tensor);
3721
1
  ccv_nnc_tensor_free(ty_tensor);
3722
1
  ccv_nnc_tensor_free(tdx_tensor);
3723
1
  ccv_nnc_tensor_free(dyt);
3724
1
  ccv_nnc_graph_free(graph);
3725
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3726
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3727
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3728
1
}
3729
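The CMD_SOFTMAX_BACKWARD reference above takes dy and the cached forward output y (the middle input slot is left 0) and produces dx via the softmax Jacobian-vector product, dx_i = y_i * (dy_i - sum_j dy_j * y_j) per row. A minimal sketch of that formula (softmax_backward_ref is ours, matching the math rather than any particular backend):

// Softmax gradient per row: dx = y * (dy - dot(dy, y)), the standard
// Jacobian-vector product of the softmax function.
static void softmax_backward_ref(const float* dy, const float* y, float* dx, int rows, int cols)
{
  int i, j;
  for (i = 0; i < rows; i++)
  {
    float dot = 0;
    for (j = 0; j < cols; j++)
      dot += dy[i * cols + j] * y[i * cols + j];
    for (j = 0; j < cols; j++)
      dx[i * cols + j] = y[i * cols + j] * (dy[i * cols + j] - dot);
  }
}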
3730
TEST_CASE("compare softmax gradient with cudnn in half precision")
3731
1
{
3732
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3733
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3734
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3735
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "x");
3736
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "y");
3737
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "softmax");
3738
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3739
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3740
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3741
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3742
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3743
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3744
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3745
1
  dsfmt_t dsfmt;
3746
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3747
1
  int i;
3748
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3749
1.00k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3750
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3751
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3752
1.00k
    dy_tensor->data.f32[i] = 0;
3753
11
  for (i = 0; i < 10; 
i++10
)
3754
10
    dy_tensor->data.f32[i * 100 + i] = 1;
3755
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3756
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
3757
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
3758
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
3759
1
  ccv_nnc_graph_t* graph = 0;
3760
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3761
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3762
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3763
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3764
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3765
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3766
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3767
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3768
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3769
1
  ccv_nnc_tensor_t* const dx16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3770
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3771
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3772
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3773
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3774
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3775
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx16_tensor), 0);
3776
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dx16_tensor), TENSOR_LIST(dx_tensor), 0);
3777
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y16_tensor), 0);
3778
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
3779
1
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3780
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
3781
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty_tensor->data.f32, y_tensor->data.f32, 10 * 100, 1e-3, "forward pass should match");
3782
1
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3783
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
3784
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdx_tensor->data.f32, dx_tensor->data.f32, 10 * 100, 1e-3, "backward pass should match");
3785
1
  ccv_nnc_tensor_free(x_tensor);
3786
1
  ccv_nnc_tensor_free(x16_tensor);
3787
1
  ccv_nnc_tensor_free(y_tensor);
3788
1
  ccv_nnc_tensor_free(y16_tensor);
3789
1
  ccv_nnc_tensor_free(dx_tensor);
3790
1
  ccv_nnc_tensor_free(dx16_tensor);
3791
1
  ccv_nnc_tensor_free(dy_tensor);
3792
1
  ccv_nnc_tensor_free(dy16_tensor);
3793
1
  ccv_nnc_tensor_free(ty_tensor);
3794
1
  ccv_nnc_tensor_free(tdx_tensor);
3795
1
  ccv_nnc_tensor_free(dyt);
3796
1
  ccv_nnc_graph_free(graph);
3797
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3798
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3799
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3800
1
}
3801
3802
TEST_CASE("compare sigmoid with cudnn")
3803
1
{
3804
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3805
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3806
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
3807
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
3808
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
3809
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3810
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3811
1
  ccv_nnc_graph_t* graph = 0;
3812
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3813
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3814
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3815
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3816
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3817
1
  dsfmt_t dsfmt;
3818
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3819
1
  int i;
3820
201
  for (i = 0; i < 20 * 10; 
i++200
)
3821
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3822
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3823
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
3824
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3825
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3826
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3827
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
3828
1
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3829
1
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
3830
1
  REQUIRE_TENSOR_EQ(ty, y_tensor, "sigmoid from cudnn should match from CPU");
3831
1
  ccv_nnc_tensor_free(x_tensor);
3832
1
  ccv_nnc_tensor_free(y_tensor);
3833
1
  ccv_nnc_tensor_free(ty);
3834
1
  ccv_nnc_graph_free(graph);
3835
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3836
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3837
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3838
1
}
3839
3840
TEST_CASE("compare sigmoid with cudnn in half precision")
3841
1
{
3842
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3843
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3844
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
3845
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
3846
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
3847
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3848
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3849
1
  ccv_nnc_graph_t* graph = 0;
3850
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3851
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3852
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3853
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3854
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3855
1
  dsfmt_t dsfmt;
3856
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3857
1
  int i;
3858
201
  for (i = 0; i < 20 * 10; 
i++200
)
3859
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3860
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3861
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3862
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3863
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
3864
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3865
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3866
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3867
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3868
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
3869
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
3870
1
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3871
1
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
3872
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "sigmoid from cudnn should match from CPU");
3873
1
  ccv_nnc_tensor_free(x_tensor);
3874
1
  ccv_nnc_tensor_free(x16_tensor);
3875
1
  ccv_nnc_tensor_free(y16_tensor);
3876
1
  ccv_nnc_tensor_free(y_tensor);
3877
1
  ccv_nnc_tensor_free(ty);
3878
1
  ccv_nnc_graph_free(graph);
3879
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3880
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3881
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3882
1
}
3883
3884
TEST_CASE("compare sigmoid gradient with cudnn")
3885
1
{
3886
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3887
1
    ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3888
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3889
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
3890
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
3891
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "sigmoid");
3892
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3893
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3894
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3895
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3896
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3897
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3898
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3899
1
  dsfmt_t dsfmt;
3900
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3901
1
  int i;
3902
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3903
1.00k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3904
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3905
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3906
1.00k
    dy_tensor->data.f32[i] = 0;
3907
11
  for (i = 0; i < 10; 
i++10
)
3908
10
    dy_tensor->data.f32[i * 100 + i] = 1;
3909
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
3910
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
3911
1
  ccv_nnc_graph_t* graph = 0;
3912
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3913
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3914
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3915
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3916
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3917
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
3918
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3919
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3920
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3921
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3922
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3923
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
3924
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
3925
1
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3926
1
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
3927
1
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
3928
1
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3929
1
  ccv_nnc_cmd_exec(CMD_SIGMOID_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
3930
1
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
3931
1
  ccv_nnc_tensor_free(x_tensor);
3932
1
  ccv_nnc_tensor_free(y_tensor);
3933
1
  ccv_nnc_tensor_free(dx_tensor);
3934
1
  ccv_nnc_tensor_free(dy_tensor);
3935
1
  ccv_nnc_tensor_free(ty_tensor);
3936
1
  ccv_nnc_tensor_free(tdx_tensor);
3937
1
  ccv_nnc_tensor_free(dyt);
3938
1
  ccv_nnc_graph_free(graph);
3939
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3940
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3941
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3942
1
}
3943
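The CMD_SIGMOID_BACKWARD reference above receives dy and the cached forward output y (again with the middle input slot 0), which is enough because the sigmoid derivative can be written purely in terms of the output: dx = dy * y * (1 - y). A minimal sketch (sigmoid_backward_ref is ours):

#include <stddef.h>

// Sigmoid gradient from the cached forward output: d/dx sigmoid(x) = y * (1 - y).
static void sigmoid_backward_ref(const float* dy, const float* y, float* dx, size_t n)
{
  size_t i;
  for (i = 0; i < n; i++)
    dx[i] = dy[i] * y[i] * (1 - y[i]);
}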
3944
TEST_CASE("compare sigmoid gradient with cudnn in half precision")
3945
1
{
3946
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
3947
1
    ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
3948
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3949
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "x");
3950
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "y");
3951
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "sigmoid");
3952
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3953
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3954
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3955
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3956
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3957
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3958
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3959
1
  dsfmt_t dsfmt;
3960
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3961
1
  int i;
3962
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3963
1.00k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3964
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3965
1.00k
  for (i = 0; i < 10 * 100; 
i++1.00k
)
3966
1.00k
    dy_tensor->data.f32[i] = 0;
3967
11
  for (i = 0; i < 10; 
i++10
)
3968
10
    dy_tensor->data.f32[i * 100 + i] = 1;
3969
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3970
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
3971
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
3972
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
3973
1
  ccv_nnc_graph_t* graph = 0;
3974
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3975
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3976
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3977
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3978
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3979
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3980
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3981
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
3982
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3983
1
  ccv_nnc_tensor_t* const dx16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3984
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3985
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3986
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
3987
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3988
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
3989
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx16_tensor), 0);
3990
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dx16_tensor), TENSOR_LIST(dx_tensor), 0);
3991
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y16_tensor), 0);
3992
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
3993
1
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3994
1
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
3995
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty_tensor->data.f32, y_tensor->data.f32, 10 * 100, 1e-3, "forward pass should match");
3996
1
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
3997
1
  ccv_nnc_cmd_exec(CMD_SIGMOID_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
3998
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdx_tensor->data.f32, dx_tensor->data.f32, 10 * 100, 1e-3, "backward pass should match");
3999
1
  ccv_nnc_tensor_free(x_tensor);
4000
1
  ccv_nnc_tensor_free(x16_tensor);
4001
1
  ccv_nnc_tensor_free(y_tensor);
4002
1
  ccv_nnc_tensor_free(y16_tensor);
4003
1
  ccv_nnc_tensor_free(dx_tensor);
4004
1
  ccv_nnc_tensor_free(dx16_tensor);
4005
1
  ccv_nnc_tensor_free(dy_tensor);
4006
1
  ccv_nnc_tensor_free(dy16_tensor);
4007
1
  ccv_nnc_tensor_free(ty_tensor);
4008
1
  ccv_nnc_tensor_free(tdx_tensor);
4009
1
  ccv_nnc_tensor_free(dyt);
4010
1
  ccv_nnc_graph_free(graph);
4011
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4012
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4013
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4014
1
}
4015
4016
TEST_CASE("compare tanh with cudnn")
4017
1
{
4018
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4019
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4020
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
4021
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
4022
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "tanh");
4023
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4024
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4025
1
  ccv_nnc_graph_t* graph = 0;
4026
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4027
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4028
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4029
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4030
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4031
1
  dsfmt_t dsfmt;
4032
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4033
1
  int i;
4034
201
  for (i = 0; i < 20 * 10; 
i++200
)
4035
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4036
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
4037
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
4038
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4039
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4040
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
4041
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
4042
1
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4043
1
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
4044
1
  REQUIRE_TENSOR_EQ(ty, y_tensor, "tanh from cudnn should match from CPU");
4045
1
  ccv_nnc_tensor_free(x_tensor);
4046
1
  ccv_nnc_tensor_free(y_tensor);
4047
1
  ccv_nnc_tensor_free(ty);
4048
1
  ccv_nnc_graph_free(graph);
4049
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4050
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4051
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4052
1
}
4053
4054
TEST_CASE("compare tanh with cudnn in half precision")
4055
1
{
4056
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4057
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4058
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
4059
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
4060
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "tanh");
4061
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4062
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4063
1
  ccv_nnc_graph_t* graph = 0;
4064
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4065
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4066
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4067
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4068
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4069
1
  dsfmt_t dsfmt;
4070
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4071
1
  int i;
4072
201
  for (i = 0; i < 20 * 10; 
i++200
)
4073
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4074
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
4075
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4076
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
4077
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
4078
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4079
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4080
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4081
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
4082
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
4083
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
4084
1
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4085
1
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
4086
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "tanh from cudnn should match from CPU");
4087
1
  ccv_nnc_tensor_free(x_tensor);
4088
1
  ccv_nnc_tensor_free(x16_tensor);
4089
1
  ccv_nnc_tensor_free(y16_tensor);
4090
1
  ccv_nnc_tensor_free(y_tensor);
4091
1
  ccv_nnc_tensor_free(ty);
4092
1
  ccv_nnc_graph_free(graph);
4093
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4094
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4095
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4096
1
}
4097
4098
TEST_CASE("compare tanh gradient with cudnn")
4099
1
{
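  // Gradient check: autogenerate the backward graph for y = tanh(x), bind a fixed dy on the GPU, then verify both y and dx against the CPU tanh forward/backward kernels.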
4100
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4101
1
    ccv_nnc_cmd_ok(CCV_NNC_TANH_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4102
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4103
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
4104
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
4105
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "tanh");
4106
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4107
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
4108
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4109
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4110
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
4111
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
4112
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4113
1
  dsfmt_t dsfmt;
4114
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4115
1
  int i;
4116
1.00k
  for (i = 0; i < 10 * 100; i++)
4117
1.00k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4118
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4119
1.00k
  for (i = 0; i < 10 * 100; i++)
4120
1.00k
    dy_tensor->data.f32[i] = 0;
4121
11
  for (i = 0; i < 10; i++)
4122
10
    dy_tensor->data.f32[i * 100 + i] = 1;
4123
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4124
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
4125
1
  ccv_nnc_graph_t* graph = 0;
4126
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4127
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4128
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4129
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4130
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
4131
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
4132
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4133
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4134
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
4135
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4136
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
4137
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
4138
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
4139
1
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4140
1
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
4141
1
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
4142
1
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4143
1
  ccv_nnc_cmd_exec(CMD_TANH_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
4144
1
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
4145
1
  ccv_nnc_tensor_free(x_tensor);
4146
1
  ccv_nnc_tensor_free(y_tensor);
4147
1
  ccv_nnc_tensor_free(dx_tensor);
4148
1
  ccv_nnc_tensor_free(dy_tensor);
4149
1
  ccv_nnc_tensor_free(ty_tensor);
4150
1
  ccv_nnc_tensor_free(tdx_tensor);
4151
1
  ccv_nnc_tensor_free(dyt);
4152
1
  ccv_nnc_graph_free(graph);
4153
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4154
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4155
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4156
1
}
4157
4158
TEST_CASE("compare tanh gradient with cudnn in half precision")
4159
1
{
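  // Same gradient check as above, but with 16F GPU tensors and FP16<->FP32 conversions around each transfer; comparisons use a 1e-3 tolerance.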
4160
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4161
1
    ccv_nnc_cmd_ok(CCV_NNC_TANH_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4162
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4163
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "x");
4164
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "y");
4165
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "tanh");
4166
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4167
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
4168
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4169
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4170
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
4171
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
4172
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4173
1
  dsfmt_t dsfmt;
4174
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4175
1
  int i;
4176
1.00k
  for (i = 0; i < 10 * 100; i++)
4177
1.00k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4178
1
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4179
1.00k
  for (i = 0; i < 10 * 100; i++)
4180
1.00k
    dy_tensor->data.f32[i] = 0;
4181
11
  for (i = 0; i < 10; i++)
4182
10
    dy_tensor->data.f32[i * 100 + i] = 1;
4183
1
  ccv_nnc_tensor_t* const dy16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4184
1
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4185
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dy16_tensor), 0);
4186
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy16_tensor), TENSOR_LIST(dyt), 0);
4187
1
  ccv_nnc_graph_t* graph = 0;
4188
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4189
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4190
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4191
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4192
1
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
4193
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4194
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
4195
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
4196
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4197
1
  ccv_nnc_tensor_t* const dx16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4198
1
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4199
1
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
4200
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4201
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4202
1
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
4203
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx16_tensor), 0);
4204
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dx16_tensor), TENSOR_LIST(dx_tensor), 0);
4205
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y16_tensor), 0);
4206
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
4207
1
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4208
1
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
4209
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty_tensor->data.f32, y_tensor->data.f32, 10 * 100, 1e-3, "forward pass should match");
4210
1
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4211
1
  ccv_nnc_cmd_exec(CMD_TANH_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
4212
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdx_tensor->data.f32, dx_tensor->data.f32, 10 * 100, 1e-3, "backward pass should match");
4213
1
  ccv_nnc_tensor_free(x_tensor);
4214
1
  ccv_nnc_tensor_free(x16_tensor);
4215
1
  ccv_nnc_tensor_free(y_tensor);
4216
1
  ccv_nnc_tensor_free(y16_tensor);
4217
1
  ccv_nnc_tensor_free(dx_tensor);
4218
1
  ccv_nnc_tensor_free(dx16_tensor);
4219
1
  ccv_nnc_tensor_free(dy_tensor);
4220
1
  ccv_nnc_tensor_free(dy16_tensor);
4221
1
  ccv_nnc_tensor_free(ty_tensor);
4222
1
  ccv_nnc_tensor_free(tdx_tensor);
4223
1
  ccv_nnc_tensor_free(dyt);
4224
1
  ccv_nnc_graph_free(graph);
4225
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4226
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4227
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4228
1
}
4229
4230
TEST_CASE("compare add with cudnn")
4231
1
{
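  // Broadcasted add with coefficients 0.5 and 0.2: b is broadcast along its singleton third axis; the GPU result is compared against the CPU CMD_ADD_FORWARD output.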
4232
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4233
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4234
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
4235
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
4236
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
4237
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
4238
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
4239
1
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
4240
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
4241
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
4242
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z), "transfer");
4243
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4244
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4245
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4246
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
4247
1
  ccv_nnc_graph_t* graph = 0;
4248
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4249
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4250
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4251
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4252
1
  dsfmt_t dsfmt;
4253
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4254
1
  int i;
4255
751
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
4256
750
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4257
151
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
4258
150
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4259
1
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4260
1
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
4261
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4262
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
4263
1
  REQUIRE_TENSOR_EQ(zt, z_tensor, "add should match");
4264
1
  ccv_nnc_tensor_free(x_tensor);
4265
1
  ccv_nnc_tensor_free(y_tensor);
4266
1
  ccv_nnc_tensor_free(zt);
4267
1
  ccv_nnc_graph_free(graph);
4268
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4269
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4270
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4271
1
}
4272
4273
TEST_CASE("compare add with cudnn in half precision")
4274
1
{
4275
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4276
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4277
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
4278
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
4279
1
  ccv_nnc_tensor_symbol_t x16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "x 16");
4280
1
  ccv_nnc_tensor_symbol_t y16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 1, 3), "y 16");
4281
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "a");
4282
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 1, 3), "b");
4283
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "c");
4284
1
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
4285
1
  ccv_nnc_tensor_symbol_t z16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "z 16");
4286
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(x16, y16), "convert");
4287
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x16, y16), TENSOR_SYMBOL_LIST(a, b), "transfer");
4288
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
4289
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z16), "transfer");
4290
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(z16), TENSOR_SYMBOL_LIST(z), "convert");
4291
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4292
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4293
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4294
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
4295
1
  ccv_nnc_graph_t* graph = 0;
4296
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4297
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4298
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4299
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4300
1
  dsfmt_t dsfmt;
4301
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4302
1
  int i;
4303
751
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
4304
750
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4305
151
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
4306
150
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4307
1
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4308
1
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
4309
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4310
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
4311
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, zt->data.f32, z_tensor->data.f32, 10 * 5 * 5 * 3, 1e-3, "add should match");
4312
1
  ccv_nnc_tensor_free(x_tensor);
4313
1
  ccv_nnc_tensor_free(y_tensor);
4314
1
  ccv_nnc_tensor_free(zt);
4315
1
  ccv_nnc_graph_free(graph);
4316
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4317
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4318
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4319
1
}
4320
4321
TEST_CASE("compare add gradient with cudnn")
4322
1
{
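  // Gradient of the broadcasted add: seed dc with random values, run the compiled backward graph on the GPU, and compare dx and dy against CPU CMD_ADD_BACKWARD.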
4323
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4324
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4325
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4326
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
4327
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
4328
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
4329
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
4330
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
4331
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
4332
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
4333
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4334
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
4335
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4336
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4337
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4338
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
4339
1
  ccv_nnc_graph_t* graph = 0;
4340
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4341
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4342
1
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
4343
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
4344
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
4345
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4346
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4347
1
  dsfmt_t dsfmt;
4348
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4349
1
  int i;
4350
751
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
4351
750
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4352
151
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
4353
150
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4354
1
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4355
751
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
4356
750
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4357
1
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
4358
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
4359
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4360
1
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4361
1
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
4362
1
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4363
1
  ccv_nnc_tensor_t* dyt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
4364
1
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, dyt), 0);
4365
1
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
4366
1
  ccv_nnc_tensor_t* dy_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
4367
1
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
4368
1
  REQUIRE_TENSOR_EQ(dyt, dy_tensor, "backward pass should match");
4369
1
  ccv_nnc_tensor_free(x_tensor);
4370
1
  ccv_nnc_tensor_free(y_tensor);
4371
1
  ccv_nnc_tensor_free(dct);
4372
1
  ccv_nnc_tensor_free(zt);
4373
1
  ccv_nnc_tensor_free(dxt);
4374
1
  ccv_nnc_tensor_free(dyt);
4375
1
  ccv_nnc_graph_free(graph);
4376
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4377
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4378
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4379
1
}
4380
4381
TEST_CASE("compare add gradient with cudnn in half precision")
4382
1
{
4383
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4384
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4385
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4386
1
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
4387
1
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
4388
1
  ccv_nnc_tensor_symbol_t x16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "x 16");
4389
1
  ccv_nnc_tensor_symbol_t y16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 1, 3), "y 16");
4390
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "a");
4391
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 1, 3), "b");
4392
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "c");
4393
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(x16, y16), "convert");
4394
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x16, y16), TENSOR_SYMBOL_LIST(a, b), "transfer");
4395
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
4396
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4397
1
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
4398
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4399
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4400
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4401
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
4402
1
  ccv_nnc_graph_t* graph = 0;
4403
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4404
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4405
1
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
4406
1
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
4407
1
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
4408
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4409
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4410
1
  dsfmt_t dsfmt;
4411
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4412
1
  int i;
4413
751
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
4414
750
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4415
151
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
4416
150
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4417
1
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4418
1
  ccv_nnc_tensor_t* dct16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), 0);
4419
751
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
4420
750
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4421
1
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
4422
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dct16), 0);
4423
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct16), TENSOR_LIST(dc_tensor), 0);
4424
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4425
1
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4426
1
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
4427
1
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
4428
1
  ccv_nnc_tensor_t* dyt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
4429
1
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, dyt), 0);
4430
1
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
4431
1
  ccv_nnc_tensor_t* dy_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
4432
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dxt->data.f32, dx_tensor->data.f32, 10 * 5 * 5 * 3, 1e-3, "backward pass should match");
4433
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dyt->data.f32, dy_tensor->data.f32, 10 * 5 * 1 * 3, 1e-3, "backward pass should match");
4434
1
  ccv_nnc_tensor_free(x_tensor);
4435
1
  ccv_nnc_tensor_free(y_tensor);
4436
1
  ccv_nnc_tensor_free(dct);
4437
1
  ccv_nnc_tensor_free(dct16);
4438
1
  ccv_nnc_tensor_free(zt);
4439
1
  ccv_nnc_tensor_free(dxt);
4440
1
  ccv_nnc_tensor_free(dyt);
4441
1
  ccv_nnc_graph_free(graph);
4442
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4443
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4444
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4445
1
}
4446
4447
TEST_CASE("compare softmax cross entropy forward")
4448
1
{
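  // Softmax cross entropy forward: 10 rows of 100 random logits with labels (i + 1) * 9; the per-row losses (c) and the softmax outputs (d) from cuDNN must match the CPU kernel.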
4449
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4450
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4451
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4452
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4453
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4454
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4455
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4456
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4457
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4458
1
  dsfmt_t dsfmt;
4459
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4460
1
  int i;
4461
1.00k
  for (i = 0; i < 1000; i++)
4462
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4463
11
  for (i = 0; i < 10; i++)
4464
10
    hb->data.f32[i] = (i + 1) * 9;
4465
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
4466
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4467
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4468
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4469
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4470
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d), TENSOR_LIST(tc, td), 0);
4471
1
  REQUIRE_TENSOR_EQ(tc, hc, "GPU computed output should be the same as CPU computed ones");
4472
1
  REQUIRE_TENSOR_EQ(td, hd, "GPU computed output should be the same as CPU computed ones");
4473
1
  ccv_nnc_tensor_free(a);
4474
1
  ccv_nnc_tensor_free(b);
4475
1
  ccv_nnc_tensor_free(c);
4476
1
  ccv_nnc_tensor_free(d);
4477
1
  ccv_nnc_tensor_free(ha);
4478
1
  ccv_nnc_tensor_free(hb);
4479
1
  ccv_nnc_tensor_free(hc);
4480
1
  ccv_nnc_tensor_free(hd);
4481
1
  ccv_nnc_tensor_free(tc);
4482
1
  ccv_nnc_tensor_free(td);
4483
1
}
4484
4485
TEST_CASE("compare softmax cross entropy forward in half precision")
4486
1
{
4487
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4488
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4489
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4490
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4491
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4492
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4493
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4494
1
  ccv_nnc_tensor_t* ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4495
1
  ccv_nnc_tensor_t* hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4496
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4497
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4498
1
  dsfmt_t dsfmt;
4499
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4500
1
  int i;
4501
1.00k
  for (i = 0; i < 1000; i++)
4502
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4503
11
  for (i = 0; i < 10; i++)
4504
10
    hb->data.f32[i] = (i + 1) * 9;
4505
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(ha16, hb16), 0);
4506
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16), TENSOR_LIST(a, b), 0);
4507
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4508
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4509
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4510
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4511
1
  ccv_nnc_tensor_t* tc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4512
1
  ccv_nnc_tensor_t* td16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4513
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d), TENSOR_LIST(tc16, td16), 0);
4514
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tc16, td16), TENSOR_LIST(tc, td), 0);
4515
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tc->data.f32, hc->data.f32, 10, 1e-3, "GPU computed output should be the same as CPU computed ones");
4516
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, td->data.f32, hd->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
4517
1
  ccv_nnc_tensor_free(a);
4518
1
  ccv_nnc_tensor_free(b);
4519
1
  ccv_nnc_tensor_free(c);
4520
1
  ccv_nnc_tensor_free(d);
4521
1
  ccv_nnc_tensor_free(ha);
4522
1
  ccv_nnc_tensor_free(hb);
4523
1
  ccv_nnc_tensor_free(ha16);
4524
1
  ccv_nnc_tensor_free(hb16);
4525
1
  ccv_nnc_tensor_free(hc);
4526
1
  ccv_nnc_tensor_free(hd);
4527
1
  ccv_nnc_tensor_free(tc);
4528
1
  ccv_nnc_tensor_free(td);
4529
1
  ccv_nnc_tensor_free(tc16);
4530
1
  ccv_nnc_tensor_free(td16);
4531
1
}
4532
4533
TEST_CASE("compare softmax cross entropy forward with label smoothing")
4534
1
{
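  // Same forward comparison, with label smoothing parameters (0.1, 0.9) passed to the softmax cross entropy command on both CPU and GPU.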
4535
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4536
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4537
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4538
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4539
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4540
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4541
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4542
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4543
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4544
1
  dsfmt_t dsfmt;
4545
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4546
1
  int i;
4547
1.00k
  for (i = 0; i < 1000; i++)
4548
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4549
11
  for (i = 0; i < 10; i++)
4550
10
    hb->data.f32[i] = (i + 1) * 9;
4551
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
4552
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4553
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4554
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4555
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4556
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d), TENSOR_LIST(tc, td), 0);
4557
1
  REQUIRE_TENSOR_EQ(tc, hc, "GPU computed output should be the same as CPU computed ones");
4558
1
  REQUIRE_TENSOR_EQ(td, hd, "GPU computed output should be the same as CPU computed ones");
4559
1
  ccv_nnc_tensor_free(a);
4560
1
  ccv_nnc_tensor_free(b);
4561
1
  ccv_nnc_tensor_free(c);
4562
1
  ccv_nnc_tensor_free(d);
4563
1
  ccv_nnc_tensor_free(ha);
4564
1
  ccv_nnc_tensor_free(hb);
4565
1
  ccv_nnc_tensor_free(hc);
4566
1
  ccv_nnc_tensor_free(hd);
4567
1
  ccv_nnc_tensor_free(tc);
4568
1
  ccv_nnc_tensor_free(td);
4569
1
}
4570
4571
TEST_CASE("compare softmax cross entropy forward in half precision with label smoothing")
4572
1
{
4573
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4574
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4575
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4576
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4577
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4578
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4579
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4580
1
  ccv_nnc_tensor_t* ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4581
1
  ccv_nnc_tensor_t* hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4582
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4583
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4584
1
  dsfmt_t dsfmt;
4585
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4586
1
  int i;
4587
1.00k
  for (i = 0; i < 1000; i++)
4588
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4589
11
  for (i = 0; i < 10; i++)
4590
10
    hb->data.f32[i] = (i + 1) * 9;
4591
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(ha16, hb16), 0);
4592
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16), TENSOR_LIST(a, b), 0);
4593
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4594
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4595
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4596
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4597
1
  ccv_nnc_tensor_t* tc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4598
1
  ccv_nnc_tensor_t* td16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4599
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d), TENSOR_LIST(tc16, td16), 0);
4600
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tc16, td16), TENSOR_LIST(tc, td), 0);
4601
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tc->data.f32, hc->data.f32, 10, 5e-2, "GPU computed output should be the same as CPU computed ones");
4602
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, td->data.f32, hd->data.f32, 10 * 100, 5e-2, "GPU computed output should be the same as CPU computed ones");
4603
1
  ccv_nnc_tensor_free(a);
4604
1
  ccv_nnc_tensor_free(b);
4605
1
  ccv_nnc_tensor_free(c);
4606
1
  ccv_nnc_tensor_free(d);
4607
1
  ccv_nnc_tensor_free(ha);
4608
1
  ccv_nnc_tensor_free(hb);
4609
1
  ccv_nnc_tensor_free(ha16);
4610
1
  ccv_nnc_tensor_free(hb16);
4611
1
  ccv_nnc_tensor_free(hc);
4612
1
  ccv_nnc_tensor_free(hd);
4613
1
  ccv_nnc_tensor_free(tc);
4614
1
  ccv_nnc_tensor_free(td);
4615
1
  ccv_nnc_tensor_free(tc16);
4616
1
  ccv_nnc_tensor_free(td16);
4617
1
}
4618
4619
TEST_CASE("compare softmax cross entropy backward")
4620
1
{
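  // Backward path: per-row loss gradients of i * 0.1 are fed in, and the resulting input gradient h is verified along with c and d.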
4621
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4622
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4623
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4624
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4625
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4626
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4627
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4628
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4629
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4630
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4631
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4632
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4633
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4634
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4635
1
  dsfmt_t dsfmt;
4636
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4637
1
  int i;
4638
1.00k
  for (i = 0; i < 1000; i++)
4639
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4640
11
  for (i = 0; i < 10; i++)
4641
10
    hb->data.f32[i] = (i + 1) * 9;
4642
11
  for (i = 0; i < 10; i++)
4643
10
    hg->data.f32[i] = i * 0.1;
4644
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
4645
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4646
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, 0, hb, hc, hd), TENSOR_LIST(hh, 0), 0);
4647
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4648
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, 0, b, c, d), TENSOR_LIST(h, 0), 0);
4649
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4650
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4651
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4652
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d, h), TENSOR_LIST(tc, td, th), 0);
4653
1
  REQUIRE_TENSOR_EQ(tc, hc, "GPU computed output should be the same as CPU computed ones");
4654
1
  REQUIRE_TENSOR_EQ(td, hd, "GPU computed output should be the same as CPU computed ones");
4655
1
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
4656
1
  ccv_nnc_tensor_free(a);
4657
1
  ccv_nnc_tensor_free(b);
4658
1
  ccv_nnc_tensor_free(c);
4659
1
  ccv_nnc_tensor_free(d);
4660
1
  ccv_nnc_tensor_free(h);
4661
1
  ccv_nnc_tensor_free(ha);
4662
1
  ccv_nnc_tensor_free(hb);
4663
1
  ccv_nnc_tensor_free(hc);
4664
1
  ccv_nnc_tensor_free(hd);
4665
1
  ccv_nnc_tensor_free(hg);
4666
1
  ccv_nnc_tensor_free(hh);
4667
1
  ccv_nnc_tensor_free(tc);
4668
1
  ccv_nnc_tensor_free(td);
4669
1
  ccv_nnc_tensor_free(th);
4670
1
}
4671
4672
TEST_CASE("compare softmax cross entropy backward with label smoothing")
4673
1
{
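  // Backward comparison repeated with label smoothing (0.1, 0.9) on both the forward and backward commands.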
4674
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4675
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4676
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4677
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4678
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4679
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4680
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
4681
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
4682
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4683
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4684
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4685
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4686
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4687
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4688
1
  dsfmt_t dsfmt;
4689
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4690
1
  int i;
4691
1.00k
  for (i = 0; i < 1000; i++)
4692
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4693
11
  for (i = 0; i < 10; i++)
4694
10
    hb->data.f32[i] = (i + 1) * 9;
4695
11
  for (i = 0; i < 10; i++)
4696
10
    hg->data.f32[i] = i * 0.1;
4697
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
4698
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4699
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, 0, hb, hc, hd), TENSOR_LIST(hh, 0), 0);
4700
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4701
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, 0, b, c, d), TENSOR_LIST(h, 0), 0);
4702
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4703
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4704
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4705
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d, h), TENSOR_LIST(tc, td, th), 0);
4706
1
  REQUIRE_TENSOR_EQ(tc, hc, "GPU computed output should be the same as CPU computed ones");
4707
1
  REQUIRE_TENSOR_EQ(td, hd, "GPU computed output should be the same as CPU computed ones");
4708
1
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
4709
1
  ccv_nnc_tensor_free(a);
4710
1
  ccv_nnc_tensor_free(b);
4711
1
  ccv_nnc_tensor_free(c);
4712
1
  ccv_nnc_tensor_free(d);
4713
1
  ccv_nnc_tensor_free(h);
4714
1
  ccv_nnc_tensor_free(ha);
4715
1
  ccv_nnc_tensor_free(hb);
4716
1
  ccv_nnc_tensor_free(hc);
4717
1
  ccv_nnc_tensor_free(hd);
4718
1
  ccv_nnc_tensor_free(hg);
4719
1
  ccv_nnc_tensor_free(hh);
4720
1
  ccv_nnc_tensor_free(tc);
4721
1
  ccv_nnc_tensor_free(td);
4722
1
  ccv_nnc_tensor_free(th);
4723
1
}
4724
4725
TEST_CASE("compare softmax cross entropy backward in half precision")
4726
1
{
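  // Half-precision backward: inputs, labels, and gradients are converted to 16F before the GPU kernels run; results are compared at a 1e-3 tolerance.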
4727
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4728
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4729
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4730
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4731
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4732
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4733
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4734
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4735
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4736
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4737
1
  ccv_nnc_tensor_t* ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4738
1
  ccv_nnc_tensor_t* hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4739
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4740
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4741
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4742
1
  ccv_nnc_tensor_t* hg16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4743
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4744
1
  dsfmt_t dsfmt;
4745
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4746
1
  int i;
4747
1.00k
  for (i = 0; i < 1000; i++)
4748
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4749
11
  for (i = 0; i < 10; i++)
4750
10
    hb->data.f32[i] = (i + 1) * 9;
4751
11
  for (i = 0; i < 10; i++)
4752
10
    hg->data.f32[i] = i * 0.1;
4753
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(ha16, hb16, hg16), 0);
4754
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16, hg16), TENSOR_LIST(a, b, g), 0);
4755
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4756
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, 0, hb, hc, hd), TENSOR_LIST(hh, 0), 0);
4757
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4758
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, 0, b, c, d), TENSOR_LIST(h, 0), 0);
4759
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4760
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4761
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4762
1
  ccv_nnc_tensor_t* tc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4763
1
  ccv_nnc_tensor_t* td16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4764
1
  ccv_nnc_tensor_t* th16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4765
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d, h), TENSOR_LIST(tc16, td16, th16), 0);
4766
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tc16, td16, th16), TENSOR_LIST(tc, td, th), 0);
4767
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tc->data.f32, hc->data.f32, 10, 1e-3, "GPU computed output should be the same as CPU computed ones");
4768
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, td->data.f32, hd->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
4769
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
4770
1
  ccv_nnc_tensor_free(a);
4771
1
  ccv_nnc_tensor_free(b);
4772
1
  ccv_nnc_tensor_free(c);
4773
1
  ccv_nnc_tensor_free(d);
4774
1
  ccv_nnc_tensor_free(h);
4775
1
  ccv_nnc_tensor_free(ha);
4776
1
  ccv_nnc_tensor_free(hb);
4777
1
  ccv_nnc_tensor_free(ha16);
4778
1
  ccv_nnc_tensor_free(hb16);
4779
1
  ccv_nnc_tensor_free(hc);
4780
1
  ccv_nnc_tensor_free(hd);
4781
1
  ccv_nnc_tensor_free(hg);
4782
1
  ccv_nnc_tensor_free(hg16);
4783
1
  ccv_nnc_tensor_free(hh);
4784
1
  ccv_nnc_tensor_free(tc);
4785
1
  ccv_nnc_tensor_free(td);
4786
1
  ccv_nnc_tensor_free(th);
4787
1
  ccv_nnc_tensor_free(tc16);
4788
1
  ccv_nnc_tensor_free(td16);
4789
1
  ccv_nnc_tensor_free(th16);
4790
1
}
4791
4792
TEST_CASE("compare softmax cross entropy backward in half precision with label smoothing")
4793
1
{
4794
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
4795
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4796
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4797
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4798
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4799
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4800
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10), 0);
4801
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
4802
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4803
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4804
1
  ccv_nnc_tensor_t* ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4805
1
  ccv_nnc_tensor_t* hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4806
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4807
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4808
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4809
1
  ccv_nnc_tensor_t* hg16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4810
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4811
1
  dsfmt_t dsfmt;
4812
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4813
1
  int i;
4814
1.00k
  for (i = 0; i < 1000; i++)
4815
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4816
11
  for (i = 0; i < 10; i++)
4817
10
    hb->data.f32[i] = (i + 1) * 9;
4818
11
  for (i = 0; i < 10; i++)
4819
10
    hg->data.f32[i] = i * 0.1;
4820
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(ha16, hb16, hg16), 0);
4821
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16, hg16), TENSOR_LIST(a, b, g), 0);
4822
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc, hd), 0);
4823
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, 0, hb, hc, hd), TENSOR_LIST(hh, 0), 0);
4824
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
4825
1
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_BACKWARD(0.1, 0.9), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, 0, b, c, d), TENSOR_LIST(h, 0), 0);
4826
1
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
4827
1
  ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4828
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
4829
1
  ccv_nnc_tensor_t* tc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10), 0);
4830
1
  ccv_nnc_tensor_t* td16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4831
1
  ccv_nnc_tensor_t* th16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
4832
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c, d, h), TENSOR_LIST(tc16, td16, th16), 0);
4833
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tc16, td16, th16), TENSOR_LIST(tc, td, th), 0);
4834
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tc->data.f32, hc->data.f32, 10, 5e-2, "GPU computed output should be the same as CPU computed ones");
4835
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, td->data.f32, hd->data.f32, 10 * 100, 5e-2, "GPU computed output should be the same as CPU computed ones");
4836
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 100, 5e-2, "GPU computed output should be the same as CPU computed ones");
4837
1
  ccv_nnc_tensor_free(a);
4838
1
  ccv_nnc_tensor_free(b);
4839
1
  ccv_nnc_tensor_free(c);
4840
1
  ccv_nnc_tensor_free(d);
4841
1
  ccv_nnc_tensor_free(h);
4842
1
  ccv_nnc_tensor_free(ha);
4843
1
  ccv_nnc_tensor_free(hb);
4844
1
  ccv_nnc_tensor_free(ha16);
4845
1
  ccv_nnc_tensor_free(hb16);
4846
1
  ccv_nnc_tensor_free(hc);
4847
1
  ccv_nnc_tensor_free(hd);
4848
1
  ccv_nnc_tensor_free(hg);
4849
1
  ccv_nnc_tensor_free(hg16);
4850
1
  ccv_nnc_tensor_free(hh);
4851
1
  ccv_nnc_tensor_free(tc);
4852
1
  ccv_nnc_tensor_free(td);
4853
1
  ccv_nnc_tensor_free(th);
4854
1
  ccv_nnc_tensor_free(tc16);
4855
1
  ccv_nnc_tensor_free(td16);
4856
1
  ccv_nnc_tensor_free(th16);
4857
1
}
4858
4859
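// The next case checks CMD_EWSUM_FORWARD against known values: three
// 100-element inputs filled with 1, 0.5 and 0.25 are summed on the GPU,
// so every entry of the result should equal 1 + 0.5 + 0.25 = 1.75 (gd).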
TEST_CASE("compare ewsum with cudnn")
4860
1
{
4861
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4862
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
4863
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
4864
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
4865
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
4866
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4867
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4868
1
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4869
1
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4870
1
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4871
1
  int i;
4872
101
  for (i = 0; i < 100; i++)
4873
100
  {
4874
100
    ha->data.f32[i] = 1;
4875
100
    hb->data.f32[i] = 0.5;
4876
100
    hc->data.f32[i] = 0.25;
4877
100
    gd->data.f32[i] = 1.75;
4878
100
  }
4879
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(a, b, c), 0);
4880
1
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
4881
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd), 0);
4882
1
  REQUIRE_TENSOR_EQ(hd, gd, "ewsum result should be the same");
4883
1
  ccv_nnc_tensor_free(a);
4884
1
  ccv_nnc_tensor_free(b);
4885
1
  ccv_nnc_tensor_free(c);
4886
1
  ccv_nnc_tensor_free(d);
4887
1
  ccv_nnc_tensor_free(ha);
4888
1
  ccv_nnc_tensor_free(hb);
4889
1
  ccv_nnc_tensor_free(hc);
4890
1
  ccv_nnc_tensor_free(hd);
4891
1
  ccv_nnc_tensor_free(gd);
4892
1
}
4893
4894
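// Same ewsum check in half precision: the 32F host values are converted to
// 16F (ha16/hb16/hc16) before the GPU transfer, and the 16F result is
// converted back to 32F for comparison. 1, 0.5, 0.25 and 1.75 are all
// exactly representable in half, so the 1e-3 tolerance is essentially a
// safety margin.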
TEST_CASE("compare ewsum with cudnn in half precision")
4895
1
{
4896
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4897
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
4898
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
4899
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
4900
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
4901
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4902
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4903
1
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4904
1
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4905
1
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
4906
1
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
4907
1
  ccv_nnc_tensor_t* const hc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
4908
1
  ccv_nnc_tensor_t* const hd16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
4909
1
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
4910
1
  int i;
4911
101
  for (i = 0; i < 100; i++)
4912
100
  {
4913
100
    ha->data.f32[i] = 1;
4914
100
    hb->data.f32[i] = 0.5;
4915
100
    hc->data.f32[i] = 0.25;
4916
100
    gd->data.f32[i] = 1.75;
4917
100
  }
4918
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(ha16, hb16, hc16), 0);
4919
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16, hc16), TENSOR_LIST(a, b, c), 0);
4920
1
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
4921
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd16), 0);
4922
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hd16), TENSOR_LIST(hd), 0);
4923
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hd->data.f32, gd->data.f32, 100, 1e-3, "ewsum result should be the same");
4924
1
  ccv_nnc_tensor_free(a);
4925
1
  ccv_nnc_tensor_free(b);
4926
1
  ccv_nnc_tensor_free(c);
4927
1
  ccv_nnc_tensor_free(d);
4928
1
  ccv_nnc_tensor_free(ha);
4929
1
  ccv_nnc_tensor_free(hb);
4930
1
  ccv_nnc_tensor_free(hc);
4931
1
  ccv_nnc_tensor_free(hd);
4932
1
  ccv_nnc_tensor_free(ha16);
4933
1
  ccv_nnc_tensor_free(hb16);
4934
1
  ccv_nnc_tensor_free(hc16);
4935
1
  ccv_nnc_tensor_free(hd16);
4936
1
  ccv_nnc_tensor_free(gd);
4937
1
}
4938
4939
TEST_CASE("compare ewsum with cudnn in int32")
4940
1
{
4941
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4942
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 100), 0);
4943
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 100), 0);
4944
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 100), 0);
4945
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 100), 0);
4946
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 100), 0);
4947
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 100), 0);
4948
1
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 100), 0);
4949
1
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 100), 0);
4950
1
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 100), 0);
4951
1
  int i;
4952
101
  for (i = 0; i < 100; i++)
4953
100
  {
4954
100
    ha->data.i32[i] = 2;
4955
100
    hb->data.i32[i] = 5;
4956
100
    hc->data.i32[i] = 8;
4957
100
    gd->data.i32[i] = 2 + 5 + 8;
4958
100
  }
4959
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(a, b, c), 0);
4960
1
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
4961
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd), 0);
4962
1
  REQUIRE_TENSOR_EQ(hd, gd, "ewsum result should be the same");
4963
1
  ccv_nnc_tensor_free(a);
4964
1
  ccv_nnc_tensor_free(b);
4965
1
  ccv_nnc_tensor_free(c);
4966
1
  ccv_nnc_tensor_free(d);
4967
1
  ccv_nnc_tensor_free(ha);
4968
1
  ccv_nnc_tensor_free(hb);
4969
1
  ccv_nnc_tensor_free(hc);
4970
1
  ccv_nnc_tensor_free(hd);
4971
1
  ccv_nnc_tensor_free(gd);
4972
1
}
4973
4974
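// The transpose test below works on tensor views: ha_view selects a 4x3x2x2
// block of the 7x6x5x4 tensor ha at offset (3, 2, 1, 0) with row-major
// strides (6*5*4, 5*4, 4, 1), which is exactly the indexing used by the fill
// loop ((i + 3) * 6 * 5 * 4 + (j + 2) * 5 * 4 + (k + 1) * 4 + l). Transposing
// axes 1 and 3 into hb_view and back into hd_view must reproduce ha, and the
// same round trip on the GPU views must match the CPU results exactly.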
TEST_CASE("compare transpose two tensor views")
4975
1
{
4976
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
4977
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
4978
1
  memset(ha->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
4979
1
  ccv_nnc_tensor_view_t ha_view = ccv_nnc_tensor_view(ha, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
4980
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
4981
1
  memset(hb->data.f32, 0, sizeof(float) * 8 * 7 * 6 * 5);
4982
1
  ccv_nnc_tensor_view_t hb_view = ccv_nnc_tensor_view(hb, CPU_TENSOR_NHWC(32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
4983
1
  int i, j, k, l;
4984
5
  for (i = 0; i < 4; i++)
4985
16
    for (j = 0; j < 3; j++)
4986
36
      for (k = 0; k < 2; k++)
4987
72
        for (l = 0; l < 2; l++)
4988
48
          ha->data.f32[(i + 3) * 6 * 5 * 4 + (j + 2) * 5 * 4 + (k + 1) * 4 + l] = i * 3 * 2 * 2 + j * 2 * 2 + k * 2 + l;
4989
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&ha_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), 0);
4990
1
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
4991
1
  memset(hd->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
4992
1
  ccv_nnc_tensor_view_t hd_view = ccv_nnc_tensor_view(hd, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
4993
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hd_view), 0);
4994
1
  REQUIRE_TENSOR_EQ(hd, ha, "4x3x2x2 tensor should be exactly the same.");
4995
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
4996
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
4997
1
  ccv_nnc_tensor_view_t a_view = ccv_nnc_tensor_view(a, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
4998
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 8, 7, 6, 5), 0);
4999
1
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(b), 0);
5000
1
  ccv_nnc_tensor_view_t b_view = ccv_nnc_tensor_view(b, GPU_TENSOR_NHWC(000, 32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
5001
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&a_view), TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), 0);
5002
1
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
5003
1
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(d), 0);
5004
1
  ccv_nnc_tensor_view_t d_view = ccv_nnc_tensor_view(d, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
5005
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), TENSOR_LIST((ccv_nnc_tensor_t*)&d_view), 0);
5006
1
  ccv_nnc_tensor_t* const hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
5007
1
  ccv_nnc_tensor_t* const hdt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
5008
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, d), TENSOR_LIST(hbt, hdt), 0);
5009
1
  REQUIRE_TENSOR_EQ(hbt, hb, "4x2x2x3 tensor should be exactly the same.");
5010
1
  REQUIRE_TENSOR_EQ(hdt, hd, "4x3x2x2 tensor should be exactly the same.");
5011
1
  ccv_nnc_tensor_free(ha);
5012
1
  ccv_nnc_tensor_free(hb);
5013
1
  ccv_nnc_tensor_free(hd);
5014
1
  ccv_nnc_tensor_free(hbt);
5015
1
  ccv_nnc_tensor_free(hdt);
5016
1
  ccv_nnc_tensor_free(a);
5017
1
  ccv_nnc_tensor_free(b);
5018
1
  ccv_nnc_tensor_free(d);
5019
1
}
5020
5021
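// Format transform in double precision: the GPU converts an 11x10x9x8 NHWC
// tensor to NCHW (11x8x10x9), and the result is checked against the same
// conversion performed on the host from ha into gb.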
TEST_CASE("compare format transform with cudnn in double precision")
5022
1
{
5023
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5024
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 64F, 11, 10, 9, 8), 0);
5025
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 11, 8, 10, 9), 0);
5026
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(64F, 11, 10, 9, 8), 0);
5027
1
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 11, 8, 10, 9), 0);
5028
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 11, 8, 10, 9), 0);
5029
1
  int i;
5030
1
  dsfmt_t dsfmt;
5031
1
  dsfmt_init_gen_rand(&dsfmt, 0);
5032
7.92k
  for (i = 0; i < 11 * 10 * 9 * 8; i++)
5033
7.92k
    ha->data.f64[i] = dsfmt_genrand_open_close(&dsfmt);
5034
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
5035
1
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
5036
1
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(gb), 0);
5037
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
5038
1
  REQUIRE_TENSOR_EQ(hb, gb, "format transform result should be the same");
5039
1
  ccv_nnc_tensor_free(a);
5040
1
  ccv_nnc_tensor_free(b);
5041
1
  ccv_nnc_tensor_free(ha);
5042
1
  ccv_nnc_tensor_free(hb);
5043
1
  ccv_nnc_tensor_free(gb);
5044
1
}
5045
5046
TEST_CASE("compare set with cudnn in double precision")
5047
1
{
5048
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5049
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 64F, 11, 10, 9, 8), 0);
5050
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(64F, 11, 10, 9, 8), 0);
5051
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 11, 10, 9, 8), 0);
5052
1
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(a), 0);
5053
1
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(ga), 0);
5054
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ha), 0);
5055
1
  REQUIRE_TENSOR_EQ(ha, ga, "format transform result should be the same");
5056
1
  ccv_nnc_tensor_free(a);
5057
1
  ccv_nnc_tensor_free(ha);
5058
1
  ccv_nnc_tensor_free(ga);
5059
1
}
5060
5061
TEST_CASE("compare set with cudnn in integer")
5062
1
{
5063
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5064
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 1), 0);
5065
1
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 1), 0);
5066
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32S, 1), 0);
5067
1
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(a), 0);
5068
1
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(ga), 0);
5069
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ha), 0);
5070
1
  REQUIRE_TENSOR_EQ(ha, ga, "format transform result should be the same");
5071
1
  ccv_nnc_tensor_free(a);
5072
1
  ccv_nnc_tensor_free(ha);
5073
1
  ccv_nnc_tensor_free(ga);
5074
1
}
5075
5076
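// Broadcasting add: the cuDNN result for [[1, 2, 3], [4, 5, 6]] + [7, 8, 9]
// is checked against the hand-filled ctp ({8, 10, 12, 11, 13, 15}). A minimal
// host-side sketch of that expectation (not part of this test file; the
// helper name is illustrative only):
//
//   void broadcast_add_rows(const float* a, const float* b, float* c, int rows, int cols)
//   {
//     int i, j;
//     for (i = 0; i < rows; i++)
//       for (j = 0; j < cols; j++)
//         c[i * cols + j] = a[i * cols + j] + b[j]; /* b is reused for every row */
//   }
//
// For the inputs below this yields {1+7, 2+8, 3+9, 4+7, 5+8, 6+9}, matching ctp.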
TEST_CASE("broadcasting semantics for add [[1, 2, 3], [4, 5, 6]] + [7, 8, 9]")
5077
1
{
5078
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5079
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5080
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5081
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5082
1
  a->data.f32[0] = 1;
5083
1
  a->data.f32[1] = 2;
5084
1
  a->data.f32[2] = 3;
5085
1
  a->data.f32[3] = 4;
5086
1
  a->data.f32[4] = 5;
5087
1
  a->data.f32[5] = 6;
5088
1
  b->data.f32[0] = 7;
5089
1
  b->data.f32[1] = 8;
5090
1
  b->data.f32[2] = 9;
5091
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5092
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5093
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5094
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5095
1
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
5096
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5097
1
  float ctp[] = {
5098
1
    8, 10, 12,
5099
1
    11, 13, 15
5100
1
  };
5101
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5102
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
5103
1
  ccv_nnc_tensor_free(a);
5104
1
  ccv_nnc_tensor_free(b);
5105
1
  ccv_nnc_tensor_free(c);
5106
1
  ccv_nnc_tensor_free(ga);
5107
1
  ccv_nnc_tensor_free(gb);
5108
1
  ccv_nnc_tensor_free(gc);
5109
1
}
5110
5111
TEST_CASE("broadcasting semantics for add [[1], [2], [3], [4]] + [5, 6]")
5112
1
{
5113
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5114
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5115
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5116
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5117
1
  a->data.f32[0] = 1;
5118
1
  a->data.f32[1] = 2;
5119
1
  a->data.f32[2] = 3;
5120
1
  a->data.f32[3] = 4;
5121
1
  b->data.f32[0] = 5;
5122
1
  b->data.f32[1] = 6;
5123
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5124
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5125
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5126
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5127
1
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
5128
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5129
1
  float ctp[] = {
5130
1
    6, 7,
5131
1
    7, 8,
5132
1
    8, 9,
5133
1
    9, 10
5134
1
  };
5135
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5136
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
5137
1
  ccv_nnc_tensor_free(a);
5138
1
  ccv_nnc_tensor_free(b);
5139
1
  ccv_nnc_tensor_free(c);
5140
1
  ccv_nnc_tensor_free(ga);
5141
1
  ccv_nnc_tensor_free(gb);
5142
1
  ccv_nnc_tensor_free(gc);
5143
1
}
5144
5145
TEST_CASE("broadcasting semantics for mul [[1, 2, 3], [4, 5, 6]] * [7, 8, 9]")
5146
1
{
5147
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5148
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5149
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5150
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5151
1
  a->data.f32[0] = 1;
5152
1
  a->data.f32[1] = 2;
5153
1
  a->data.f32[2] = 3;
5154
1
  a->data.f32[3] = 4;
5155
1
  a->data.f32[4] = 5;
5156
1
  a->data.f32[5] = 6;
5157
1
  b->data.f32[0] = 7;
5158
1
  b->data.f32[1] = 8;
5159
1
  b->data.f32[2] = 9;
5160
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5161
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5162
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5163
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5164
1
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
5165
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5166
1
  float ctp[] = {
5167
1
    7, 16, 27,
5168
1
    28, 40, 54
5169
1
  };
5170
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5171
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
5172
1
  ccv_nnc_tensor_free(a);
5173
1
  ccv_nnc_tensor_free(b);
5174
1
  ccv_nnc_tensor_free(c);
5175
1
  ccv_nnc_tensor_free(ga);
5176
1
  ccv_nnc_tensor_free(gb);
5177
1
  ccv_nnc_tensor_free(gc);
5178
1
}
5179
5180
TEST_CASE("broadcasting semantics for mul [[1], [2], [3], [4]] * [5, 6]")
5181
1
{
5182
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5183
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5184
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5185
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5186
1
  a->data.f32[0] = 1;
5187
1
  a->data.f32[1] = 2;
5188
1
  a->data.f32[2] = 3;
5189
1
  a->data.f32[3] = 4;
5190
1
  b->data.f32[0] = 5;
5191
1
  b->data.f32[1] = 6;
5192
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5193
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5194
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5195
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5196
1
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
5197
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5198
1
  float ctp[] = {
5199
1
    5, 6,
5200
1
    10, 12,
5201
1
    15, 18,
5202
1
    20, 24
5203
1
  };
5204
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5205
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
5206
1
  ccv_nnc_tensor_free(a);
5207
1
  ccv_nnc_tensor_free(b);
5208
1
  ccv_nnc_tensor_free(c);
5209
1
  ccv_nnc_tensor_free(ga);
5210
1
  ccv_nnc_tensor_free(gb);
5211
1
  ccv_nnc_tensor_free(gc);
5212
1
}
5213
5214
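// Scalar mul: CMD_SCALAR_MUL_FORWARD(0.3) scales every element of a by 0.3,
// so the expected output is {0.3, 0.6, 0.9, 1.2, 1.5, 1.8}.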
TEST_CASE("scalar mul [[1, 2, 3], [4, 5, 6]] * 0.3")
5215
1
{
5216
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5217
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5218
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5219
1
  a->data.f32[0] = 1;
5220
1
  a->data.f32[1] = 2;
5221
1
  a->data.f32[2] = 3;
5222
1
  a->data.f32[3] = 4;
5223
1
  a->data.f32[4] = 5;
5224
1
  a->data.f32[5] = 6;
5225
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5226
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5227
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ga), 0);
5228
1
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.3), ccv_nnc_no_hint, 0, TENSOR_LIST(ga), TENSOR_LIST(gc), 0);
5229
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5230
1
  float ctp[] = {
5231
1
    0.3, 0.6, 0.9,
5232
1
    1.2, 1.5, 1.8,
5233
1
  };
5234
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5235
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
5236
1
  ccv_nnc_tensor_free(a);
5237
1
  ccv_nnc_tensor_free(c);
5238
1
  ccv_nnc_tensor_free(ga);
5239
1
  ccv_nnc_tensor_free(gc);
5240
1
}
5241
5242
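// The backward broadcasting cases do not hard-code gradients: the same
// CMD_ADD_BACKWARD(0.5, 0.2) command is run once on the GPU tensors and once
// on the host tensors, and the two gradient pairs (da/db vs. dat/dbt) are
// required to match exactly.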
TEST_CASE("broadcasting semantics for add backward")
5243
1
{
5244
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
5245
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5246
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5247
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5248
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5249
1
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5250
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5251
1
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5252
1
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5253
1
  a->data.f32[0] = 1;
5254
1
  a->data.f32[1] = 2;
5255
1
  a->data.f32[2] = 3;
5256
1
  a->data.f32[3] = 4;
5257
1
  b->data.f32[0] = 5;
5258
1
  b->data.f32[1] = 6;
5259
1
  float ctp[] = {
5260
1
    6, 7,
5261
1
    7, 8,
5262
1
    8, 9,
5263
1
    9, 10
5264
1
  };
5265
1
  memcpy(c->data.f32, ctp, sizeof(ctp));
5266
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5267
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5268
1
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5269
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5270
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5271
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
5272
1
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
5273
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
5274
1
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
5275
1
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
5276
1
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
5277
1
  ccv_nnc_tensor_free(a);
5278
1
  ccv_nnc_tensor_free(b);
5279
1
  ccv_nnc_tensor_free(c);
5280
1
  ccv_nnc_tensor_free(da);
5281
1
  ccv_nnc_tensor_free(db);
5282
1
  ccv_nnc_tensor_free(dat);
5283
1
  ccv_nnc_tensor_free(dbt);
5284
1
  ccv_nnc_tensor_free(ga);
5285
1
  ccv_nnc_tensor_free(gb);
5286
1
  ccv_nnc_tensor_free(gc);
5287
1
  ccv_nnc_tensor_free(gda);
5288
1
  ccv_nnc_tensor_free(gdb);
5289
1
}
5290
5291
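// Same pattern for CMD_MUL_BACKWARD(0.5): GPU gradients of the broadcast
// multiply are compared against the host run of the identical command. The
// "(no input grad)" variants below pass 0 in place of the incoming gradient
// tensor.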
TEST_CASE("broadcasting semantics for mul backward")
5292
1
{
5293
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
5294
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5295
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5296
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5297
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5298
1
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5299
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5300
1
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5301
1
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5302
1
  a->data.f32[0] = 1;
5303
1
  a->data.f32[1] = 2;
5304
1
  a->data.f32[2] = 3;
5305
1
  a->data.f32[3] = 4;
5306
1
  b->data.f32[0] = 5;
5307
1
  b->data.f32[1] = 6;
5308
1
  float ctp[] = {
5309
1
    6, 7,
5310
1
    7, 8,
5311
1
    8, 9,
5312
1
    9, 10
5313
1
  };
5314
1
  memcpy(c->data.f32, ctp, sizeof(ctp));
5315
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5316
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5317
1
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5318
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5319
1
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5320
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
5321
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
5322
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
5323
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
5324
1
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
5325
1
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
5326
1
  ccv_nnc_tensor_free(a);
5327
1
  ccv_nnc_tensor_free(b);
5328
1
  ccv_nnc_tensor_free(c);
5329
1
  ccv_nnc_tensor_free(da);
5330
1
  ccv_nnc_tensor_free(db);
5331
1
  ccv_nnc_tensor_free(dat);
5332
1
  ccv_nnc_tensor_free(dbt);
5333
1
  ccv_nnc_tensor_free(ga);
5334
1
  ccv_nnc_tensor_free(gb);
5335
1
  ccv_nnc_tensor_free(gc);
5336
1
  ccv_nnc_tensor_free(gda);
5337
1
  ccv_nnc_tensor_free(gdb);
5338
1
}
5339
5340
TEST_CASE("broadcasting semantics for mul backward (no input grad)")
5341
1
{
5342
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
5343
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5344
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5345
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5346
1
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5347
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5348
1
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
5349
1
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
5350
1
  a->data.f32[0] = 1;
5351
1
  a->data.f32[1] = 2;
5352
1
  a->data.f32[2] = 3;
5353
1
  a->data.f32[3] = 4;
5354
1
  b->data.f32[0] = 5;
5355
1
  b->data.f32[1] = 6;
5356
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5357
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5358
1
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
5359
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
5360
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5361
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
5362
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
5363
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
5364
1
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
5365
1
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
5366
1
  ccv_nnc_tensor_free(a);
5367
1
  ccv_nnc_tensor_free(b);
5368
1
  ccv_nnc_tensor_free(da);
5369
1
  ccv_nnc_tensor_free(db);
5370
1
  ccv_nnc_tensor_free(dat);
5371
1
  ccv_nnc_tensor_free(dbt);
5372
1
  ccv_nnc_tensor_free(ga);
5373
1
  ccv_nnc_tensor_free(gb);
5374
1
  ccv_nnc_tensor_free(gda);
5375
1
  ccv_nnc_tensor_free(gdb);
5376
1
}
5377
5378
TEST_CASE("broadcasting semantics for mul backward (no input grad) for b")
5379
1
{
5380
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
5381
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5382
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5383
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5384
1
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5385
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5386
1
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5387
1
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5388
1
  a->data.f32[0] = 1;
5389
1
  a->data.f32[1] = 2;
5390
1
  a->data.f32[2] = 3;
5391
1
  a->data.f32[3] = 4;
5392
1
  a->data.f32[4] = 5;
5393
1
  a->data.f32[5] = 6;
5394
1
  b->data.f32[0] = 7;
5395
1
  b->data.f32[1] = 8;
5396
1
  b->data.f32[2] = 9;
5397
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5398
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5399
1
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5400
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5401
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5402
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
5403
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
5404
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
5405
1
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
5406
1
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
5407
1
  ccv_nnc_tensor_free(a);
5408
1
  ccv_nnc_tensor_free(b);
5409
1
  ccv_nnc_tensor_free(da);
5410
1
  ccv_nnc_tensor_free(db);
5411
1
  ccv_nnc_tensor_free(dat);
5412
1
  ccv_nnc_tensor_free(dbt);
5413
1
  ccv_nnc_tensor_free(ga);
5414
1
  ccv_nnc_tensor_free(gb);
5415
1
  ccv_nnc_tensor_free(gda);
5416
1
  ccv_nnc_tensor_free(gdb);
5417
1
}
5418
5419
TEST_CASE("broadcasting semantics for mul backward (no input grad) for a")
5420
1
{
5421
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
5422
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5423
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5424
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5425
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5426
1
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5427
1
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5428
1
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5429
1
  b->data.f32[0] = 1;
5430
1
  b->data.f32[1] = 2;
5431
1
  b->data.f32[2] = 3;
5432
1
  b->data.f32[3] = 4;
5433
1
  b->data.f32[4] = 5;
5434
1
  b->data.f32[5] = 6;
5435
1
  a->data.f32[0] = 7;
5436
1
  a->data.f32[1] = 8;
5437
1
  a->data.f32[2] = 9;
5438
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5439
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5440
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5441
1
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5442
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
5443
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
5444
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
5445
1
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
5446
1
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
5447
1
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
5448
1
  ccv_nnc_tensor_free(a);
5449
1
  ccv_nnc_tensor_free(b);
5450
1
  ccv_nnc_tensor_free(da);
5451
1
  ccv_nnc_tensor_free(db);
5452
1
  ccv_nnc_tensor_free(dat);
5453
1
  ccv_nnc_tensor_free(dbt);
5454
1
  ccv_nnc_tensor_free(ga);
5455
1
  ccv_nnc_tensor_free(gb);
5456
1
  ccv_nnc_tensor_free(gda);
5457
1
  ccv_nnc_tensor_free(gdb);
5458
1
}
5459
5460
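// Convolution transpose: random activations and weights are generated on the
// host, the CPU_REF backend produces the reference output b, then the cuDNN
// backend is autotuned with a 1 GiB workspace limit and executed on a GPU
// stream; the device result is copied back and must match b within 2e-4.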
TEST_CASE("cudnn forward convolution transpose")
5461
1
{
5462
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5463
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5464
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5465
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
5466
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
5467
1
  assert(cmd.backend >= 0);
5468
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
5469
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5470
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
5471
  // configure the inlets.
5472
1
  dsfmt_t dsfmt;
5473
1
  dsfmt_init_gen_rand(&dsfmt, 0);
5474
1
  int i;
5475
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
5476
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
5477
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
5478
19.2M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5479
4
  for (i = 0; i < INPUT_DIM; i++)
5480
3
    bias->data.f32[i] = (float)i / INPUT_DIM;
5481
  // Copy generated matrix values over to GPU.
5482
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5483
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5484
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, INPUT_DIM), 0);
5485
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
5486
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
5487
1
  assert(move.backend >= 0);
5488
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
5489
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5490
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5491
5492
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
5493
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
5494
1
  assert(cmd.backend >= 0);
5495
1
  cmd.algorithm = -1;
5496
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), stream_context);
5497
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), stream_context));
5498
1
  ccv_nnc_stream_context_wait(stream_context);
5499
1
  ccv_nnc_stream_context_free(stream_context);
5500
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5501
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5502
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 2e-4, "output from cudnn should match from CPU");
5503
1
  ccv_nnc_tensor_free(c);
5504
1
  ccv_nnc_tensor_free(gc);
5505
1
  ccv_nnc_tensor_free(bias);
5506
1
  ccv_nnc_tensor_free(w);
5507
1
  ccv_nnc_tensor_free(b);
5508
1
  ccv_nnc_tensor_free(a);
5509
1
  ccv_nnc_tensor_free(gbias);
5510
1
  ccv_nnc_tensor_free(gw);
5511
1
  ccv_nnc_tensor_free(ga);
5512
1
}
5513
5514
TEST_CASE("cudnn forward convolution transpose, w in nchw format")
5515
1
{
5516
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5517
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5518
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5519
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
5520
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
5521
1
  assert(cmd.backend >= 0);
5522
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
5523
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5524
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
5525
  // configure the inlets.
5526
1
  dsfmt_t dsfmt;
5527
1
  dsfmt_init_gen_rand(&dsfmt, 0);
5528
1
  int i;
5529
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
5530
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
5531
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
5532
19.2M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5533
4
  for (i = 0; i < INPUT_DIM; i++)
5534
3
    bias->data.f32[i] = (float)i / INPUT_DIM;
5535
  // Copy generated matrix values over to GPU.
5536
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5537
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5538
1
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
5539
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, INPUT_DIM), 0);
5540
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
5541
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
5542
1
  assert(move.backend >= 0);
5543
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
5544
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5545
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5546
1
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), 0);
5547
5548
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
5549
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
5550
1
  assert(cmd.backend >= 0);
5551
1
  cmd.algorithm = -1;
5552
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
5553
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
5554
1
  ccv_nnc_stream_context_wait(stream_context);
5555
1
  ccv_nnc_stream_context_free(stream_context);
5556
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5557
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5558
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 2e-4, "output from cudnn should match from CPU");
5559
1
  ccv_nnc_tensor_free(c);
5560
1
  ccv_nnc_tensor_free(gc);
5561
1
  ccv_nnc_tensor_free(bias);
5562
1
  ccv_nnc_tensor_free(w);
5563
1
  ccv_nnc_tensor_free(b);
5564
1
  ccv_nnc_tensor_free(a);
5565
1
  ccv_nnc_tensor_free(gbias);
5566
1
  ccv_nnc_tensor_free(gw);
5567
1
  ccv_nnc_tensor_free(gwo);
5568
1
  ccv_nnc_tensor_free(ga);
5569
1
}
5570
5571
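// NCHW variant: the same data flow, but every tensor (activations, weights,
// bias) is laid out NCHW and autotune/exec run on the default (null) stream;
// the comparison uses a tighter 1e-5 tolerance.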
TEST_CASE("cudnn forward convolution transpose in nchw format")
5572
1
{
5573
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5574
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
5575
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
5576
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
5577
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
5578
1
  assert(cmd.backend >= 0);
5579
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
5580
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
5581
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, INPUT_DIM), 0);
5582
  // configure the inlets.
5583
1
  dsfmt_t dsfmt;
5584
1
  dsfmt_init_gen_rand(&dsfmt, 0);
5585
1
  int i;
5586
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
5587
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
5588
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
5589
19.2M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5590
4
  for (i = 0; i < INPUT_DIM; i++)
5591
3
    bias->data.f32[i] = (float)i / INPUT_DIM;
5592
  // Copy generated matrix values over to GPU.
5593
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
5594
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
5595
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, INPUT_DIM), 0);
5596
1
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
5597
1
  move.backend = CCV_NNC_BACKEND_GPU_REF;
5598
1
  assert(move.backend >= 0);
5599
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
5600
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5601
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
5602
5603
1
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
5604
1
  transform.backend = CCV_NNC_BACKEND_GPU_CUDNN;
5605
1
  assert(transform.backend >= 0);
5606
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
5607
1
  assert(cmd.backend >= 0);
5608
1
  cmd.algorithm = -1;
5609
1
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
5610
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
5611
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
5612
1
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
5613
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-5, "output from cudnn should match from CPU");
5614
1
  ccv_nnc_tensor_free(c);
5615
1
  ccv_nnc_tensor_free(gc);
5616
1
  ccv_nnc_tensor_free(bias);
5617
1
  ccv_nnc_tensor_free(w);
5618
1
  ccv_nnc_tensor_free(b);
5619
1
  ccv_nnc_tensor_free(a);
5620
1
  ccv_nnc_tensor_free(gbias);
5621
1
  ccv_nnc_tensor_free(gw);
5622
1
  ccv_nnc_tensor_free(ga);
5623
1
}
5624
5625
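// Half-precision variant: the 32F host data is converted to 16F (a1/w1/bias1)
// for the GPU path, the 16F result c1 is converted back to 32F, and a looser
// 5e-3 tolerance absorbs the reduced precision of the device computation.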
TEST_CASE("cudnn forward convolution transpose in half precision")
5626
1
{
5627
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
5628
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5629
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5630
1
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
5631
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
5632
1
  assert(cmd.backend >= 0);
5633
1
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
5634
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5635
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
5636
  // configure the inlets.
5637
1
  dsfmt_t dsfmt;
5638
1
  dsfmt_init_gen_rand(&dsfmt, 0);
5639
1
  int i;
5640
14.1k
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
5641
14.1k
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
5642
19.2M
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
5643
19.2M
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5644
4
  for (i = 0; i < INPUT_DIM; i++)
5645
3
    bias->data.f32[i] = (float)i / INPUT_DIM;
5646
1
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5647
1
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5648
1
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, INPUT_DIM), 0);
5649
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
5650
  // Copy generated matrix values over to GPU.
5651
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
5652
1
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
5653
1
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, INPUT_DIM), 0);
5654
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
5655
1
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5656
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5657
5658
1
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
5659
5660
1
  cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
5661
1
  assert(cmd.backend >= 0);
5662
1
  cmd.algorithm = -1;
5663
1
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), stream_context);
5664
1
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), stream_context));
5665
1
  ccv_nnc_stream_context_wait(stream_context);
5666
1
  ccv_nnc_stream_context_free(stream_context);
5667
1
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5668
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
5669
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
5670
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
5671
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 5e-3, "output from cudnn should match from CPU");
5672
1
  ccv_nnc_tensor_free(c);
5673
1
  ccv_nnc_tensor_free(gc);
5674
1
  ccv_nnc_tensor_free(bias);
5675
1
  ccv_nnc_tensor_free(w);
5676
1
  ccv_nnc_tensor_free(b);
5677
1
  ccv_nnc_tensor_free(a);
5678
1
  ccv_nnc_tensor_free(c1);
5679
1
  ccv_nnc_tensor_free(bias1);
5680
1
  ccv_nnc_tensor_free(w1);
5681
1
  ccv_nnc_tensor_free(a1);
5682
1
  ccv_nnc_tensor_free(gbias);
5683
1
  ccv_nnc_tensor_free(gw);
5684
1
  ccv_nnc_tensor_free(ga);
5685
1
}
5686
5687
#include "case_main.h"