Coverage Report

Created: 2025-05-31 15:19

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsdnn.tests.c
Line | Count | Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
#include <nnc/ccv_nnc_internal.h>
9
10
TEST_SETUP()
11
{
12
  ccv_nnc_init();
13
}
14
15
0
#define INPUT_DIM (3)
16
0
#define OUTPUT_DIM (96)
17
18
0
#define INPUT_SIZE (224)
19
0
#define OUTPUT_SIZE (112)
20
21
0
#define KERNEL_SIZE (7)
22
23
#define BATCH_SIZE (16)
24
25
0
#define LN_DIM (10)
26
0
#define GN_C_DIM (16)
27
#define GN_RC_DIM (4)
28
29
TEST_CASE("mps forward convolution")
30
1
{
31
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
32
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
33
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
34
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
35
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
36
0
  assert(cmd.backend >= 0);
37
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
38
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
39
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
40
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
41
  // configure the inlets.
42
0
  dsfmt_t dsfmt;
43
0
  dsfmt_init_gen_rand(&dsfmt, 0);
44
0
  int i;
45
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
46
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
47
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
48
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
49
0
  for (i = 0; i < OUTPUT_DIM; i++)
50
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
51
  // Copy generated matrix values over to GPU.
52
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
53
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
54
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
55
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
56
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
57
0
  move.backend = CCV_NNC_BACKEND_MPS;
58
0
  assert(move.backend >= 0);
59
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
60
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
61
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
62
63
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
64
0
  transform.backend = CCV_NNC_BACKEND_MPS;
65
0
  assert(transform.backend >= 0);
66
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
67
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
68
0
  ccv_nnc_stream_context_wait(stream_context);
69
0
  ccv_nnc_tensor_free(gw);
70
71
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
72
0
  assert(cmd.backend >= 0);
73
0
  cmd.algorithm = -1;
74
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
75
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
76
0
  ccv_nnc_stream_context_wait(stream_context);
77
0
  ccv_nnc_stream_context_free(stream_context);
78
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
79
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
80
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
81
0
  ccv_nnc_tensor_free(c);
82
0
  ccv_nnc_tensor_free(gc);
83
0
  ccv_nnc_tensor_free(bias);
84
0
  ccv_nnc_tensor_free(w);
85
0
  ccv_nnc_tensor_free(b);
86
0
  ccv_nnc_tensor_free(a);
87
0
  ccv_nnc_tensor_free(gbias);
88
0
  ccv_nnc_tensor_free(gwo);
89
0
  ccv_nnc_tensor_free(ga);
90
0
}
91
92
TEST_CASE("mps forward convolution in nchw format")
93
1
{
94
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
95
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
96
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
97
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
98
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
99
0
  assert(cmd.backend >= 0);
100
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
101
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
102
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
103
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
104
  // configure the inlets.
105
0
  dsfmt_t dsfmt;
106
0
  dsfmt_init_gen_rand(&dsfmt, 0);
107
0
  int i;
108
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
109
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
110
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
111
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
112
0
  for (i = 0; i < OUTPUT_DIM; i++)
113
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
114
  // Copy generated matrix values over to GPU.
115
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
116
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
117
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
118
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
119
0
  move.backend = CCV_NNC_BACKEND_MPS;
120
0
  assert(move.backend >= 0);
121
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
122
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
123
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
124
125
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
126
0
  transform.backend = CCV_NNC_BACKEND_MPS;
127
0
  assert(transform.backend >= 0);
128
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
129
0
  assert(cmd.backend >= 0);
130
0
  cmd.algorithm = -1;
131
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
132
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
133
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
134
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
135
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
136
0
  ccv_nnc_tensor_free(c);
137
0
  ccv_nnc_tensor_free(gc);
138
0
  ccv_nnc_tensor_free(bias);
139
0
  ccv_nnc_tensor_free(w);
140
0
  ccv_nnc_tensor_free(b);
141
0
  ccv_nnc_tensor_free(a);
142
0
  ccv_nnc_tensor_free(gbias);
143
0
  ccv_nnc_tensor_free(gw);
144
0
  ccv_nnc_tensor_free(ga);
145
0
}
146
147
TEST_CASE("mps forward convolution with 1x1 kernel")
148
1
{
149
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
150
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, INPUT_DIM), 0);
151
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
152
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
153
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
154
0
  assert(cmd.backend >= 0);
155
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
156
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
157
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 1, 1, INPUT_DIM), 0);
158
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
159
  // configure the inlets.
160
0
  dsfmt_t dsfmt;
161
0
  dsfmt_init_gen_rand(&dsfmt, 0);
162
0
  int i;
163
0
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
164
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
165
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
166
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
167
0
  for (i = 0; i < OUTPUT_DIM; i++)
168
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
169
  // Copy generated matrix values over to GPU.
170
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, INPUT_DIM), 0);
171
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 1, 1, INPUT_DIM), 0);
172
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
173
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
174
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
175
0
  move.backend = CCV_NNC_BACKEND_MPS;
176
0
  assert(move.backend >= 0);
177
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
178
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
179
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
180
181
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
182
0
  transform.backend = CCV_NNC_BACKEND_MPS;
183
0
  assert(transform.backend >= 0);
184
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
185
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
186
0
  ccv_nnc_stream_context_wait(stream_context);
187
0
  ccv_nnc_tensor_free(gw);
188
189
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
190
0
  assert(cmd.backend >= 0);
191
0
  cmd.algorithm = -1;
192
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
193
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
194
0
  ccv_nnc_stream_context_wait(stream_context);
195
0
  ccv_nnc_stream_context_free(stream_context);
196
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
197
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
198
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
199
0
  ccv_nnc_tensor_free(c);
200
0
  ccv_nnc_tensor_free(gc);
201
0
  ccv_nnc_tensor_free(bias);
202
0
  ccv_nnc_tensor_free(w);
203
0
  ccv_nnc_tensor_free(b);
204
0
  ccv_nnc_tensor_free(a);
205
0
  ccv_nnc_tensor_free(gbias);
206
0
  ccv_nnc_tensor_free(gwo);
207
0
  ccv_nnc_tensor_free(ga);
208
0
}
209
210
TEST_CASE("mps forward convolution in nchw format with 1x1 kernel")
211
1
{
212
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
213
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
214
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
215
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
216
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
217
0
  assert(cmd.backend >= 0);
218
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
219
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
220
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
221
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
222
  // configure the inlets.
223
0
  dsfmt_t dsfmt;
224
0
  dsfmt_init_gen_rand(&dsfmt, 0);
225
0
  int i;
226
0
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
227
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
228
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
229
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
230
0
  for (i = 0; i < OUTPUT_DIM; i++)
231
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
232
  // Copy generated matrix values over to GPU.
233
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
234
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
235
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
236
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
237
0
  move.backend = CCV_NNC_BACKEND_MPS;
238
0
  assert(move.backend >= 0);
239
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
240
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
241
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
242
243
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
244
0
  transform.backend = CCV_NNC_BACKEND_MPS;
245
0
  assert(transform.backend >= 0);
246
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
247
0
  assert(cmd.backend >= 0);
248
0
  cmd.algorithm = -1;
249
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
250
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
251
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
252
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
253
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
254
0
  ccv_nnc_tensor_free(c);
255
0
  ccv_nnc_tensor_free(gc);
256
0
  ccv_nnc_tensor_free(bias);
257
0
  ccv_nnc_tensor_free(w);
258
0
  ccv_nnc_tensor_free(b);
259
0
  ccv_nnc_tensor_free(a);
260
0
  ccv_nnc_tensor_free(gbias);
261
0
  ccv_nnc_tensor_free(gw);
262
0
  ccv_nnc_tensor_free(ga);
263
0
}
264
265
TEST_CASE("mps forward convolution in half precision")
266
1
{
267
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
268
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
269
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
270
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
271
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
272
0
  assert(cmd.backend >= 0);
273
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
274
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
275
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
276
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
277
  // configure the inlets.
278
0
  dsfmt_t dsfmt;
279
0
  dsfmt_init_gen_rand(&dsfmt, 0);
280
0
  int i;
281
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
282
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
283
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
284
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
285
0
  for (i = 0; i < OUTPUT_DIM; i++)
286
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
287
0
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
288
0
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
289
0
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
290
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
291
  // Copy generated matrix values over to GPU.
292
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
293
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
294
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
295
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
296
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
297
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
298
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
299
300
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
301
0
  transform.backend = CCV_NNC_BACKEND_MPS;
302
0
  assert(transform.backend >= 0);
303
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
304
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
305
0
  ccv_nnc_stream_context_wait(stream_context);
306
0
  ccv_nnc_tensor_free(gw);
307
308
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
309
0
  assert(cmd.backend >= 0);
310
0
  cmd.algorithm = -1;
311
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
312
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
313
0
  ccv_nnc_stream_context_wait(stream_context);
314
0
  ccv_nnc_stream_context_free(stream_context);
315
0
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
316
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
317
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
318
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
319
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from mps should match from CPU");
320
0
  ccv_nnc_tensor_free(c);
321
0
  ccv_nnc_tensor_free(gc);
322
0
  ccv_nnc_tensor_free(bias);
323
0
  ccv_nnc_tensor_free(w);
324
0
  ccv_nnc_tensor_free(b);
325
0
  ccv_nnc_tensor_free(a);
326
0
  ccv_nnc_tensor_free(c1);
327
0
  ccv_nnc_tensor_free(bias1);
328
0
  ccv_nnc_tensor_free(w1);
329
0
  ccv_nnc_tensor_free(a1);
330
0
  ccv_nnc_tensor_free(gbias);
331
0
  ccv_nnc_tensor_free(gwo);
332
0
  ccv_nnc_tensor_free(ga);
333
0
}
334
335
TEST_CASE("mps forward convolution with dilation 2, 3")
336
1
{
337
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
338
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
339
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
340
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
341
0
  cmd.info.convolution.dilation[0] = 2;
342
0
  cmd.info.convolution.dilation[1] = 3;
343
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
344
0
  assert(cmd.backend >= 0);
345
0
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
346
0
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
347
0
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
348
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, b->info);
349
0
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, b->info) == 0);
350
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
351
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
352
  // configure the inlets.
353
0
  dsfmt_t dsfmt;
354
0
  dsfmt_init_gen_rand(&dsfmt, 0);
355
0
  int i;
356
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
357
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
358
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
359
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
360
0
  for (i = 0; i < OUTPUT_DIM; i++)
361
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
362
  // Copy generated matrix values over to GPU.
363
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
364
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
365
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
366
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
367
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
368
0
  move.backend = CCV_NNC_BACKEND_MPS;
369
0
  assert(move.backend >= 0);
370
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
371
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
372
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
373
374
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
375
0
  transform.backend = CCV_NNC_BACKEND_MPS;
376
0
  assert(transform.backend >= 0);
377
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
378
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
379
0
  ccv_nnc_stream_context_wait(stream_context);
380
0
  ccv_nnc_tensor_free(gw);
381
382
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
383
0
  assert(cmd.backend >= 0);
384
0
  cmd.algorithm = -1;
385
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
386
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
387
0
  ccv_nnc_stream_context_wait(stream_context);
388
0
  ccv_nnc_stream_context_free(stream_context);
389
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
390
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
391
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
392
0
  ccv_nnc_tensor_free(c);
393
0
  ccv_nnc_tensor_free(gc);
394
0
  ccv_nnc_tensor_free(bias);
395
0
  ccv_nnc_tensor_free(w);
396
0
  ccv_nnc_tensor_free(b);
397
0
  ccv_nnc_tensor_free(a);
398
0
  ccv_nnc_tensor_free(gbias);
399
0
  ccv_nnc_tensor_free(gwo);
400
0
  ccv_nnc_tensor_free(ga);
401
0
}
402
403
TEST_CASE("mps forward convolution 3d")
404
1
{
405
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
406
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
407
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
408
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
409
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
410
0
  hint.stride.dim[0] = 2;
411
0
  hint.border.begin[0] = 1;
412
0
  hint.border.end[0] = 1;
413
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
414
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
415
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
416
  // configure the inlets.
417
0
  dsfmt_t dsfmt;
418
0
  dsfmt_init_gen_rand(&dsfmt, 0);
419
0
  int i;
420
0
  for (i = 0; i < INPUT_DIM * 3 * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
421
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
422
0
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
423
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
424
0
  for (i = 0; i < OUTPUT_DIM; i++)
425
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
426
  // Copy generated matrix values over to GPU.
427
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
428
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
429
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
430
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
431
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
432
0
  move.backend = CCV_NNC_BACKEND_MPS;
433
0
  assert(move.backend >= 0);
434
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
435
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
436
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
437
0
  transform.backend = CCV_NNC_BACKEND_MPS;
438
0
  assert(transform.backend >= 0);
439
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
440
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
441
0
  ccv_nnc_stream_context_wait(stream_context);
442
0
  ccv_nnc_tensor_free(gw);
443
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
444
0
  assert(cmd.backend >= 0);
445
0
  cmd.algorithm = -1;
446
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
447
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
448
0
  ccv_nnc_stream_context_wait(stream_context);
449
0
  ccv_nnc_stream_context_free(stream_context);
450
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
451
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
452
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
453
0
  assert(cmd.backend >= 0);
454
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
455
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from mps should match from CPU");
456
0
  ccv_nnc_tensor_free(c);
457
0
  ccv_nnc_tensor_free(gc);
458
0
  ccv_nnc_tensor_free(bias);
459
0
  ccv_nnc_tensor_free(w);
460
0
  ccv_nnc_tensor_free(b);
461
0
  ccv_nnc_tensor_free(a);
462
0
  ccv_nnc_tensor_free(gbias);
463
0
  ccv_nnc_tensor_free(gwo);
464
0
  ccv_nnc_tensor_free(ga);
465
0
}
466
467
TEST_CASE("mps forward convolution 3d in nchw format")
468
1
{
469
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
470
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
471
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
472
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
473
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
474
0
  hint.stride.dim[0] = 2;
475
0
  hint.border.begin[0] = 1;
476
0
  hint.border.end[0] = 1;
477
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
478
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
479
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM), 0);
480
  // configure the inlets.
481
0
  dsfmt_t dsfmt;
482
0
  dsfmt_init_gen_rand(&dsfmt, 0);
483
0
  int i;
484
0
  for (i = 0; i < 3 * INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
485
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
486
0
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
487
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
488
0
  for (i = 0; i < OUTPUT_DIM; i++)
489
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
490
  // Copy generated matrix values over to GPU.
491
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
492
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
493
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
494
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
495
0
  move.backend = CCV_NNC_BACKEND_MPS;
496
0
  assert(move.backend >= 0);
497
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
498
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
499
500
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
501
0
  transform.backend = CCV_NNC_BACKEND_MPS;
502
0
  assert(transform.backend >= 0);
503
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
504
0
  assert(cmd.backend >= 0);
505
0
  cmd.algorithm = -1;
506
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
507
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
508
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
509
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
510
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
511
0
  assert(cmd.backend >= 0);
512
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
513
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from mps should match from CPU");
514
0
  ccv_nnc_tensor_free(c);
515
0
  ccv_nnc_tensor_free(gc);
516
0
  ccv_nnc_tensor_free(bias);
517
0
  ccv_nnc_tensor_free(w);
518
0
  ccv_nnc_tensor_free(b);
519
0
  ccv_nnc_tensor_free(a);
520
0
  ccv_nnc_tensor_free(gbias);
521
0
  ccv_nnc_tensor_free(gw);
522
0
  ccv_nnc_tensor_free(ga);
523
0
}
524
525
TEST_CASE("compare softmax with mps")
526
1
{
527
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS));
528
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
529
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
530
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
531
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
532
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
533
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
534
0
  ccv_nnc_graph_t* graph = 0;
535
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
536
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
537
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
538
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
539
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
540
0
  dsfmt_t dsfmt;
541
0
  dsfmt_init_gen_rand(&dsfmt, 0);
542
0
  int i;
543
0
  for (i = 0; i < 20 * 10; i++)
544
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
545
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
546
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
547
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
548
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
549
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
550
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
551
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
552
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
553
0
  REQUIRE_TENSOR_EQ(ty, y_tensor, "softmax from mps should match from CPU");
554
0
  ccv_nnc_tensor_free(x_tensor);
555
0
  ccv_nnc_tensor_free(y_tensor);
556
0
  ccv_nnc_tensor_free(ty);
557
0
  ccv_nnc_graph_free(graph);
558
0
  ccv_nnc_tensor_arena_free(tensor_arena);
559
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
560
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
561
0
}
562
563
TEST_CASE("compare softmax with mps in half precision")
564
1
{
565
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS));
566
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
567
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
568
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
569
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
570
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
571
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
572
0
  ccv_nnc_graph_t* graph = 0;
573
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
574
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
575
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
576
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
577
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
578
0
  dsfmt_t dsfmt;
579
0
  dsfmt_init_gen_rand(&dsfmt, 0);
580
0
  int i;
581
0
  for (i = 0; i < 20 * 10; i++)
582
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
583
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
584
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
585
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
586
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
587
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
588
0
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
589
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
590
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
591
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
592
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
593
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
594
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
595
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "softmax from mps should match from CPU");
596
0
  ccv_nnc_tensor_free(x_tensor);
597
0
  ccv_nnc_tensor_free(x16_tensor);
598
0
  ccv_nnc_tensor_free(y16_tensor);
599
0
  ccv_nnc_tensor_free(y_tensor);
600
0
  ccv_nnc_tensor_free(ty);
601
0
  ccv_nnc_graph_free(graph);
602
0
  ccv_nnc_tensor_arena_free(tensor_arena);
603
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
604
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
605
0
}
606
607
TEST_CASE("compare softmax gradient with mps")
608
1
{
609
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS) &&
610
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_BACKWARD, CCV_NNC_BACKEND_MPS));
611
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
612
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
613
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
614
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "softmax");
615
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
616
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
617
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
618
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
619
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
620
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
621
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
622
0
  dsfmt_t dsfmt;
623
0
  dsfmt_init_gen_rand(&dsfmt, 0);
624
0
  int i;
625
0
  for (i = 0; i < 10 * 100; i++)
626
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
627
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
628
0
  for (i = 0; i < 10 * 100; i++)
629
0
    dy_tensor->data.f32[i] = 0;
630
0
  for (i = 0; i < 10; i++)
631
0
    dy_tensor->data.f32[i * 100 + i] = 1;
632
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
633
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
634
0
  ccv_nnc_graph_t* graph = 0;
635
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
636
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
637
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
638
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
639
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
640
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
641
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
642
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
643
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
644
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
645
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
646
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
647
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
648
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
649
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
650
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
651
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
652
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
653
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
654
0
  ccv_nnc_tensor_free(x_tensor);
655
0
  ccv_nnc_tensor_free(y_tensor);
656
0
  ccv_nnc_tensor_free(dx_tensor);
657
0
  ccv_nnc_tensor_free(dy_tensor);
658
0
  ccv_nnc_tensor_free(ty_tensor);
659
0
  ccv_nnc_tensor_free(tdx_tensor);
660
0
  ccv_nnc_tensor_free(dyt);
661
0
  ccv_nnc_graph_free(graph);
662
0
  ccv_nnc_tensor_arena_free(tensor_arena);
663
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
664
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
665
0
}
666
667
TEST_CASE("compare sigmoid with mps")
668
1
{
669
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
670
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
671
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
672
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
673
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
674
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
675
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
676
0
  ccv_nnc_graph_t* graph = 0;
677
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
678
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
679
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
680
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
681
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
682
0
  dsfmt_t dsfmt;
683
0
  dsfmt_init_gen_rand(&dsfmt, 0);
684
0
  int i;
685
0
  for (i = 0; i < 20 * 10; i++)
686
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
687
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
688
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
689
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
690
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
691
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
692
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
693
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
694
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
695
0
  REQUIRE_TENSOR_EQ(ty, y_tensor, "sigmoid from mps should match from CPU");
696
0
  ccv_nnc_tensor_free(x_tensor);
697
0
  ccv_nnc_tensor_free(y_tensor);
698
0
  ccv_nnc_tensor_free(ty);
699
0
  ccv_nnc_graph_free(graph);
700
0
  ccv_nnc_tensor_arena_free(tensor_arena);
701
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
702
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
703
0
}
704
705
TEST_CASE("compare sigmoid with mps in half precision")
706
1
{
707
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
708
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
709
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
710
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
711
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
712
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
713
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
714
0
  ccv_nnc_graph_t* graph = 0;
715
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
716
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
717
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
718
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
719
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
720
0
  dsfmt_t dsfmt;
721
0
  dsfmt_init_gen_rand(&dsfmt, 0);
722
0
  int i;
723
0
  for (i = 0; i < 20 * 10; i++)
724
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
725
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
726
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
727
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
728
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
729
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
730
0
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
731
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
732
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
733
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
734
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
735
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
736
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
737
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "sigmoid from mps should match from CPU");
738
0
  ccv_nnc_tensor_free(x_tensor);
739
0
  ccv_nnc_tensor_free(x16_tensor);
740
0
  ccv_nnc_tensor_free(y16_tensor);
741
0
  ccv_nnc_tensor_free(y_tensor);
742
0
  ccv_nnc_tensor_free(ty);
743
0
  ccv_nnc_graph_free(graph);
744
0
  ccv_nnc_tensor_arena_free(tensor_arena);
745
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
746
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
747
0
}
748
749
750
TEST_CASE("compare sigmoid gradient with mps")
751
1
{
752
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS) &&
753
1
    ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_BACKWARD, CCV_NNC_BACKEND_MPS));
754
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
755
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
756
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
757
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "sigmoid");
758
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
759
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
760
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
761
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
762
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
763
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
764
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
765
0
  dsfmt_t dsfmt;
766
0
  dsfmt_init_gen_rand(&dsfmt, 0);
767
0
  int i;
768
0
  for (i = 0; i < 10 * 100; i++)
769
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
770
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
771
0
  for (i = 0; i < 10 * 100; i++)
772
0
    dy_tensor->data.f32[i] = 0;
773
0
  for (i = 0; i < 10; i++)
774
0
    dy_tensor->data.f32[i * 100 + i] = 1;
775
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
776
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
777
0
  ccv_nnc_graph_t* graph = 0;
778
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
779
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
780
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
781
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
782
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
783
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
784
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
785
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
786
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
787
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
788
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
789
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
790
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
791
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
792
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
793
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
794
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
795
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
796
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
797
0
  ccv_nnc_tensor_free(x_tensor);
798
0
  ccv_nnc_tensor_free(y_tensor);
799
0
  ccv_nnc_tensor_free(dx_tensor);
800
0
  ccv_nnc_tensor_free(dy_tensor);
801
0
  ccv_nnc_tensor_free(ty_tensor);
802
0
  ccv_nnc_tensor_free(tdx_tensor);
803
0
  ccv_nnc_tensor_free(dyt);
804
0
  ccv_nnc_graph_free(graph);
805
0
  ccv_nnc_tensor_arena_free(tensor_arena);
806
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
807
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
808
0
}
809
810
TEST_CASE("compare relu with mps")
811
1
{
812
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_MPS));
813
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
814
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
815
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "y");
816
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
817
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
818
0
  ccv_nnc_graph_t* graph = 0;
819
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
820
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
821
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
822
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
823
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
824
0
  dsfmt_t dsfmt;
825
0
  dsfmt_init_gen_rand(&dsfmt, 0);
826
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
827
0
  int i;
828
0
  for (i = 0; i < 7 * 7 * 10; i++)
829
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
830
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
831
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
832
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
833
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
834
0
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
835
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
836
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
837
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
838
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should equal cpu result");
839
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
840
0
  ccv_nnc_tensor_arena_free(tensor_arena);
841
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
842
0
  ccv_nnc_graph_free(graph);
843
0
  ccv_nnc_tensor_free(x_tensor);
844
0
  ccv_nnc_tensor_free(y_tensor);
845
0
  ccv_nnc_tensor_free(cpu_y);
846
0
}
847
848
TEST_CASE("compare relu with mps in half precision")
849
1
{
850
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_MPS));
851
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
852
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
853
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "y");
854
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
855
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
856
0
  ccv_nnc_graph_t* graph = 0;
857
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
858
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
859
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
860
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
861
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
862
0
  dsfmt_t dsfmt;
863
0
  dsfmt_init_gen_rand(&dsfmt, 0);
864
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
865
0
  int i;
866
0
  for (i = 0; i < 7 * 7 * 10; i++)
867
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
868
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
869
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
870
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
871
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
872
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
873
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
874
0
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
875
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
876
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
877
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
878
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
879
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
880
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 7 * 7 * 10, 1e-3, "mps result should equal cpu result");
881
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
882
0
  ccv_nnc_tensor_arena_free(tensor_arena);
883
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
884
0
  ccv_nnc_graph_free(graph);
885
0
  ccv_nnc_tensor_free(x_tensor);
886
0
  ccv_nnc_tensor_free(x16_tensor);
887
0
  ccv_nnc_tensor_free(y_tensor);
888
0
  ccv_nnc_tensor_free(cpu_y);
889
0
  ccv_nnc_tensor_free(cpu_y16);
890
0
}
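The half-precision variant above relaxes the check to a 1e-3 tolerance because the GPU path round-trips the data through 16-bit floats while the reference stays in 32-bit. As a back-of-the-envelope bound (not taken from this report), half precision keeps 10 mantissa bits, so values drawn from (-1, 1) lose on the order of 5e-4 per conversion. A hypothetical standalone check of the conversion error alone, using the same conversion command the test already uses (x32 would be filled with the same random data as x_tensor above):

  ccv_nnc_tensor_t* const x32 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
  ccv_nnc_tensor_t* const x16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
  ccv_nnc_tensor_t* const x32back = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
  // Down-convert to half precision and back, then compare with the same tolerance.
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x32), TENSOR_LIST(x16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16), TENSOR_LIST(x32back), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, x32->data.f32, x32back->data.f32, 7 * 7 * 10, 1e-3, "round trip should stay within half precision error");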
891
892
TEST_CASE("compare layer norm with mps")
893
1
{
894
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
895
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
896
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
897
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
898
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
899
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
900
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
901
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
902
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "bias");
903
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
904
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
905
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
906
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
907
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
908
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
909
0
  ccv_nnc_graph_t* graph = 0;
910
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
911
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
912
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
913
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
914
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
915
0
  dsfmt_t dsfmt;
916
0
  float xdata[2 * 2 * 2 * 10];
917
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
918
0
  int i;
919
0
  dsfmt_init_gen_rand(&dsfmt, 1);
920
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
921
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
922
0
  float scaledata[1 * 2 * 2 * 10];
923
0
  float biasdata[1 * 2 * 2 * 10];
924
0
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
925
0
  {
926
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
927
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
928
0
  }
929
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
930
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
931
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
932
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
933
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
934
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
935
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
936
0
  ccv_nnc_graph_free(graph);
937
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
938
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
939
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
940
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
941
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "bias");
942
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
943
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
944
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
945
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
946
0
  ccv_nnc_graph_t* cpu_graph = 0;
947
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
948
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
949
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
950
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
951
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
952
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
953
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
954
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
955
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * 10);
956
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
957
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
958
  // Note that MPS and my other implementations treat epsilon differently.
959
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "layer norm result from mps should match the one from the reference implementation");
960
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
961
0
  ccv_nnc_tensor_arena_free(tensor_arena);
962
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
963
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
964
0
  ccv_nnc_graph_free(cpu_graph);
965
0
}
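The source comment before the comparison notes that MPS and the reference kernels treat epsilon differently, which is why the test accepts a 1e-4 tolerance instead of exact equality. One plausible reading of that remark (the exact placement used by either backend is an assumption, not something this report shows) is the difference between adding epsilon inside versus outside the square root:

  /* epsilon added to the variance */           y = (x - mean) / sqrtf(var + 1e-6f);
  /* epsilon added to the standard deviation */ y = (x - mean) / (sqrtf(var) + 1e-6f);

For inputs of order one the two normalizations differ by a small but nonzero amount, consistent with the loose comparisons used in this and the following normalization tests.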
966
967
TEST_CASE("compare layer norm with mps without scale / bias")
968
1
{
969
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
970
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
971
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
972
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
973
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
974
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
975
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
976
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
977
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
978
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
979
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
980
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
981
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
982
0
  ccv_nnc_graph_t* graph = 0;
983
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
984
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
985
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
986
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
987
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
988
0
  dsfmt_t dsfmt;
989
0
  float xdata[2 * 2 * 2 * 10];
990
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
991
0
  int i;
992
0
  dsfmt_init_gen_rand(&dsfmt, 1);
993
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
994
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
995
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
996
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
997
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
998
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
999
0
  ccv_nnc_graph_free(graph);
1000
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1001
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1002
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1003
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1004
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1005
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1006
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1007
0
  ccv_nnc_graph_t* cpu_graph = 0;
1008
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1009
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1010
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1011
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1012
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
1013
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1014
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1015
  // Note that MPS and my other implementations treat epsilon differently.
1016
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "layer norm result from mps should match the one from the reference implementation");
1017
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1018
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1019
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1020
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1021
0
  ccv_nnc_graph_free(cpu_graph);
1022
0
}
1023
1024
TEST_CASE("compare group norm with mps")
1025
1
{
1026
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1027
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1028
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1029
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
1030
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1031
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1032
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
1033
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "scale");
1034
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "bias");
1035
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1036
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1037
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1038
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1039
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1040
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1041
0
  ccv_nnc_graph_t* graph = 0;
1042
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1043
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1044
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1045
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1046
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1047
0
  dsfmt_t dsfmt;
1048
0
  float xdata[2 * 16 * 2 * 10];
1049
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1050
0
  int i;
1051
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1052
0
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1053
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1054
0
  float scaledata[1 * 16 * 2 * 10];
1055
0
  float biasdata[1 * 16 * 2 * 10];
1056
0
  for (i = 0; i < 1 * 16 * 2 * 10; i++)
1057
0
  {
1058
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1059
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1060
0
  }
1061
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1062
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1063
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1064
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1065
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1066
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1067
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1068
0
  ccv_nnc_graph_free(graph);
1069
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1070
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1071
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1072
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "scale");
1073
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "bias");
1074
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1075
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1076
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1077
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1078
0
  ccv_nnc_graph_t* cpu_graph = 0;
1079
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1080
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1081
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1082
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1083
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
1084
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1085
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16 * 2 * 10);
1086
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1087
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16 * 2 * 10);
1088
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1089
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1090
  // Note that MPS and my other implementations treat epsilon differently.
1091
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-3, "group norm result from mps should match the one from the reference implementation");
1092
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1093
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1094
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1095
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1096
0
  ccv_nnc_graph_free(cpu_graph);
1097
0
}
1098
1099
TEST_CASE("compare group norm with mps without scale / bias")
1100
1
{
1101
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1102
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1103
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1104
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
1105
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1106
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1107
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
1108
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1109
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1110
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1111
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1112
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1113
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1114
0
  ccv_nnc_graph_t* graph = 0;
1115
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1116
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1117
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1118
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1119
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1120
0
  dsfmt_t dsfmt;
1121
0
  float xdata[2 * 16 * 2 * 10];
1122
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1123
0
  int i;
1124
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1125
0
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1126
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1127
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1128
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1129
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1130
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1131
0
  ccv_nnc_graph_free(graph);
1132
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1133
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1134
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1135
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1136
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1137
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1138
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1139
0
  ccv_nnc_graph_t* cpu_graph = 0;
1140
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1141
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1142
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1143
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1144
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
1145
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1146
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1147
  // Note that MPS and my other implementations treat epsilon differently.
1148
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-3, "group norm result from mps should match the one from the reference implementation");
1149
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1150
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1151
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1152
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1153
0
  ccv_nnc_graph_free(cpu_graph);
1154
0
}
1155
1156
TEST_CASE("compare rmsnorm with mps")
1157
1
{
1158
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1159
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1160
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1161
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
1162
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1163
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1164
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
1165
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
1166
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1167
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1168
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
1169
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1170
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1171
0
  ccv_nnc_graph_t* graph = 0;
1172
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1173
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1174
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1175
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1176
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1177
0
  dsfmt_t dsfmt;
1178
0
  float xdata[2 * 2 * 2 * 10];
1179
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1180
0
  int i;
1181
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1182
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1183
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1184
0
  float scaledata[1 * 2 * 2 * 10];
1185
0
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
1186
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1187
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1188
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
1189
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1190
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1191
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1192
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1193
0
  ccv_nnc_graph_free(graph);
1194
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1195
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1196
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1197
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
1198
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1199
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
1200
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1201
0
  ccv_nnc_graph_t* cpu_graph = 0;
1202
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1203
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1204
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1205
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1206
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
1207
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1208
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
1209
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1210
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1211
  // Note that MPS and my other implementations treat epsilon differently.
1212
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "rmsnorm result from mps should match the one from the reference implementation");
1213
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1214
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1215
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1216
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1217
0
  ccv_nnc_graph_free(cpu_graph);
1218
0
}
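Unlike the layer norm tests, the rmsnorm graph above saves only an inverse-std-like statistic and no mean, because RMSNorm skips mean subtraction. As a reference point, the standard definition over the normalized axes (1, 2, 3 here) is sketched below; the exact epsilon placement in these backends is an assumption:

  // For each of the 2 leading samples, over the 2 * 2 * 10 normalized elements:
  //   rms  = sqrtf(sum_of_squares / (2 * 2 * 10) + 1e-6f);
  //   y[i] = x[i] / rms * scale[i];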
1219
1220
TEST_CASE("compare add with mps")
1221
1
{
1222
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1223
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1224
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1225
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1226
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
1227
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
1228
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
1229
0
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
1230
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
1231
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1232
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z), "transfer");
1233
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1234
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1235
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1236
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1237
0
  ccv_nnc_graph_t* graph = 0;
1238
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1239
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1240
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1241
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1242
0
  dsfmt_t dsfmt;
1243
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1244
0
  int i;
1245
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1246
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1247
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1248
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1249
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1250
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1251
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1252
0
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
1253
0
  REQUIRE_TENSOR_EQ(zt, z_tensor, "add should match");
1254
0
  ccv_nnc_tensor_free(x_tensor);
1255
0
  ccv_nnc_tensor_free(y_tensor);
1256
0
  ccv_nnc_tensor_free(zt);
1257
0
  ccv_nnc_graph_free(graph);
1258
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1259
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1260
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1261
0
}
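The forward command in this test is a scaled, broadcast add. Reading the two scalars handed to CMD_ADD_FORWARD as per-input coefficients (an inference from how the same command serves as the CPU reference above, not a guarantee documented in this report), both paths should compute, per element:

  // y has width 1, so it is broadcast across the w dimension of x:
  //   z[n][h][w][k] = 0.5 * x[n][h][w][k] + 0.2 * y[n][h][0][k];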
1262
1263
TEST_CASE("compare add with mps in half precision")
1264
1
{
1265
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1266
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1267
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1268
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1269
0
  ccv_nnc_tensor_symbol_t x16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "x 16");
1270
0
  ccv_nnc_tensor_symbol_t y16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 1, 3), "y 16");
1271
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "a");
1272
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 1, 3), "b");
1273
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "c");
1274
0
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
1275
0
  ccv_nnc_tensor_symbol_t z16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "z 16");
1276
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(x16, y16), "convert");
1277
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x16, y16), TENSOR_SYMBOL_LIST(a, b), "transfer");
1278
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1279
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z16), "transfer");
1280
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(z16), TENSOR_SYMBOL_LIST(z), "convert");
1281
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1282
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1283
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1284
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1285
0
  ccv_nnc_graph_t* graph = 0;
1286
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1287
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1288
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1289
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1290
0
  dsfmt_t dsfmt;
1291
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1292
0
  int i;
1293
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1294
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1295
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1296
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1297
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1298
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1299
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1300
0
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
1301
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, zt->data.f32, z_tensor->data.f32, 10 * 5 * 5 * 3, 1e-3, "add should match");
1302
0
  ccv_nnc_tensor_free(x_tensor);
1303
0
  ccv_nnc_tensor_free(y_tensor);
1304
0
  ccv_nnc_tensor_free(zt);
1305
0
  ccv_nnc_graph_free(graph);
1306
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1307
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1308
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1309
0
}
1310
1311
TEST_CASE("compare add gradient with mps")
1312
1
{
1313
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1314
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1315
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1316
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1317
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1318
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
1319
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
1320
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
1321
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
1322
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1323
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1324
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1325
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1326
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1327
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1328
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1329
0
  ccv_nnc_graph_t* graph = 0;
1330
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1331
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1332
0
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
1333
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
1334
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
1335
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1336
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1337
0
  dsfmt_t dsfmt;
1338
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1339
0
  int i;
1340
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1341
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1342
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1343
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1344
0
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1345
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1346
0
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1347
0
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
1348
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
1349
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1350
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1351
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1352
0
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1353
0
  ccv_nnc_tensor_t* dyt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1354
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, dyt), 0);
1355
0
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
1356
0
  ccv_nnc_tensor_t* dy_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
1357
0
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
1358
0
  REQUIRE_TENSOR_EQ(dyt, dy_tensor, "backward pass should match");
1359
0
  ccv_nnc_tensor_free(x_tensor);
1360
0
  ccv_nnc_tensor_free(y_tensor);
1361
0
  ccv_nnc_tensor_free(dct);
1362
0
  ccv_nnc_tensor_free(zt);
1363
0
  ccv_nnc_tensor_free(dxt);
1364
0
  ccv_nnc_tensor_free(dyt);
1365
0
  ccv_nnc_graph_free(graph);
1366
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1367
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1368
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1369
0
}
1370
1371
TEST_CASE("compare add gradient with mps no dyt ")
1372
1
{
1373
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1374
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1375
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1376
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1377
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1378
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
1379
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
1380
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
1381
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
1382
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1383
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1384
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1385
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1386
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1387
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1388
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1389
0
  ccv_nnc_graph_t* graph = 0;
1390
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1391
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1392
0
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
1393
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
1394
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
1395
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1396
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1397
0
  dsfmt_t dsfmt;
1398
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1399
0
  int i;
1400
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1401
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1402
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1403
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1404
0
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1405
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1406
0
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1407
0
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
1408
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
1409
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1410
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1411
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1412
0
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1413
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, 0), 0);
1414
0
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
1415
0
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
1416
0
  ccv_nnc_tensor_free(x_tensor);
1417
0
  ccv_nnc_tensor_free(y_tensor);
1418
0
  ccv_nnc_tensor_free(dct);
1419
0
  ccv_nnc_tensor_free(zt);
1420
0
  ccv_nnc_tensor_free(dxt);
1421
0
  ccv_nnc_graph_free(graph);
1422
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1423
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1424
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1425
0
}
1426
1427
TEST_CASE("broadcasting semantics for add backward mps (a,b)")
1428
1
{
1429
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1430
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1431
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1432
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1433
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1434
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1435
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1436
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1437
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1438
0
  a->data.f32[0] = 1;
1439
0
  a->data.f32[1] = 2;
1440
0
  a->data.f32[2] = 3;
1441
0
  a->data.f32[3] = 4;
1442
0
  b->data.f32[0] = 5;
1443
0
  b->data.f32[1] = 6;
1444
0
  float ctp[] = {
1445
0
    6, 7,
1446
0
    7, 8,
1447
0
    8, 9,
1448
0
    9, 10
1449
0
  };
1450
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
1451
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1452
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1453
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1454
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1455
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1456
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
1457
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
1458
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
1459
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
1460
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
1461
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
1462
0
  ccv_nnc_tensor_free(a);
1463
0
  ccv_nnc_tensor_free(b);
1464
0
  ccv_nnc_tensor_free(c);
1465
0
  ccv_nnc_tensor_free(da);
1466
0
  ccv_nnc_tensor_free(db);
1467
0
  ccv_nnc_tensor_free(dat);
1468
0
  ccv_nnc_tensor_free(dbt);
1469
0
  ccv_nnc_tensor_free(ga);
1470
0
  ccv_nnc_tensor_free(gb);
1471
0
  ccv_nnc_tensor_free(gc);
1472
0
  ccv_nnc_tensor_free(gda);
1473
0
  ccv_nnc_tensor_free(gdb);
1474
0
}
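Since the test only compares the GPU gradients against the same CPU command, it may help to spell out the values involved. Assuming CMD_ADD_BACKWARD(0.5, 0.2) scales the incoming gradient by each input's coefficient and sums it over the broadcast dimensions (inferred from the forward semantics, not stated in the report), the gradient c loaded above yields:

  // da[i] = 0.5 * (c[i][0] + c[i][1])                     -> { 6.5, 7.5, 8.5, 9.5 }
  // db[j] = 0.2 * (c[0][j] + c[1][j] + c[2][j] + c[3][j]) -> { 6.0, 6.8 }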
1475
1476
TEST_CASE("broadcasting semantics for add backward mps (a, nil)")
1477
1
{
1478
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1479
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1480
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1481
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1482
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1483
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1484
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1485
0
  a->data.f32[0] = 1;
1486
0
  a->data.f32[1] = 2;
1487
0
  a->data.f32[2] = 3;
1488
0
  a->data.f32[3] = 4;
1489
0
  b->data.f32[0] = 5;
1490
0
  b->data.f32[1] = 6;
1491
0
  float ctp[] = {
1492
0
    6, 7,
1493
0
    7, 8,
1494
0
    8, 9,
1495
0
    9, 10
1496
0
  };
1497
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
1498
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1499
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1500
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1501
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1502
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
1503
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, ), TENSOR_LIST(gda, ), 0);
1504
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, ), TENSOR_LIST(da, ), 0);
1505
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, ), TENSOR_LIST(dat, ), 0);
1506
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
1507
0
  ccv_nnc_tensor_free(a);
1508
0
  ccv_nnc_tensor_free(b);
1509
0
  ccv_nnc_tensor_free(c);
1510
0
  ccv_nnc_tensor_free(da);
1511
0
  ccv_nnc_tensor_free(dat);
1512
0
  ccv_nnc_tensor_free(ga);
1513
0
  ccv_nnc_tensor_free(gb);
1514
0
  ccv_nnc_tensor_free(gc);
1515
0
  ccv_nnc_tensor_free(gda);
1516
0
}
1517
1518
TEST_CASE("broadcasting semantics for add backward mps (nil,b)")
1519
1
{
1520
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1521
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1522
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1523
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1524
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1525
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1526
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1527
0
  a->data.f32[0] = 1;
1528
0
  a->data.f32[1] = 2;
1529
0
  a->data.f32[2] = 3;
1530
0
  a->data.f32[3] = 4;
1531
0
  b->data.f32[0] = 5;
1532
0
  b->data.f32[1] = 6;
1533
0
  float ctp[] = {
1534
0
    6, 7,
1535
0
    7, 8,
1536
0
    8, 9,
1537
0
    9, 10
1538
0
  };
1539
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
1540
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1541
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1542
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1543
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1544
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
1545
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(0, gdb), 0);
1546
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdb), TENSOR_LIST(db), 0);
1547
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, 0, b), TENSOR_LIST(0, dbt), 0);
1548
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
1549
0
  ccv_nnc_tensor_free(a);
1550
0
  ccv_nnc_tensor_free(b);
1551
0
  ccv_nnc_tensor_free(c);
1552
0
  ccv_nnc_tensor_free(db);
1553
0
  ccv_nnc_tensor_free(dbt);
1554
0
  ccv_nnc_tensor_free(ga);
1555
0
  ccv_nnc_tensor_free(gb);
1556
0
  ccv_nnc_tensor_free(gc);
1557
0
  ccv_nnc_tensor_free(gdb);
1558
0
}
1559
1560
TEST_CASE("compare ewsum with mps")
1561
1
{
1562
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_MPS));
1563
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1564
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1565
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1566
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1567
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1568
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1569
0
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1570
0
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1571
0
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1572
0
  int i;
1573
0
  for (i = 0; i < 100; i++)
1574
0
  {
1575
0
    ha->data.f32[i] = 1;
1576
0
    hb->data.f32[i] = 0.5;
1577
0
    hc->data.f32[i] = 0.25;
1578
0
    gd->data.f32[i] = 1.75;
1579
0
  }
1580
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(a, b, c), 0);
1581
0
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
1582
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd), 0);
1583
0
  REQUIRE_TENSOR_EQ(hd, gd, "ewsum result should be the same");
1584
0
  ccv_nnc_tensor_free(a);
1585
0
  ccv_nnc_tensor_free(b);
1586
0
  ccv_nnc_tensor_free(c);
1587
0
  ccv_nnc_tensor_free(d);
1588
0
  ccv_nnc_tensor_free(ha);
1589
0
  ccv_nnc_tensor_free(hb);
1590
0
  ccv_nnc_tensor_free(hc);
1591
0
  ccv_nnc_tensor_free(hd);
1592
0
  ccv_nnc_tensor_free(gd);
1593
0
}
1594
1595
TEST_CASE("compare ewsum with mps in half precision")
1596
1
{
1597
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_MPS));
1598
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1599
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1600
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1601
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1602
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1603
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1604
0
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1605
0
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1606
0
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1607
0
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1608
0
  ccv_nnc_tensor_t* const hc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1609
0
  ccv_nnc_tensor_t* const hd16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1610
0
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1611
0
  int i;
1612
0
  for (i = 0; i < 100; i++)
1613
0
  {
1614
0
    ha->data.f32[i] = 1;
1615
0
    hb->data.f32[i] = 0.5;
1616
0
    hc->data.f32[i] = 0.25;
1617
0
    gd->data.f32[i] = 1.75;
1618
0
  }
1619
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(ha16, hb16, hc16), 0);
1620
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16, hc16), TENSOR_LIST(a, b, c), 0);
1621
0
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
1622
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd16), 0);
1623
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hd16), TENSOR_LIST(hd), 0);
1624
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hd->data.f32, gd->data.f32, 100, 1e-3, "ewsum result should be the same");
1625
0
  ccv_nnc_tensor_free(a);
1626
0
  ccv_nnc_tensor_free(b);
1627
0
  ccv_nnc_tensor_free(c);
1628
0
  ccv_nnc_tensor_free(d);
1629
0
  ccv_nnc_tensor_free(ha);
1630
0
  ccv_nnc_tensor_free(hb);
1631
0
  ccv_nnc_tensor_free(hc);
1632
0
  ccv_nnc_tensor_free(hd);
1633
0
  ccv_nnc_tensor_free(ha16);
1634
0
  ccv_nnc_tensor_free(hb16);
1635
0
  ccv_nnc_tensor_free(hc16);
1636
0
  ccv_nnc_tensor_free(hd16);
1637
0
  ccv_nnc_tensor_free(gd);
1638
0
}
1639
1640
TEST_CASE("compare transpose two tensor views")
1641
1
{
1642
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
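  // This test carves 4x3x2x2 and 4x2x2x3 views (with non-zero offsets and the parents'
  // strides) out of larger buffers, transposes axes 1 and 3 from one view into the other
  // and back again, first on the CPU and then through MPS, and checks that the round trip
  // reproduces the original window on both paths.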
1643
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
1644
0
  memset(ha->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
1645
0
  ccv_nnc_tensor_view_t ha_view = ccv_nnc_tensor_view(ha, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1646
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
1647
0
  memset(hb->data.f32, 0, sizeof(float) * 8 * 7 * 6 * 5);
1648
0
  ccv_nnc_tensor_view_t hb_view = ccv_nnc_tensor_view(hb, CPU_TENSOR_NHWC(32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
1649
0
  int i, j, k, l;
1650
0
  for (i = 0; i < 4; i++)
1651
0
    for (j = 0; j < 3; j++)
1652
0
      for (k = 0; k < 2; k++)
1653
0
        for (l = 0; l < 2; l++)
1654
0
          ha->data.f32[(i + 3) * 6 * 5 * 4 + (j + 2) * 5 * 4 + (k + 1) * 4 + l] = i * 3 * 2 * 2 + j * 2 * 2 + k * 2 + l;
1655
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&ha_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), 0);
1656
0
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
1657
0
  memset(hd->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
1658
0
  ccv_nnc_tensor_view_t hd_view = ccv_nnc_tensor_view(hd, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1659
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hd_view), 0);
1660
0
  REQUIRE_TENSOR_EQ(hd, ha, "4x3x2x2 tensor should be exactly the same.");
1661
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
1662
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1663
0
  ccv_nnc_tensor_view_t a_view = ccv_nnc_tensor_view(a, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1664
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 8, 7, 6, 5), 0);
1665
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(b), 0);
1666
0
  ccv_nnc_tensor_view_t b_view = ccv_nnc_tensor_view(b, GPU_TENSOR_NHWC(000, 32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
1667
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&a_view), TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), 0);
1668
0
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
1669
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(d), 0);
1670
0
  ccv_nnc_tensor_view_t d_view = ccv_nnc_tensor_view(d, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1671
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), TENSOR_LIST((ccv_nnc_tensor_t*)&d_view), 0);
1672
0
  ccv_nnc_tensor_t* const hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
1673
0
  ccv_nnc_tensor_t* const hdt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
1674
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, d), TENSOR_LIST(hbt, hdt), 0);
1675
0
  REQUIRE_TENSOR_EQ(hbt, hb, "4x2x2x3 tensor should be exactly the same.");
1676
0
  REQUIRE_TENSOR_EQ(hdt, hd, "4x3x2x2 tensor should be exactly the same.");
1677
0
  ccv_nnc_tensor_free(ha);
1678
0
  ccv_nnc_tensor_free(hb);
1679
0
  ccv_nnc_tensor_free(hd);
1680
0
  ccv_nnc_tensor_free(hbt);
1681
0
  ccv_nnc_tensor_free(hdt);
1682
0
  ccv_nnc_tensor_free(a);
1683
0
  ccv_nnc_tensor_free(b);
1684
0
  ccv_nnc_tensor_free(d);
1685
0
}
1686
1687
TEST_CASE("broadcasting semantics for add [[1, 2, 3], [4, 5, 6]] + [7, 8, 9]")
1688
1
{
1689
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1690
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1691
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1692
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1693
0
  a->data.f32[0] = 1;
1694
0
  a->data.f32[1] = 2;
1695
0
  a->data.f32[2] = 3;
1696
0
  a->data.f32[3] = 4;
1697
0
  a->data.f32[4] = 5;
1698
0
  a->data.f32[5] = 6;
1699
0
  b->data.f32[0] = 7;
1700
0
  b->data.f32[1] = 8;
1701
0
  b->data.f32[2] = 9;
1702
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1703
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1704
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1705
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1706
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1707
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1708
0
  float ctp[] = {
1709
0
    8, 10, 12,
1710
0
    11, 13, 15
1711
0
  };
1712
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1713
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1714
0
  ccv_nnc_tensor_free(a);
1715
0
  ccv_nnc_tensor_free(b);
1716
0
  ccv_nnc_tensor_free(c);
1717
0
  ccv_nnc_tensor_free(ga);
1718
0
  ccv_nnc_tensor_free(gb);
1719
0
  ccv_nnc_tensor_free(gc);
1720
0
}
1721
1722
TEST_CASE("broadcasting semantics for add [[1], [2], [3], [4]] + [5, 6]")
1723
1
{
1724
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1725
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1726
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1727
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1728
0
  a->data.f32[0] = 1;
1729
0
  a->data.f32[1] = 2;
1730
0
  a->data.f32[2] = 3;
1731
0
  a->data.f32[3] = 4;
1732
0
  b->data.f32[0] = 5;
1733
0
  b->data.f32[1] = 6;
1734
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1735
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1736
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1737
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1738
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1739
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1740
0
  float ctp[] = {
1741
0
    6, 7,
1742
0
    7, 8,
1743
0
    8, 9,
1744
0
    9, 10
1745
0
  };
1746
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1747
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1748
0
  ccv_nnc_tensor_free(a);
1749
0
  ccv_nnc_tensor_free(b);
1750
0
  ccv_nnc_tensor_free(c);
1751
0
  ccv_nnc_tensor_free(ga);
1752
0
  ccv_nnc_tensor_free(gb);
1753
0
  ccv_nnc_tensor_free(gc);
1754
0
}
1755
1756
TEST_CASE("broadcasting semantics for mul [[1, 2, 3], [4, 5, 6]] * [7, 8, 9]")
1757
1
{
1758
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1759
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1760
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1761
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1762
0
  a->data.f32[0] = 1;
1763
0
  a->data.f32[1] = 2;
1764
0
  a->data.f32[2] = 3;
1765
0
  a->data.f32[3] = 4;
1766
0
  a->data.f32[4] = 5;
1767
0
  a->data.f32[5] = 6;
1768
0
  b->data.f32[0] = 7;
1769
0
  b->data.f32[1] = 8;
1770
0
  b->data.f32[2] = 9;
1771
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1772
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1773
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1774
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1775
0
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1776
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1777
0
  float ctp[] = {
1778
0
    7, 16, 27,
1779
0
    28, 40, 54
1780
0
  };
1781
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1782
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1783
0
  ccv_nnc_tensor_free(a);
1784
0
  ccv_nnc_tensor_free(b);
1785
0
  ccv_nnc_tensor_free(c);
1786
0
  ccv_nnc_tensor_free(ga);
1787
0
  ccv_nnc_tensor_free(gb);
1788
0
  ccv_nnc_tensor_free(gc);
1789
0
}
1790
1791
TEST_CASE("broadcasting semantics for mul [[1], [2], [3], [4]] * [5, 6]")
1792
1
{
1793
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1794
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1795
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1796
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1797
0
  a->data.f32[0] = 1;
1798
0
  a->data.f32[1] = 2;
1799
0
  a->data.f32[2] = 3;
1800
0
  a->data.f32[3] = 4;
1801
0
  b->data.f32[0] = 5;
1802
0
  b->data.f32[1] = 6;
1803
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1804
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1805
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1806
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1807
0
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1808
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1809
0
  float ctp[] = {
1810
0
    5, 6,
1811
0
    10, 12,
1812
0
    15, 18,
1813
0
    20, 24
1814
0
  };
1815
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1816
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1817
0
  ccv_nnc_tensor_free(a);
1818
0
  ccv_nnc_tensor_free(b);
1819
0
  ccv_nnc_tensor_free(c);
1820
0
  ccv_nnc_tensor_free(ga);
1821
0
  ccv_nnc_tensor_free(gb);
1822
0
  ccv_nnc_tensor_free(gc);
1823
0
}
1824
1825
TEST_CASE("scalar mul [[1, 2, 3], [4, 5, 6]] * 0.3")
1826
1
{
1827
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1828
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1829
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1830
0
  a->data.f32[0] = 1;
1831
0
  a->data.f32[1] = 2;
1832
0
  a->data.f32[2] = 3;
1833
0
  a->data.f32[3] = 4;
1834
0
  a->data.f32[4] = 5;
1835
0
  a->data.f32[5] = 6;
1836
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1837
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1838
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ga), 0);
1839
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.3), ccv_nnc_no_hint, 0, TENSOR_LIST(ga), TENSOR_LIST(gc), 0);
1840
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1841
0
  float ctp[] = {
1842
0
    0.3, 0.6, 0.9,
1843
0
    1.2, 1.5, 1.8,
1844
0
  };
1845
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1846
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1847
0
  ccv_nnc_tensor_free(a);
1848
0
  ccv_nnc_tensor_free(c);
1849
0
  ccv_nnc_tensor_free(ga);
1850
0
  ccv_nnc_tensor_free(gc);
1851
0
}
1852
1853
TEST_CASE("scalar mul [[1, 2, 3], [4, 5, 6]] * 0.5, int")
1854
1
{
1855
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1856
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 3), 0);
1857
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 3), 0);
1858
0
  ccv_nnc_tensor_t* const ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 3), 0);
1859
0
  a->data.i32[0] = 1;
1860
0
  a->data.i32[1] = 2;
1861
0
  a->data.i32[2] = 3;
1862
0
  a->data.i32[3] = 4;
1863
0
  a->data.i32[4] = 5;
1864
0
  a->data.i32[5] = 6;
1865
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2, 3), 0);
1866
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2, 3), 0);
1867
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ga), 0);
1868
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(ga), TENSOR_LIST(gc), 0);
1869
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1870
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ct), 0);
1871
0
  REQUIRE_TENSOR_EQ(c, ct, "result should be equal");
1872
0
  ccv_nnc_tensor_free(a);
1873
0
  ccv_nnc_tensor_free(c);
1874
0
  ccv_nnc_tensor_free(ct);
1875
0
  ccv_nnc_tensor_free(ga);
1876
0
  ccv_nnc_tensor_free(gc);
1877
0
}
1878
1879
TEST_CASE("compare average pooling with mps")
1880
1
{
1881
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1882
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1883
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
1884
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
1885
0
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "avg_pool");
1886
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
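  // HINT((2, 2), (1, 1)) requests a stride of 2 with a border/padding of 1, so 5x5 average
  // pooling over the 7x7 input yields the 3x3 output declared for y: (7 + 2*1 - 5) / 2 + 1 = 3.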
1887
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1888
0
  ccv_nnc_graph_t* graph = 0;
1889
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1890
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1891
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1892
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1893
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1894
0
  dsfmt_t dsfmt;
1895
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1896
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1897
0
  int i;
1898
0
  for (i = 0; i < 7 * 7 * 10; i++)
1899
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1900
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1901
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
1902
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1903
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1904
0
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
1905
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1906
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1907
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
1908
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
1909
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1910
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1911
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1912
0
  ccv_nnc_graph_free(graph);
1913
0
  ccv_nnc_tensor_free(x_tensor);
1914
0
  ccv_nnc_tensor_free(y_tensor);
1915
0
  ccv_nnc_tensor_free(cpu_y);
1916
0
}
1917
1918
TEST_CASE("compare average pooling with mps in half precision")
1919
1
{
1920
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1921
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1922
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
1923
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
1924
0
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "avg_pool");
1925
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
1926
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1927
0
  ccv_nnc_graph_t* graph = 0;
1928
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1929
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1930
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1931
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1932
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1933
0
  dsfmt_t dsfmt;
1934
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1935
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1936
0
  int i;
1937
0
  for (i = 0; i < 7 * 7 * 10; i++)
1938
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1939
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
1940
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1941
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
1942
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
1943
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1944
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1945
0
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
1946
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1947
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1948
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
1949
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
1950
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
1951
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "mps result should be equal to cpu result");
1952
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1953
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1954
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1955
0
  ccv_nnc_graph_free(graph);
1956
0
  ccv_nnc_tensor_free(x_tensor);
1957
0
  ccv_nnc_tensor_free(x16_tensor);
1958
0
  ccv_nnc_tensor_free(y_tensor);
1959
0
  ccv_nnc_tensor_free(cpu_y);
1960
0
  ccv_nnc_tensor_free(cpu_y16);
1961
0
}
1962
1963
TEST_CASE("compare max pooling with mps")
1964
1
{
1965
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1966
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1967
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
1968
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
1969
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
1970
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
1971
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1972
0
  ccv_nnc_graph_t* graph = 0;
1973
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1974
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1975
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1976
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1977
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1978
0
  dsfmt_t dsfmt;
1979
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1980
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1981
0
  int i;
1982
0
  for (i = 0; i < 7 * 7 * 10; i++)
1983
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1984
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1985
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
1986
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1987
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1988
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
1989
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1990
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1991
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
1992
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
1993
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1994
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1995
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1996
0
  ccv_nnc_graph_free(graph);
1997
0
  ccv_nnc_tensor_free(x_tensor);
1998
0
  ccv_nnc_tensor_free(y_tensor);
1999
0
  ccv_nnc_tensor_free(cpu_y);
2000
0
}
2001
2002
TEST_CASE("compare max pooling with mps in half precision")
2003
1
{
2004
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
2005
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2006
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
2007
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
2008
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2009
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
2010
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2011
0
  ccv_nnc_graph_t* graph = 0;
2012
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2013
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2014
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2015
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2016
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2017
0
  dsfmt_t dsfmt;
2018
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2019
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2020
0
  int i;
2021
0
  for (i = 0; i < 7 * 7 * 10; i++)
2022
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2023
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2024
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
2025
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2026
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2027
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2028
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2029
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2030
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2031
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
2032
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2033
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2034
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2035
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "mps result should be equal to cpu result");
2036
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2037
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2038
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2039
0
  ccv_nnc_graph_free(graph);
2040
0
  ccv_nnc_tensor_free(x_tensor);
2041
0
  ccv_nnc_tensor_free(x16_tensor);
2042
0
  ccv_nnc_tensor_free(y_tensor);
2043
0
  ccv_nnc_tensor_free(cpu_y);
2044
0
  ccv_nnc_tensor_free(cpu_y16);
2045
0
}
2046
2047
TEST_CASE("compare max pooling 2x2 with mps")
2048
1
{
2049
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
2050
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2051
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 6, 6), "x");
2052
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 3, 3), "y");
2053
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2054
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
2055
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2056
0
  ccv_nnc_graph_t* graph = 0;
2057
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2058
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2059
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2060
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2061
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2062
0
  dsfmt_t dsfmt;
2063
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2064
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
2065
0
  int i, j;
2066
0
  for (i = 0; i < 6 * 6 * 10; i++)
2067
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2068
0
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
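  // The CPU reference below runs in NHWC, so the NCHW input is repacked into gt_x here, and
  // the NHWC result gt_y is repacked back into NCHW further down before comparing against
  // the MPS output.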
2069
0
  for (i = 0; i < 10; i++)
2070
0
    for (j = 0; j < 6 * 6; j++)
2071
0
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
2072
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2073
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2074
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2075
0
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2076
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
2077
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2078
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2079
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
2080
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2081
0
  for (i = 0; i < 10; i++)
2082
0
    for (j = 0; j < 3 * 3; j++)
2083
0
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
2084
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
2085
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2086
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2087
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2088
0
  ccv_nnc_graph_free(graph);
2089
0
  ccv_nnc_tensor_free(x_tensor);
2090
0
  ccv_nnc_tensor_free(y_tensor);
2091
0
  ccv_nnc_tensor_free(cpu_y);
2092
0
}
2093
2094
TEST_CASE("compare max pooling 2x2 with mps in half precision")
2095
1
{
2096
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
2097
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2098
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 6, 6), "x");
2099
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 3, 3), "y");
2100
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2101
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
2102
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2103
0
  ccv_nnc_graph_t* graph = 0;
2104
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2105
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2106
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2107
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2108
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2109
0
  dsfmt_t dsfmt;
2110
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2111
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
2112
0
  int i, j;
2113
0
  for (i = 0; i < 6 * 6 * 10; i++)
2114
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2115
0
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
2116
0
  for (i = 0; i < 10; i++)
2117
0
    for (j = 0; j < 6 * 6; j++)
2118
0
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
2119
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2120
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 6, 6), 0);
2121
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2122
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2123
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2124
0
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2125
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
2126
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2127
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 3, 3), 0);
2128
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2129
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2130
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2131
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2132
0
  for (i = 0; i < 10; i++)
2133
0
    for (j = 0; j < 3 * 3; j++)
2134
0
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
2135
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 10 * 3 * 3, 1e-3, "mps result should be equal to cpu result");
2136
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2137
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2138
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2139
0
  ccv_nnc_graph_free(graph);
2140
0
  ccv_nnc_tensor_free(x_tensor);
2141
0
  ccv_nnc_tensor_free(x16_tensor);
2142
0
  ccv_nnc_tensor_free(y_tensor);
2143
0
  ccv_nnc_tensor_free(cpu_y);
2144
0
  ccv_nnc_tensor_free(cpu_y16);
2145
0
}
2146
2147
2148
TEST_CASE("mps mse mean loss forward")
2149
1
{
2150
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS));
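  // Each of the 10 rows of 100 predictions/targets is reduced to a single mean squared
  // error (the outputs are 10x1), computed once on the CPU and once through MPS before the
  // two results are compared.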
2151
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2152
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2153
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 1), 0);
2154
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2155
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2156
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2157
0
  dsfmt_t dsfmt;
2158
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2159
0
  int i;
2160
0
  for (i = 0; i < 1000; i++)
2161
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2162
0
  for (i = 0; i < 1000; i++)
2163
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2164
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
2165
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2166
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2167
0
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2168
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(tc), 0);
2169
0
  REQUIRE_TENSOR_EQ(tc, hc, "MPS computed output should be the same as CPU computed ones");
2170
0
  ccv_nnc_tensor_free(a);
2171
0
  ccv_nnc_tensor_free(b);
2172
0
  ccv_nnc_tensor_free(c);
2173
0
  ccv_nnc_tensor_free(ha);
2174
0
  ccv_nnc_tensor_free(hb);
2175
0
  ccv_nnc_tensor_free(hc);
2176
0
  ccv_nnc_tensor_free(tc);
2177
0
}
2178
2179
TEST_CASE("mps mse sum loss forward")
2180
1
{
2181
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS));
2182
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2183
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2184
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 1), 0);
2185
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2186
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2187
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2188
0
  dsfmt_t dsfmt;
2189
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2190
0
  int i;
2191
0
  for (i = 0; i < 1000; i++)
2192
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2193
0
  for (i = 0; i < 1000; i++)
2194
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2195
  
2196
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
2197
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2198
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2199
0
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2200
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(tc), 0);
2201
2202
0
  REQUIRE_TENSOR_EQ(tc, hc, "MPS computed output should be the same as CPU computed ones");
2203
0
  ccv_nnc_tensor_free(a);
2204
0
  ccv_nnc_tensor_free(b);
2205
0
  ccv_nnc_tensor_free(c);
2206
0
  ccv_nnc_tensor_free(ha);
2207
0
  ccv_nnc_tensor_free(hb);
2208
0
  ccv_nnc_tensor_free(hc);
2209
0
  ccv_nnc_tensor_free(tc);
2210
0
}
2211
2212
TEST_CASE("mps mse mean loss backward")
2213
1
{
2214
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
2215
1
  ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
2216
2217
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2218
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2219
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2220
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2221
0
  ccv_nnc_tensor_t* db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2222
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2223
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2224
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2225
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2226
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2227
0
  ccv_nnc_tensor_t* hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2228
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2229
0
  dsfmt_t dsfmt;
2230
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2231
0
  int i;
2232
0
  for (i = 0; i < 1000; i++)
2233
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2234
0
  for (i = 0; i < 1000; i++)
2235
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2236
0
  for (i = 0; i < 10; i++)
2237
0
    hg->data.f32[i] = 1;
2238
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2239
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2240
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, hdb), 0);
2241
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2242
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, db), 0);
2243
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2244
0
  ccv_nnc_tensor_t* tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2245
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, db), TENSOR_LIST(tda, tdb), 0);
2246
2247
0
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
2248
0
  REQUIRE_TENSOR_EQ(tdb, hdb, "MPS computed output should be the same as CPU computed ones");
2249
2250
0
  ccv_nnc_tensor_free(a);
2251
0
  ccv_nnc_tensor_free(b);
2252
0
  ccv_nnc_tensor_free(c);
2253
0
  ccv_nnc_tensor_free(da);
2254
0
  ccv_nnc_tensor_free(db);
2255
0
  ccv_nnc_tensor_free(g);
2256
0
  ccv_nnc_tensor_free(ha);
2257
0
  ccv_nnc_tensor_free(hb);
2258
0
  ccv_nnc_tensor_free(hc);
2259
0
  ccv_nnc_tensor_free(hda);
2260
0
  ccv_nnc_tensor_free(hdb);
2261
0
  ccv_nnc_tensor_free(hg);
2262
0
  ccv_nnc_tensor_free(tda);
2263
0
  ccv_nnc_tensor_free(tdb);
2264
0
}
2265
2266
TEST_CASE("mps mse sum loss backward")
2267
1
{
2268
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
2269
1
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
2270
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2271
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2272
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2273
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2274
0
  ccv_nnc_tensor_t* db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2275
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2276
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2277
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2278
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2279
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2280
0
  ccv_nnc_tensor_t* hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2281
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2282
0
  dsfmt_t dsfmt;
2283
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2284
0
  int i;
2285
0
  for (i = 0; i < 1000; i++)
2286
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2287
0
  for (i = 0; i < 1000; i++)
2288
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2289
0
  for (i = 0; i < 10; i++)
2290
0
    hg->data.f32[i] = 1;
2291
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2292
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2293
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, hdb), 0);
2294
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2295
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, db), 0);
2296
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2297
0
  ccv_nnc_tensor_t* tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2298
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, db), TENSOR_LIST(tda, tdb), 0);
2299
0
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
2300
0
  REQUIRE_TENSOR_EQ(tdb, hdb, "MPS computed output should be the same as CPU computed ones");
2301
0
  ccv_nnc_tensor_free(a);
2302
0
  ccv_nnc_tensor_free(b);
2303
0
  ccv_nnc_tensor_free(c);
2304
0
  ccv_nnc_tensor_free(da);
2305
0
  ccv_nnc_tensor_free(db);
2306
0
  ccv_nnc_tensor_free(g);
2307
0
  ccv_nnc_tensor_free(ha);
2308
0
  ccv_nnc_tensor_free(hb);
2309
0
  ccv_nnc_tensor_free(hc);
2310
0
  ccv_nnc_tensor_free(hda);
2311
0
  ccv_nnc_tensor_free(hdb);
2312
0
  ccv_nnc_tensor_free(hg);
2313
0
  ccv_nnc_tensor_free(tda);
2314
0
  ccv_nnc_tensor_free(tdb);
2315
0
}
2316
2317
2318
TEST_CASE("mps mse sum loss backward (no output db)")
2319
1
{
2320
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
2321
1
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
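  // Same as the test above, except the second gradient output (db) is passed as 0 on both
  // the CPU and MPS paths, exercising the case where the kernel only has to produce da.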
2322
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2323
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2324
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2325
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2326
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2327
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2328
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2329
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2330
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2331
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2332
0
  dsfmt_t dsfmt;
2333
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2334
0
  int i;
2335
0
  for (i = 0; i < 1000; i++)
2336
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2337
0
  for (i = 0; i < 1000; i++)
2338
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2339
0
  for (i = 0; i < 10; i++)
2340
0
    hg->data.f32[i] = 1;
2341
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2342
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2343
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, 0), 0);
2344
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2345
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, 0), 0);
2346
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2347
0
  ccv_nnc_tensor_t* tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2348
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, 0), TENSOR_LIST(tda, 0), 0);
2349
0
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
2350
0
  ccv_nnc_tensor_free(a);
2351
0
  ccv_nnc_tensor_free(b);
2352
0
  ccv_nnc_tensor_free(c);
2353
0
  ccv_nnc_tensor_free(da);
2354
0
  ccv_nnc_tensor_free(g);
2355
0
  ccv_nnc_tensor_free(ha);
2356
0
  ccv_nnc_tensor_free(hb);
2357
0
  ccv_nnc_tensor_free(hc);
2358
0
  ccv_nnc_tensor_free(hda);
2359
0
  ccv_nnc_tensor_free(hg);
2360
0
  ccv_nnc_tensor_free(tda);
2361
0
  ccv_nnc_tensor_free(tdb);
2362
0
}
2363
2364
TEST_CASE("mps leaky relu gradient in float")
2365
1
{
2366
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LEAKY_RELU_FORWARD, CCV_NNC_BACKEND_MPS) &&
2367
1
    ccv_nnc_cmd_ok(CCV_NNC_LEAKY_RELU_BACKWARD, CCV_NNC_BACKEND_MPS));
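  // Builds a one-node symbolic graph, asks for d(y)/d(x), binds an explicit upstream
  // gradient dy (zero everywhere except a 1 at position i * 100 + i for each row i), runs
  // it on MPS, and checks both the forward output and dx against direct CPU executions of
  // the leaky relu forward and backward commands.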
2368
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2369
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
2370
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
2371
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LEAKY_RELU_FORWARD(0.2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "leaky relu");
2372
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2373
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2374
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2375
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2376
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
2377
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
2378
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2379
0
  dsfmt_t dsfmt;
2380
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2381
0
  int i;
2382
0
  for (i = 0; i < 10 * 100; i++)
2383
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2384
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2385
0
  for (i = 0; i < 10 * 100; i++)
2386
0
    dy_tensor->data.f32[i] = 0;
2387
0
  for (i = 0; i < 10; i++)
2388
0
    dy_tensor->data.f32[i * 100 + i] = 1;
2389
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2390
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
2391
0
  ccv_nnc_graph_t* graph = 0;
2392
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2393
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2394
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2395
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2396
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2397
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2398
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2399
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2400
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
2401
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2402
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2403
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
2404
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
2405
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2406
0
  ccv_nnc_cmd_exec(CMD_LEAKY_RELU_FORWARD(0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
2407
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
2408
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2409
0
  ccv_nnc_cmd_exec(CMD_LEAKY_RELU_BACKWARD(0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, y_tensor), TENSOR_LIST(tdx_tensor), 0);
2410
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
2411
0
  ccv_nnc_tensor_free(x_tensor);
2412
0
  ccv_nnc_tensor_free(y_tensor);
2413
0
  ccv_nnc_tensor_free(dx_tensor);
2414
0
  ccv_nnc_tensor_free(dy_tensor);
2415
0
  ccv_nnc_tensor_free(ty_tensor);
2416
0
  ccv_nnc_tensor_free(tdx_tensor);
2417
0
  ccv_nnc_tensor_free(dyt);
2418
0
  ccv_nnc_graph_free(graph);
2419
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2420
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2421
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2422
0
}
2423
2424
TEST_CASE("compare layer norm gradient with mps")
2425
1
{
2426
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2427
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2428
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2429
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2430
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2431
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2432
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
2433
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "bias");
2434
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2435
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2436
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2437
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2438
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2439
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2440
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2441
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2442
0
  ccv_nnc_graph_t* graph = 0;
2443
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2444
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2445
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2446
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2447
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2448
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2449
0
  dsfmt_t dsfmt;
2450
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2451
0
  int i;
2452
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2453
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2454
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2455
2456
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2457
0
  float scaledata[1 * 2 * 2 * LN_DIM];
2458
0
  float biasdata[1 * 2 * 2 * LN_DIM];
2459
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2460
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2461
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2462
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2463
2464
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2465
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2466
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
2467
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2468
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2469
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2470
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2471
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2472
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2473
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2474
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2475
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2476
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2477
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
2478
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2479
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2480
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
2481
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2482
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2483
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2484
0
  ccv_nnc_graph_free(graph);
2485
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2486
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2487
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2488
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
2489
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "bias");
2490
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2491
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2492
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2493
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2494
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2495
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2496
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2497
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2498
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2499
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
2500
0
  ccv_nnc_graph_t* cpu_graph = 0;
2501
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2502
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2503
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2504
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2505
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2506
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2507
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2508
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2509
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2510
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
2511
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2512
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2513
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2514
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
2515
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2516
2517
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2518
0
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from mps should match the one from reference implementation");
2519
0
  REQUIRE_TENSOR_EQ(dbias_tensor, dcbias_tensor, "layer norm bias gradient result from mps should match the one from reference implementation");
2520
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2521
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2522
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2523
0
  ccv_nnc_graph_free(cpu_graph);
2524
0
  ccv_nnc_tensor_free(x_tensor);
2525
0
  ccv_nnc_tensor_free(dy_tensor);
2526
0
  ccv_nnc_tensor_free(dx_tensor);
2527
0
  ccv_nnc_tensor_free(dscale_tensor);
2528
0
  ccv_nnc_tensor_free(dbias_tensor);
2529
0
}
2530
2531
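// Same comparison as above, but the backward pass only requests gradients for the input and
// scale, exercising the no-bias gradient path.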
TEST_CASE("compare layer norm gradient with mps (no bias)")
2532
1
{
2533
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2534
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2535
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2536
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2537
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2538
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2539
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
2540
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "bias");
2541
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2542
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2543
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2544
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2545
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2546
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2547
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2548
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2549
0
  ccv_nnc_graph_t* graph = 0;
2550
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2551
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2552
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2553
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2554
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2555
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2556
0
  dsfmt_t dsfmt;
2557
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2558
0
  int i;
2559
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2560
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2561
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2562
2563
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2564
0
  float scaledata[1 * 2 * 2 * LN_DIM];
2565
0
  float biasdata[1 * 2 * 2 * LN_DIM];
2566
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2567
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2568
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2569
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2570
2571
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2572
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2573
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
2574
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2575
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2576
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2577
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2578
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2579
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2580
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2581
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2582
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2583
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2584
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2585
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
2586
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2587
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2588
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2589
0
  ccv_nnc_graph_free(graph);
2590
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2591
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2592
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2593
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
2594
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "bias");
2595
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2596
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2597
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2598
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2599
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2600
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2601
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2602
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2603
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2604
0
  ccv_nnc_graph_t* cpu_graph = 0;
2605
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2606
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2607
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2608
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2609
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2610
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2611
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2612
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2613
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2614
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2615
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2616
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2617
2618
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2619
0
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from mps should match the one from reference implementation");
2620
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2621
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2622
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2623
0
  ccv_nnc_graph_free(cpu_graph);
2624
0
  ccv_nnc_tensor_free(x_tensor);
2625
0
  ccv_nnc_tensor_free(dy_tensor);
2626
0
  ccv_nnc_tensor_free(dx_tensor);
2627
0
  ccv_nnc_tensor_free(dscale_tensor);
2628
0
}
2629
2630
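// Layer norm with elementwise affine disabled (no scale / bias inputs); only the input
// gradient is produced and compared against the CPU reference.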
TEST_CASE("compare layer norm gradient with mps without scale / bias")
2631
1
{
2632
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2633
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2634
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2635
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2636
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2637
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2638
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2639
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2640
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2641
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2642
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2643
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2644
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2645
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2646
0
  ccv_nnc_graph_t* graph = 0;
2647
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2648
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2649
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2650
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2651
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2652
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2653
0
  dsfmt_t dsfmt;
2654
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2655
0
  int i;
2656
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2657
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2658
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2659
2660
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2661
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2662
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2663
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2664
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2665
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2666
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2667
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2668
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2669
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2670
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2671
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2672
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2673
0
  ccv_nnc_graph_free(graph);
2674
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2675
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2676
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2677
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2678
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2679
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2680
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2681
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2682
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2683
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2684
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2685
0
  ccv_nnc_graph_t* cpu_graph = 0;
2686
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2687
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2688
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2689
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2690
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2691
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2692
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2693
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2694
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2695
2696
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2697
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2698
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2699
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2700
0
  ccv_nnc_graph_free(cpu_graph);
2701
0
  ccv_nnc_tensor_free(x_tensor);
2702
0
  ccv_nnc_tensor_free(dy_tensor);
2703
0
  ccv_nnc_tensor_free(dx_tensor);
2704
0
}
2705
2706
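// Mirrors the previous case through the (no bias) backward path; again only the input
// gradient is compared.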
TEST_CASE("compare layer norm gradient with mps (no bias) without scale / bias")
2707
1
{
2708
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2709
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2710
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2711
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2712
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2713
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2714
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2715
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2716
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2717
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2718
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2719
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2720
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2721
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2722
0
  ccv_nnc_graph_t* graph = 0;
2723
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2724
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2725
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2726
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2727
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2728
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2729
0
  dsfmt_t dsfmt;
2730
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2731
0
  int i;
2732
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2733
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2734
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2735
2736
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2737
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2738
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2739
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2740
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2741
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2742
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2743
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2744
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2745
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2746
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2747
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2748
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2749
0
  ccv_nnc_graph_free(graph);
2750
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2751
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2752
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2753
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2754
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2755
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2756
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2757
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2758
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2759
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2760
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2761
0
  ccv_nnc_graph_t* cpu_graph = 0;
2762
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2763
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2764
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2765
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2766
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2767
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2768
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2769
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2770
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2771
2772
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2773
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2774
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2775
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2776
0
  ccv_nnc_graph_free(cpu_graph);
2777
0
  ccv_nnc_tensor_free(x_tensor);
2778
0
  ccv_nnc_tensor_free(dy_tensor);
2779
0
  ccv_nnc_tensor_free(dx_tensor);
2780
0
}
2781
2782
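// RMSNorm gradient check: run forward and backward on the MPS backend, then compare the
// input and scale gradients against the CPU reference implementation.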
TEST_CASE("compare rmsnorm gradient with mps")
2783
1
{
2784
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2785
1
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2786
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2787
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2788
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2789
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2790
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
2791
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2792
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
2793
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2794
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2795
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2796
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2797
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2798
0
  ccv_nnc_graph_t* graph = 0;
2799
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2800
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2801
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2802
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2803
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2804
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2805
0
  dsfmt_t dsfmt;
2806
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2807
0
  int i;
2808
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2809
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2810
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2811
2812
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2813
0
  float scaledata[1 * 2 * 2 * LN_DIM];
2814
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2815
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2816
2817
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2818
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
2819
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2820
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2821
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2822
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2823
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2824
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2825
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2826
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2827
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2828
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2829
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2830
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
2831
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2832
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2833
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2834
0
  ccv_nnc_graph_free(graph);
2835
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2836
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2837
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2838
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
2839
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2840
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
2841
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2842
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2843
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2844
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2845
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2846
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2847
0
  ccv_nnc_graph_t* cpu_graph = 0;
2848
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2849
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2850
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2851
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2852
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2853
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2854
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2855
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2856
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2857
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2858
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2859
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2860
2861
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "rmsnorm gradient result from mps should match the one from reference implementation");
2862
0
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "rmsnorm scale gradient result from mps should match the one from reference implementation");
2863
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2864
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2865
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2866
0
  ccv_nnc_graph_free(cpu_graph);
2867
0
  ccv_nnc_tensor_free(x_tensor);
2868
0
  ccv_nnc_tensor_free(dy_tensor);
2869
0
  ccv_nnc_tensor_free(dx_tensor);
2870
0
  ccv_nnc_tensor_free(dscale_tensor);
2871
0
}
2872
2873
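// Backward convolution on MPS with all tensors transformed to NCHW; gradients for the input,
// weights and bias are transformed back and compared against the CPU reference within tolerance.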
TEST_CASE("mps backward convolution in nchw format")
2874
1
{
2875
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
2876
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2877
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2878
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2879
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
2880
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2881
0
  assert(cmd.backend >= 0);
2882
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
2883
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
2884
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2885
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2886
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
2887
  // configure the inlets.
2888
0
  dsfmt_t dsfmt;
2889
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2890
0
  int i;
2891
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
2892
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
2893
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2894
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2895
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2896
0
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
2897
  // Copy generated matrix values over to GPU.
2898
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2899
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2900
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2901
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2902
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2903
0
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2904
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2905
0
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
2906
0
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
2907
0
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
2908
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2909
0
  ccv_nnc_tensor_t* gbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
2910
0
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2911
0
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
2912
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
2913
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
2914
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
2915
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2916
2917
0
  assert(cmd.backend >= 0);
2918
0
  cmd.algorithm = -1;
2919
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2920
2921
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
2922
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context));
2923
0
  ccv_nnc_stream_context_wait(stream_context);
2924
0
  ccv_nnc_stream_context_free(stream_context);
2925
0
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2926
0
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2927
0
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, OUTPUT_DIM, 1, 1), 0);
2928
2929
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
2930
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
2931
2932
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
2933
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
2934
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
2935
0
  ccv_nnc_tensor_free(gao);
2936
0
  ccv_nnc_tensor_free(ggo);
2937
0
  ccv_nnc_tensor_free(gho);
2938
0
  ccv_nnc_tensor_free(gwo);
2939
0
  ccv_nnc_tensor_free(gbiaso);
2940
0
  ccv_nnc_tensor_free(gdwo);
2941
0
  ccv_nnc_tensor_free(gdbiaso);
2942
0
  ccv_nnc_tensor_free(h);
2943
0
  ccv_nnc_tensor_free(gh);
2944
0
  ccv_nnc_tensor_free(w);
2945
0
  ccv_nnc_tensor_free(g);
2946
0
  ccv_nnc_tensor_free(a);
2947
0
  ccv_nnc_tensor_free(gbias);
2948
0
  ccv_nnc_tensor_free(gdbias);
2949
0
  ccv_nnc_tensor_free(gdw);
2950
0
  ccv_nnc_tensor_free(gw);
2951
0
  ccv_nnc_tensor_free(gg);
2952
0
  ccv_nnc_tensor_free(ga);
2953
0
  ccv_nnc_tensor_free(ch);
2954
0
  ccv_nnc_tensor_free(cdw);
2955
0
  ccv_nnc_tensor_free(cdbias);
2956
0
}
2957
2958
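// Backward convolution on MPS keeping activations in NHWC; only the weights are transformed
// to NCHW, and the resulting gradients are compared against the CPU reference.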
TEST_CASE("mps backward convolution in nhwc format")
2959
1
{
2960
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
2961
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2962
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2963
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2964
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
2965
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2966
0
  assert(cmd.backend >= 0);
2967
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
2968
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
2969
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2970
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2971
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
2972
  // configure the inlets.
2973
0
  dsfmt_t dsfmt;
2974
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2975
0
  int i;
2976
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
2977
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
2978
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2979
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2980
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2981
0
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
2982
  // Copy generated matrix values over to GPU.
2983
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2984
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2985
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2986
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2987
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2988
0
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2989
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2990
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2991
0
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2992
2993
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
2994
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
2995
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), 0);
2996
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2997
2998
0
  assert(cmd.backend >= 0);
2999
0
  cmd.algorithm = -1;
3000
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
3001
3002
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(gg, ga, gwo), TENSOR_LIST(gh, gdwo, gdbias), stream_context);
3003
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(gg, ga, gwo), TENSOR_LIST(gh, gdwo, gdbias), stream_context));
3004
0
  ccv_nnc_stream_context_wait(stream_context);
3005
0
  ccv_nnc_stream_context_free(stream_context);
3006
0
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3007
0
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3008
0
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
3009
  
3010
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdwo), TENSOR_LIST(gdw), 0);
3011
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
3012
3013
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
3014
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
3015
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
3016
3017
0
  ccv_nnc_tensor_free(gwo);
3018
0
  ccv_nnc_tensor_free(gdwo);
3019
0
  ccv_nnc_tensor_free(h);
3020
0
  ccv_nnc_tensor_free(gh);
3021
0
  ccv_nnc_tensor_free(w);
3022
0
  ccv_nnc_tensor_free(g);
3023
0
  ccv_nnc_tensor_free(a);
3024
0
  ccv_nnc_tensor_free(gbias);
3025
0
  ccv_nnc_tensor_free(gdbias);
3026
0
  ccv_nnc_tensor_free(gdw);
3027
0
  ccv_nnc_tensor_free(gw);
3028
0
  ccv_nnc_tensor_free(gg);
3029
0
  ccv_nnc_tensor_free(ga);
3030
0
  ccv_nnc_tensor_free(ch);
3031
0
  ccv_nnc_tensor_free(cdw);
3032
0
  ccv_nnc_tensor_free(cdbias);
3033
0
}
3034
3035
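// Backward convolution with dilation 2 and 3 along the two spatial dimensions; the hint is
// derived from the dilated kernel size before comparing MPS against the CPU reference.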
TEST_CASE("mps backward convolution in nchw format with dilation 2, 3")
3036
1
{
3037
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
3038
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3039
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3040
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
3041
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
3042
0
  cmd.info.convolution.dilation[0] = 2;
3043
0
  cmd.info.convolution.dilation[1] = 3;
3044
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
3045
0
  assert(cmd.backend >= 0);
3046
0
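  // Compute the effective (dilated) kernel size, (k - 1) * dilation + 1, so that hint auto /
  // verification see the footprint the dilated convolution actually covers.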
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
3047
0
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
3048
0
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
3049
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, g->info);
3050
0
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, g->info) == 0);
3051
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3052
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3053
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
3054
  // configure the inlets.
3055
0
  dsfmt_t dsfmt;
3056
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3057
0
  int i;
3058
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
3059
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
3060
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
3061
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3062
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
3063
0
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
3064
  // Copy generated matrix values over to GPU.
3065
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3066
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
3067
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3068
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3069
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
3070
0
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3071
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
3072
0
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
3073
0
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
3074
0
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
3075
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
3076
0
  ccv_nnc_tensor_t* gbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
3077
0
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
3078
0
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
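  // The *o tensors are NCHW mirrors of the NHWC tensors above; the MPS backward below runs
  // on these so that the NCHW layout path of the kernel is what actually gets exercised.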
3079
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
3080
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
3081
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
3082
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
3083
3084
0
  assert(cmd.backend >= 0);
3085
0
  cmd.algorithm = -1;
3086
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
3087
3088
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
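  // Autotune gets a 1 GiB workspace budget to settle on a concrete algorithm (cmd.algorithm
  // was reset to -1 above) before the checked execution that follows.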
3089
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context));
3090
0
  ccv_nnc_stream_context_wait(stream_context);
3091
0
  ccv_nnc_stream_context_free(stream_context);
3092
0
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3093
0
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3094
0
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, OUTPUT_DIM, 1, 1), 0);
3095
3096
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
3097
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
3098
3099
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
3100
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
3101
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
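  // dw and dbias accumulate contributions over the whole 16-image batch, which presumably is
  // why they get a much looser absolute tolerance (5e-1) than the per-element input gradient h (1e-4).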
3102
0
  ccv_nnc_tensor_free(gao);
3103
0
  ccv_nnc_tensor_free(ggo);
3104
0
  ccv_nnc_tensor_free(gho);
3105
0
  ccv_nnc_tensor_free(gwo);
3106
0
  ccv_nnc_tensor_free(gbiaso);
3107
0
  ccv_nnc_tensor_free(gdwo);
3108
0
  ccv_nnc_tensor_free(gdbiaso);
3109
0
  ccv_nnc_tensor_free(h);
3110
0
  ccv_nnc_tensor_free(gh);
3111
0
  ccv_nnc_tensor_free(w);
3112
0
  ccv_nnc_tensor_free(g);
3113
0
  ccv_nnc_tensor_free(a);
3114
0
  ccv_nnc_tensor_free(gbias);
3115
0
  ccv_nnc_tensor_free(gdbias);
3116
0
  ccv_nnc_tensor_free(gdw);
3117
0
  ccv_nnc_tensor_free(gw);
3118
0
  ccv_nnc_tensor_free(gg);
3119
0
  ccv_nnc_tensor_free(ga);
3120
0
  ccv_nnc_tensor_free(ch);
3121
0
  ccv_nnc_tensor_free(cdw);
3122
0
  ccv_nnc_tensor_free(cdbias);
3123
0
}
3124
3125
TEST_CASE("compare group norm gradient with mps")
3126
1
{
3127
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3128
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3129
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3130
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3131
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3132
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3133
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3134
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3135
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3136
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3137
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3138
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3139
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3140
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
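  // The gradients of by with respect to bx, scale and bias are generated symbolically here;
  // an equivalent graph is rebuilt on the CPU backend further down to produce the reference values.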
3141
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3142
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3143
0
  ccv_nnc_graph_t* graph = 0;
3144
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3145
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3146
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3147
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3148
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3149
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3150
0
  dsfmt_t dsfmt;
3151
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3152
0
  int i;
3153
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3154
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3155
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3156
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3157
0
  float scaledata[1 * GN_C_DIM * 2 * LN_DIM];
3158
0
  float biasdata[1 * GN_C_DIM * 2 * LN_DIM];
3159
0
  for (i = 0; i < 1 * GN_C_DIM * 2 * LN_DIM; i++)
3160
0
  {
3161
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3162
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
3163
0
  }
3164
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3165
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3166
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
3167
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3168
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3169
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3170
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3171
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3172
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3173
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3174
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3175
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3176
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3177
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
3178
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
3179
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3180
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3181
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
3182
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3183
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3184
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3185
0
  ccv_nnc_graph_free(graph);
3186
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3187
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3188
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3189
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3190
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3191
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3192
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3193
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3194
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3195
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3196
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3197
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3198
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3199
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
3200
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
3201
0
  ccv_nnc_graph_t* cpu_graph = 0;
3202
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3203
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3204
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3205
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3206
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3207
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3208
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3209
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3210
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
3211
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
3212
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
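  // The CPU reference graph is fed byte-identical copies of x, dy, scale and bias, so the
  // gradients it produces can be compared element-wise against the MPS results below.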
3213
3214
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3215
3216
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3217
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
3218
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
3219
3220
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3221
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias_tensor->data.f32, dcbias_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3222
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3223
3224
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3225
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3226
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3227
0
  ccv_nnc_graph_free(cpu_graph);
3228
0
  ccv_nnc_tensor_free(x_tensor);
3229
0
  ccv_nnc_tensor_free(dy_tensor);
3230
0
  ccv_nnc_tensor_free(dx_tensor);
3231
0
  ccv_nnc_tensor_free(dscale_tensor);
3232
0
  ccv_nnc_tensor_free(dbias_tensor);
3233
0
}
3234
3235
TEST_CASE("compare group norm gradient with mps, variant 1")
3236
1
{
3237
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3238
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3239
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3240
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3241
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3242
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3243
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 1, 1), "scale");
3244
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 1, 1), "bias");
3245
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3246
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
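  // Variant 1: scale and bias are per-channel (1 x GN_C_DIM x 1 x 1) instead of spanning the full
  // 2 x LN_DIM extent, so the affine parameters have to be broadcast across the normalized axes.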
3247
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3248
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3249
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3250
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3251
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3252
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3253
0
  ccv_nnc_graph_t* graph = 0;
3254
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3255
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3256
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3257
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3258
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3259
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3260
0
  dsfmt_t dsfmt;
3261
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3262
0
  int i;
3263
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3264
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3265
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3266
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3267
0
  float scaledata[1 * GN_C_DIM * 1 * 1];
3268
0
  float biasdata[1 * GN_C_DIM * 1 * 1];
3269
0
  for (i = 0; i < 1 * GN_C_DIM * 1 * 1; i++)
3270
0
  {
3271
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3272
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
3273
0
  }
3274
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3275
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3276
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
3277
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3278
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3279
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3280
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3281
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3282
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3283
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3284
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3285
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3286
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3287
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
3288
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
3289
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3290
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3291
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
3292
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3293
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3294
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3295
0
  ccv_nnc_graph_free(graph);
3296
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3297
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3298
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3299
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), "scale");
3300
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), "bias");
3301
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3302
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
3303
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3304
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3305
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3306
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3307
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3308
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3309
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
3310
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
3311
0
  ccv_nnc_graph_t* cpu_graph = 0;
3312
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3313
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3314
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3315
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3316
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3317
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3318
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3319
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3320
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 1 * 1);
3321
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
3322
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * GN_C_DIM * 1 * 1);
3323
3324
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3325
3326
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3327
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
3328
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
3329
3330
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3331
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias_tensor->data.f32, dcbias_tensor->data.f32, 1 * GN_C_DIM * 1 * 1, 1e-5, "group norm output from mps should match from CPU");
3332
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 1 * 1, 1e-5, "group norm output from mps should match from CPU");
3333
3334
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3335
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3336
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3337
0
  ccv_nnc_graph_free(cpu_graph);
3338
0
  ccv_nnc_tensor_free(x_tensor);
3339
0
  ccv_nnc_tensor_free(dy_tensor);
3340
0
  ccv_nnc_tensor_free(dx_tensor);
3341
0
  ccv_nnc_tensor_free(dscale_tensor);
3342
0
  ccv_nnc_tensor_free(dbias_tensor);
3343
0
}
3344
3345
TEST_CASE("compare group norm gradient with mps (no dbias)")
3346
1
{
3347
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3348
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3349
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3350
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3351
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3352
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3353
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3354
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3355
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3356
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3357
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3358
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3359
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
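  // Gradients are only requested for bx and scale here (bias is left out), covering the case
  // where the backward kernel does not have to produce a dbias output.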
3360
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3361
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3362
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3363
0
  ccv_nnc_graph_t* graph = 0;
3364
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3365
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3366
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3367
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3368
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3369
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3370
0
  dsfmt_t dsfmt;
3371
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3372
0
  int i;
3373
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3374
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3375
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3376
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3377
0
  float scaledata[1 * GN_C_DIM * 2 * LN_DIM];
3378
0
  float biasdata[1 * GN_C_DIM * 2 * LN_DIM];
3379
0
  for (i = 0; i < 1 * GN_C_DIM * 2 * LN_DIM; i++)
3380
0
  {
3381
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3382
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
3383
0
  }
3384
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3385
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3386
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
3387
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3388
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3389
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3390
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3391
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3392
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3393
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3394
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3395
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3396
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3397
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
3398
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3399
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
3400
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3401
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3402
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3403
0
  ccv_nnc_graph_free(graph);
3404
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3405
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3406
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3407
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3408
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3409
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3410
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3411
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3412
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3413
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3414
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3415
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3416
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3417
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
3418
0
  ccv_nnc_graph_t* cpu_graph = 0;
3419
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3420
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3421
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3422
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3423
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3424
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3425
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3426
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3427
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
3428
3429
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3430
3431
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3432
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
3433
3434
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3435
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3436
3437
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3438
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3439
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3440
0
  ccv_nnc_graph_free(cpu_graph);
3441
0
  ccv_nnc_tensor_free(x_tensor);
3442
0
  ccv_nnc_tensor_free(dy_tensor);
3443
0
  ccv_nnc_tensor_free(dx_tensor);
3444
0
  ccv_nnc_tensor_free(dscale_tensor);
3445
0
}
3446
3447
TEST_CASE("compare group norm gradient with mps without scale / bias")
3448
1
{
3449
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3450
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3451
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3452
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3453
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3454
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3455
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3456
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3457
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
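  // The trailing 0 in CMD_GROUP_NORM_FORWARD (1 in the earlier tests) together with the absence
  // of scale / bias inputs means this graph covers group norm without the elementwise affine step.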
3458
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3459
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3460
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3461
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3462
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3463
0
  ccv_nnc_graph_t* graph = 0;
3464
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3465
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3466
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3467
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3468
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3469
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3470
0
  dsfmt_t dsfmt;
3471
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3472
0
  int i;
3473
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3474
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3475
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3476
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3477
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3478
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3479
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3480
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3481
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3482
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3483
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3484
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3485
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3486
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3487
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3488
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3489
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3490
0
  ccv_nnc_graph_free(graph);
3491
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3492
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3493
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3494
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3495
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3496
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3497
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3498
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3499
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3500
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3501
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3502
0
  ccv_nnc_graph_t* cpu_graph = 0;
3503
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3504
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3505
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3506
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3507
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3508
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3509
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3510
3511
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3512
3513
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3514
3515
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3516
3517
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3518
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3519
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3520
0
  ccv_nnc_graph_free(cpu_graph);
3521
0
  ccv_nnc_tensor_free(x_tensor);
3522
0
  ccv_nnc_tensor_free(dy_tensor);
3523
0
  ccv_nnc_tensor_free(dx_tensor);
3524
0
}
3525
3526
TEST_CASE("compare group norm gradient with mps, variant 1 without scale / bias")
3527
1
{
3528
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3529
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3530
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3531
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3532
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3533
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3534
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3535
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
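  // Variant 1 without affine: saved_mean and saved_inv_std collapse to one value per group
  // (2 x GN_RC_DIM x 1 x 1) instead of keeping the 2 x LN_DIM spatial extent.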
3536
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3537
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3538
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3539
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3540
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3541
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3542
0
  ccv_nnc_graph_t* graph = 0;
3543
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3544
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3545
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3546
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3547
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3548
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3549
0
  dsfmt_t dsfmt;
3550
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3551
0
  int i;
3552
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3553
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3554
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3555
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3556
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3557
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3558
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3559
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3560
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3561
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3562
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3563
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3564
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3565
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3566
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3567
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3568
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3569
0
  ccv_nnc_graph_free(graph);
3570
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3571
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3572
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3573
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3574
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
3575
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3576
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3577
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3578
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3579
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3580
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3581
0
  ccv_nnc_graph_t* cpu_graph = 0;
3582
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3583
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3584
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3585
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3586
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3587
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3588
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3589
3590
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3591
3592
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3593
3594
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3595
3596
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3597
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3598
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3599
0
  ccv_nnc_graph_free(cpu_graph);
3600
0
  ccv_nnc_tensor_free(x_tensor);
3601
0
  ccv_nnc_tensor_free(dy_tensor);
3602
0
  ccv_nnc_tensor_free(dx_tensor);
3603
0
}
3604
3605
TEST_CASE("compare group norm gradient with mps (no dbias) without scale / bias")
3606
1
{
3607
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3608
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3609
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3610
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3611
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3612
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3613
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3614
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3615
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
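  // Same no-affine setup as the "without scale / bias" case above; with no scale input there is
  // no dbias to produce either, which appears to be what the test name refers to.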
3616
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3617
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3618
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3619
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3620
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3621
0
  ccv_nnc_graph_t* graph = 0;
3622
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3623
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3624
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3625
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3626
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3627
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3628
0
  dsfmt_t dsfmt;
3629
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3630
0
  int i;
3631
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3632
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3633
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3634
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3635
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3636
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3637
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3638
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3639
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3640
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3641
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3642
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3643
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3644
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3645
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3646
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3647
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3648
0
  ccv_nnc_graph_free(graph);
3649
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3650
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3651
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3652
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3653
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3654
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3655
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3656
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3657
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3658
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3659
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3660
0
  ccv_nnc_graph_t* cpu_graph = 0;
3661
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3662
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3663
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3664
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3665
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3666
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3667
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3668
3669
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3670
3671
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3672
3673
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3674
3675
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3676
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3677
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3678
0
  ccv_nnc_graph_free(cpu_graph);
3679
0
  ccv_nnc_tensor_free(x_tensor);
3680
0
  ccv_nnc_tensor_free(dy_tensor);
3681
0
  ccv_nnc_tensor_free(dx_tensor);
3682
0
}
3683
3684
TEST_CASE("broadcasting semantics for mul backward (a,b)")
3685
1
{
3686
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3687
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3688
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3689
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3690
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3691
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3692
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3693
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3694
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3695
0
  a->data.f32[0] = 1;
3696
0
  a->data.f32[1] = 2;
3697
0
  a->data.f32[2] = 3;
3698
0
  a->data.f32[3] = 4;
3699
0
  b->data.f32[0] = 5;
3700
0
  b->data.f32[1] = 6;
3701
0
  float ctp[] = {
3702
0
    6, 7,
3703
0
    7, 8,
3704
0
    8, 9,
3705
0
    9, 10
3706
0
  };
3707
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3708
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3709
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3710
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3711
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3712
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3713
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3714
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
3715
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3716
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
3717
3718
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3719
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3720
0
  ccv_nnc_tensor_free(a);
3721
0
  ccv_nnc_tensor_free(b);
3722
0
  ccv_nnc_tensor_free(c);
3723
0
  ccv_nnc_tensor_free(da);
3724
0
  ccv_nnc_tensor_free(db);
3725
0
  ccv_nnc_tensor_free(dat);
3726
0
  ccv_nnc_tensor_free(dbt);
3727
0
  ccv_nnc_tensor_free(ga);
3728
0
  ccv_nnc_tensor_free(gb);
3729
0
  ccv_nnc_tensor_free(gc);
3730
0
  ccv_nnc_tensor_free(gda);
3731
0
  ccv_nnc_tensor_free(gdb);
3732
0
}
3733
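// A hand-worked reference for the broadcasting test above, assuming CMD_MUL_FORWARD(p)
// computes c = p * a * b, so its backward pass gives da = p * g * b and db = p * g * a,
// each summed over the axis that was broadcast (g is the tensor the test feeds in as c).
// This is a sketch of the arithmetic only, not part of the covered source.
static void sketch_mul_backward_broadcast(void)
{
  const float p = 0.5f;
  const float g[4][2] = { {6, 7}, {7, 8}, {8, 9}, {9, 10} }; // same values the test stores in c
  const float a[4] = {1, 2, 3, 4}; // shape (4, 1)
  const float b[2] = {5, 6};       // shape (2)
  float expected_da[4] = {0, 0, 0, 0};
  float expected_db[2] = {0, 0};
  int i, j;
  for (i = 0; i < 4; i++)
    for (j = 0; j < 2; j++)
    {
      expected_da[i] += p * g[i][j] * b[j]; // reduce over the column a was broadcast along
      expected_db[j] += p * g[i][j] * a[i]; // reduce over the rows b was broadcast along
    }
  (void)expected_da;
  (void)expected_db;
}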
3734
TEST_CASE("broadcasting semantics for mul backward (a, nil)")
3735
1
{
3736
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3737
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3738
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3739
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3740
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3741
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3742
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3743
0
  a->data.f32[0] = 1;
3744
0
  a->data.f32[1] = 2;
3745
0
  a->data.f32[2] = 3;
3746
0
  a->data.f32[3] = 4;
3747
0
  b->data.f32[0] = 5;
3748
0
  b->data.f32[1] = 6;
3749
0
  float ctp[] = {
3750
0
    6, 7,
3751
0
    7, 8,
3752
0
    8, 9,
3753
0
    9, 10
3754
0
  };
3755
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3756
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3757
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3758
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3759
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3760
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3761
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, 0), 0);
3762
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, 0), TENSOR_LIST(da, 0), 0);
3763
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, 0), 0);
3764
3765
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3766
0
  ccv_nnc_tensor_free(a);
3767
0
  ccv_nnc_tensor_free(b);
3768
0
  ccv_nnc_tensor_free(c);
3769
0
  ccv_nnc_tensor_free(da);
3770
0
  ccv_nnc_tensor_free(dat);
3771
0
  ccv_nnc_tensor_free(ga);
3772
0
  ccv_nnc_tensor_free(gb);
3773
0
  ccv_nnc_tensor_free(gc);
3774
0
  ccv_nnc_tensor_free(gda);
3775
0
}
3776
3777
TEST_CASE("broadcasting semantics for mul backward (nil,b)")
3778
1
{
3779
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3780
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3781
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3782
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3783
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3784
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3785
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3786
0
  a->data.f32[0] = 1;
3787
0
  a->data.f32[1] = 2;
3788
0
  a->data.f32[2] = 3;
3789
0
  a->data.f32[3] = 4;
3790
0
  b->data.f32[0] = 5;
3791
0
  b->data.f32[1] = 6;
3792
0
  float ctp[] = {
3793
0
    6, 7,
3794
0
    7, 8,
3795
0
    8, 9,
3796
0
    9, 10
3797
0
  };
3798
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3799
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3800
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3801
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3802
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3803
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3804
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(0, gdb), 0);
3805
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, gdb), TENSOR_LIST(0, db), 0);
3806
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(0, dbt), 0);
3807
3808
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3809
0
  ccv_nnc_tensor_free(a);
3810
0
  ccv_nnc_tensor_free(b);
3811
0
  ccv_nnc_tensor_free(c);
3812
0
  ccv_nnc_tensor_free(db);
3813
0
  ccv_nnc_tensor_free(dbt);
3814
0
  ccv_nnc_tensor_free(ga);
3815
0
  ccv_nnc_tensor_free(gb);
3816
0
  ccv_nnc_tensor_free(gc);
3817
0
  ccv_nnc_tensor_free(gdb);
3818
0
}
3819
3820
TEST_CASE("broadcasting semantics for mul backward (no output db)")
3821
1
{
3822
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3823
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3824
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3825
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3826
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3827
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3828
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3829
0
  a->data.f32[0] = 1;
3830
0
  a->data.f32[1] = 2;
3831
0
  a->data.f32[2] = 3;
3832
0
  a->data.f32[3] = 4;
3833
0
  b->data.f32[0] = 5;
3834
0
  b->data.f32[1] = 6;
3835
0
  float ctp[] = {
3836
0
    6, 7,
3837
0
    7, 8,
3838
0
    8, 9,
3839
0
    9, 10
3840
0
  };
3841
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3842
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3843
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3844
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3845
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3846
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3847
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, 0), 0);
3848
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, 0), TENSOR_LIST(da, 0), 0);
3849
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, 0), 0);
3850
3851
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3852
0
  ccv_nnc_tensor_free(a);
3853
0
  ccv_nnc_tensor_free(b);
3854
0
  ccv_nnc_tensor_free(c);
3855
0
  ccv_nnc_tensor_free(da);
3856
0
  ccv_nnc_tensor_free(dat);
3857
0
  ccv_nnc_tensor_free(ga);
3858
0
  ccv_nnc_tensor_free(gb);
3859
0
  ccv_nnc_tensor_free(gc);
3860
0
  ccv_nnc_tensor_free(gda);
3861
0
}
3862
3863
TEST_CASE("broadcasting semantics for mul backward (no input grad)")
3864
1
{
3865
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3866
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3867
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3868
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3869
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3870
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3871
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3872
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3873
0
  a->data.f32[0] = 1;
3874
0
  a->data.f32[1] = 2;
3875
0
  a->data.f32[2] = 3;
3876
0
  a->data.f32[3] = 4;
3877
0
  b->data.f32[0] = 5;
3878
0
  b->data.f32[1] = 6;
3879
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3880
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3881
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3882
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3883
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3884
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
3885
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3886
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
3887
3888
3889
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3890
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3891
0
  ccv_nnc_tensor_free(a);
3892
0
  ccv_nnc_tensor_free(b);
3893
0
  ccv_nnc_tensor_free(da);
3894
0
  ccv_nnc_tensor_free(db);
3895
0
  ccv_nnc_tensor_free(dat);
3896
0
  ccv_nnc_tensor_free(dbt);
3897
0
  ccv_nnc_tensor_free(ga);
3898
0
  ccv_nnc_tensor_free(gb);
3899
0
  ccv_nnc_tensor_free(gda);
3900
0
  ccv_nnc_tensor_free(gdb);
3901
0
}
3902
3903
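// For the "(no input grad)" test above (and the two variants that follow), the incoming
// gradient slot is left empty (the leading 0 in TENSOR_LIST), which is assumed here to
// behave like an all-ones gradient. Under the same c = p * a * b assumption with p = 0.5
// and the shapes used above (a: 4x1, b: 2), the gradients reduce to constants. A hand
// check of that assumption, not part of the covered source:
static void sketch_mul_backward_no_grad(void)
{
  const float p = 0.5f;
  const float a[4] = {1, 2, 3, 4};
  const float b[2] = {5, 6};
  const float expected_da = p * (b[0] + b[1]);               // every element of da: 5.5
  const float expected_db = p * (a[0] + a[1] + a[2] + a[3]); // every element of db: 5.0
  (void)expected_da;
  (void)expected_db;
}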
3904
TEST_CASE("broadcasting semantics for mul backward (no input grad) for b")
3905
1
{
3906
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3907
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3908
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3909
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3910
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3911
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3912
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3913
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3914
0
  a->data.f32[0] = 1;
3915
0
  a->data.f32[1] = 2;
3916
0
  a->data.f32[2] = 3;
3917
0
  a->data.f32[3] = 4;
3918
0
  a->data.f32[4] = 5;
3919
0
  a->data.f32[5] = 6;
3920
0
  b->data.f32[0] = 7;
3921
0
  b->data.f32[1] = 8;
3922
0
  b->data.f32[2] = 9;
3923
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3924
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3925
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3926
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3927
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3928
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
3929
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3930
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
3931
3932
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3933
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3934
0
  ccv_nnc_tensor_free(a);
3935
0
  ccv_nnc_tensor_free(b);
3936
0
  ccv_nnc_tensor_free(da);
3937
0
  ccv_nnc_tensor_free(db);
3938
0
  ccv_nnc_tensor_free(dat);
3939
0
  ccv_nnc_tensor_free(dbt);
3940
0
  ccv_nnc_tensor_free(ga);
3941
0
  ccv_nnc_tensor_free(gb);
3942
0
  ccv_nnc_tensor_free(gda);
3943
0
  ccv_nnc_tensor_free(gdb);
3944
0
}
3945
3946
TEST_CASE("broadcasting semantics for mul backward (no input grad) for a")
3947
1
{
3948
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3949
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3950
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3951
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3952
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3953
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3954
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3955
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3956
0
  b->data.f32[0] = 1;
3957
0
  b->data.f32[1] = 2;
3958
0
  b->data.f32[2] = 3;
3959
0
  b->data.f32[3] = 4;
3960
0
  b->data.f32[4] = 5;
3961
0
  b->data.f32[5] = 6;
3962
0
  a->data.f32[0] = 7;
3963
0
  a->data.f32[1] = 8;
3964
0
  a->data.f32[2] = 9;
3965
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3966
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3967
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3968
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3969
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3970
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
3971
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3972
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
3973
3974
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3975
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3976
0
  ccv_nnc_tensor_free(a);
3977
0
  ccv_nnc_tensor_free(b);
3978
0
  ccv_nnc_tensor_free(da);
3979
0
  ccv_nnc_tensor_free(db);
3980
0
  ccv_nnc_tensor_free(dat);
3981
0
  ccv_nnc_tensor_free(dbt);
3982
0
  ccv_nnc_tensor_free(ga);
3983
0
  ccv_nnc_tensor_free(gb);
3984
0
  ccv_nnc_tensor_free(gda);
3985
0
  ccv_nnc_tensor_free(gdb);
3986
0
}
3987
3988
TEST_CASE("mps scalar mul forward")
3989
1
{
3990
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3991
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
3992
3993
0
  ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
3994
0
  ccv_nnc_tensor_t* const gx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
3995
  
3996
0
  dsfmt_t dsfmt;
3997
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3998
0
  int i;
3999
0
  for (i = 0; i < 4; i++)
4000
0
    x->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4001
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x), TENSOR_LIST(gx), 0);
4002
4003
0
  ccv_nnc_tensor_t* const gy = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4004
4005
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(gx), TENSOR_LIST(gy), 0);
4006
4007
0
  ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
4008
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gy), TENSOR_LIST(y), 0);
4009
0
  for (i = 0; i < 4; i++) {
4010
0
    REQUIRE_EQ_WITH_TOLERANCE(x->data.f32[i] * 1.1, y->data.f32[i], 1e-5, "scalar mul forward y has to be 1.1 * x");
4011
0
  }
4012
4013
0
  ccv_nnc_tensor_free(x);
4014
0
  ccv_nnc_tensor_free(gx);
4015
0
  ccv_nnc_tensor_free(gy);
4016
0
  ccv_nnc_tensor_free(y);
4017
0
}
4018
4019
TEST_CASE("mps scalar mul backward")
4020
1
{
4021
1
  GUARD_ELSE_RETURN(
4022
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
4023
4024
0
  ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
4025
4026
0
  dsfmt_t dsfmt;
4027
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4028
0
  int i;
4029
0
  for (i = 0; i < 4; i++)
4030
0
    y->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4031
0
  ccv_nnc_tensor_t* const gy = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4032
0
  ccv_nnc_tensor_t* const gdx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4033
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y), TENSOR_LIST(gy), 0);
4034
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_BACKWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(gy), TENSOR_LIST(gdx), 0);
4035
  
4036
0
  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
4037
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdx), TENSOR_LIST(dx), 0);
4038
4039
0
  for (i = 0; i < 4; i++) {
4040
0
    REQUIRE_EQ_WITH_TOLERANCE(dx->data.f32[i], y->data.f32[i] * 1.1, 1e-5, "scalar mul backward dx has to be 1.1 * dy");
4041
0
  }
4042
4043
0
  ccv_nnc_tensor_free(y);
4044
0
  ccv_nnc_tensor_free(gy);
4045
0
  ccv_nnc_tensor_free(gdx);
4046
0
  ccv_nnc_tensor_free(dx);
4047
0
}
4048
4049
TEST_CASE("mps scalar mul backward, no input")
4050
1
{
4051
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
4052
4053
0
  ccv_nnc_tensor_t* const gdx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4054
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_BACKWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(0), TENSOR_LIST(gdx), 0);
4055
0
  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
4056
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdx), TENSOR_LIST(dx), 0);
4057
4058
0
  for (int i = 0; i < 4; i++)
4059
0
    REQUIRE_EQ_WITH_TOLERANCE(dx->data.f32[i], 1.1, 1e-5, "scalar mul backward without input should be 1.1");
4060
0
  ccv_nnc_tensor_free(gdx);
4061
0
  ccv_nnc_tensor_free(dx);
4062
0
}
4063
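// Reference CPU-only versions of the identities the three scalar-mul tests above check,
// assuming CMD_SCALAR_MUL_FORWARD(p) computes y = p * x and its backward simply scales
// the incoming gradient (or an implicit all-ones gradient when none is supplied) by the
// same p. Not part of the covered source; helper names are illustrative.
static void scalar_mul_forward_ref(const float* const x, float* const y, const int n, const float p)
{
  int i;
  for (i = 0; i < n; i++)
    y[i] = p * x[i];
}

static void scalar_mul_backward_ref(const float* const dy, float* const dx, const int n, const float p)
{
  int i;
  for (i = 0; i < n; i++)
    dx[i] = p * (dy ? dy[i] : 1.f); // a missing dy behaves like an all-ones gradient
}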
4064
TEST_CASE("mps forward convolution transpose")
4065
1
{
4066
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
4067
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4068
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4069
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
4070
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
4071
0
  assert(cmd.backend >= 0);
4072
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
4073
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4074
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
4075
  // configure the inlets.
4076
0
  dsfmt_t dsfmt;
4077
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4078
0
  int i;
4079
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
4080
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
4081
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
4082
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4083
0
  for (i = 0; i < INPUT_DIM; i++)
4084
0
    bias->data.f32[i] = (float)i / INPUT_DIM;
4085
  // Copy generated matrix values over to GPU.
4086
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4087
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4088
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4089
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, INPUT_DIM), 0);
4090
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
4091
0
  move.backend = CCV_NNC_BACKEND_MPS;
4092
0
  assert(move.backend >= 0);
4093
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
4094
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
4095
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4096
4097
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
4098
0
  transform.backend = CCV_NNC_BACKEND_MPS;
4099
0
  assert(transform.backend >= 0);
4100
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
4101
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
4102
0
  ccv_nnc_stream_context_wait(stream_context);
4103
0
  ccv_nnc_tensor_free(gw);
4104
4105
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
4106
0
  assert(cmd.backend >= 0);
4107
0
  cmd.algorithm = -1;
4108
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
4109
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
4110
0
  ccv_nnc_stream_context_wait(stream_context);
4111
0
  ccv_nnc_stream_context_free(stream_context);
4112
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4113
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
4114
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
4115
0
  ccv_nnc_tensor_free(c);
4116
0
  ccv_nnc_tensor_free(gc);
4117
0
  ccv_nnc_tensor_free(bias);
4118
0
  ccv_nnc_tensor_free(w);
4119
0
  ccv_nnc_tensor_free(b);
4120
0
  ccv_nnc_tensor_free(a);
4121
0
  ccv_nnc_tensor_free(gbias);
4122
0
  ccv_nnc_tensor_free(gwo);
4123
0
  ccv_nnc_tensor_free(ga);
4124
0
}
4125
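// The convolution-transpose test above and the half-precision test at the end of this
// file both rewrite the NHWC weight tensor into an NCHW copy before running the MPS
// kernel; the NCHW-format test that follows allocates its weights in NCHW directly and
// skips this step. A minimal sketch of just that conversion, using only calls that
// already appear in this file (the autotune/exec calls then take gwo in place of gw):
static void sketch_weight_layout_transform(void)
{
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context); // make sure the converted weights are ready
  ccv_nnc_tensor_free(gw); // the NHWC copy is no longer needed once gwo exists
  // gwo would now be passed to ccv_nnc_cmd_autotune / ccv_nnc_cmd_exec as in the tests
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_stream_context_free(stream_context);
}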
4126
TEST_CASE("mps forward convolution transpose in nchw format")
4127
1
{
4128
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
4129
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
4130
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
4131
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
4132
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
4133
0
  assert(cmd.backend >= 0);
4134
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
4135
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4136
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, INPUT_DIM), 0);
4137
  // configure the inlets.
4138
0
  dsfmt_t dsfmt;
4139
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4140
0
  int i;
4141
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
4142
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
4143
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
4144
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4145
0
  for (i = 0; i < INPUT_DIM; i++)
4146
0
    bias->data.f32[i] = (float)i / INPUT_DIM;
4147
  // Copy generated matrix values over to GPU.
4148
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
4149
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4150
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, INPUT_DIM), 0);
4151
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
4152
0
  move.backend = CCV_NNC_BACKEND_MPS;
4153
0
  assert(move.backend >= 0);
4154
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
4155
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
4156
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
4157
4158
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
4159
0
  transform.backend = CCV_NNC_BACKEND_MPS;
4160
0
  assert(transform.backend >= 0);
4161
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
4162
0
  assert(cmd.backend >= 0);
4163
0
  cmd.algorithm = -1;
4164
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
4165
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
4166
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
4167
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
4168
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-5, "output from mps should match from CPU");
4169
0
  ccv_nnc_tensor_free(c);
4170
0
  ccv_nnc_tensor_free(gc);
4171
0
  ccv_nnc_tensor_free(bias);
4172
0
  ccv_nnc_tensor_free(w);
4173
0
  ccv_nnc_tensor_free(b);
4174
0
  ccv_nnc_tensor_free(a);
4175
0
  ccv_nnc_tensor_free(gbias);
4176
0
  ccv_nnc_tensor_free(gw);
4177
0
  ccv_nnc_tensor_free(ga);
4178
0
}
4179
4180
TEST_CASE("mps forward convolution transpose in half precision")
4181
1
{
4182
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
4183
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4184
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4185
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
4186
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
4187
0
  assert(cmd.backend >= 0);
4188
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
4189
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4190
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
4191
  // configure the inlets.
4192
0
  dsfmt_t dsfmt;
4193
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4194
0
  int i;
4195
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
4196
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
4197
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
4198
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4199
0
  for (i = 0; i < INPUT_DIM; i++)
4200
0
    bias->data.f32[i] = (float)i / INPUT_DIM;
4201
0
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4202
0
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4203
0
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, INPUT_DIM), 0);
4204
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
4205
  // Copy generated matrix values over to GPU.
4206
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4207
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4208
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4209
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, INPUT_DIM), 0);
4210
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
4211
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
4212
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4213
4214
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
4215
0
  transform.backend = CCV_NNC_BACKEND_MPS;
4216
0
  assert(transform.backend >= 0);
4217
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
4218
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
4219
0
  ccv_nnc_stream_context_wait(stream_context);
4220
0
  ccv_nnc_tensor_free(gw);
4221
4222
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
4223
0
  assert(cmd.backend >= 0);
4224
0
  cmd.algorithm = -1;
4225
0
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
4226
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
4227
0
  ccv_nnc_stream_context_wait(stream_context);
4228
0
  ccv_nnc_stream_context_free(stream_context);
4229
0
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4230
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
4231
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4232
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
4233
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 5e-3, "output from mps should match from CPU");
4234
0
  ccv_nnc_tensor_free(c);
4235
0
  ccv_nnc_tensor_free(gc);
4236
0
  ccv_nnc_tensor_free(bias);
4237
0
  ccv_nnc_tensor_free(w);
4238
0
  ccv_nnc_tensor_free(b);
4239
0
  ccv_nnc_tensor_free(a);
4240
0
  ccv_nnc_tensor_free(c1);
4241
0
  ccv_nnc_tensor_free(bias1);
4242
0
  ccv_nnc_tensor_free(w1);
4243
0
  ccv_nnc_tensor_free(a1);
4244
0
  ccv_nnc_tensor_free(gbias);
4245
0
  ccv_nnc_tensor_free(gwo);
4246
0
  ccv_nnc_tensor_free(ga);
4247
0
}
4248
4249
#include "case_main.h"