Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsdnn.tests.c
Line | Count | Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
#include <nnc/ccv_nnc_internal.h>
9
10
TEST_SETUP()
11
{
12
  ccv_nnc_init();
13
}
14
15
0
#define INPUT_DIM (3)
16
0
#define OUTPUT_DIM (96)
17
18
0
#define INPUT_SIZE (224)
19
0
#define OUTPUT_SIZE (112)
20
21
0
#define KERNEL_SIZE (7)
22
23
#define BATCH_SIZE (16)
24
25
0
#define LN_DIM (10)
26
0
#define GN_C_DIM (16)
27
#define GN_RC_DIM (4)
28
29
TEST_CASE("mps forward convolution")
30
1
{
31
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
32
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
33
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
34
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
35
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
36
0
  assert(cmd.backend >= 0);
37
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
38
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
39
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
40
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
41
  // configure the inlets.
42
0
  dsfmt_t dsfmt;
43
0
  dsfmt_init_gen_rand(&dsfmt, 0);
44
0
  int i;
45
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
46
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
47
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
48
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
49
0
  for (i = 0; i < OUTPUT_DIM; i++)
50
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
51
  // Copy generated matrix values over to GPU.
52
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
53
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
54
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
55
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
56
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
57
0
  move.backend = CCV_NNC_BACKEND_MPS;
58
0
  assert(move.backend >= 0);
59
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
60
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
61
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
62
63
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
64
0
  transform.backend = CCV_NNC_BACKEND_MPS;
65
0
  assert(transform.backend >= 0);
66
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
67
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
68
0
  ccv_nnc_stream_context_wait(stream_context);
69
0
  ccv_nnc_tensor_free(gw);
70
71
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
72
0
  assert(cmd.backend >= 0);
73
0
  cmd.algorithm = -1;
74
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
75
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
76
0
  ccv_nnc_stream_context_wait(stream_context);
77
0
  ccv_nnc_stream_context_free(stream_context);
78
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
79
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
80
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
81
0
  ccv_nnc_tensor_free(c);
82
0
  ccv_nnc_tensor_free(gc);
83
0
  ccv_nnc_tensor_free(bias);
84
0
  ccv_nnc_tensor_free(w);
85
0
  ccv_nnc_tensor_free(b);
86
0
  ccv_nnc_tensor_free(a);
87
0
  ccv_nnc_tensor_free(gbias);
88
0
  ccv_nnc_tensor_free(gwo);
89
0
  ccv_nnc_tensor_free(ga);
90
0
}
91
92
TEST_CASE("mps forward convolution in nchw format")
93
1
{
94
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
95
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
96
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
97
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
98
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
99
0
  assert(cmd.backend >= 0);
100
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
101
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
102
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
103
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
104
  // configure the inlets.
105
0
  dsfmt_t dsfmt;
106
0
  dsfmt_init_gen_rand(&dsfmt, 0);
107
0
  int i;
108
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
109
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
110
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
111
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
112
0
  for (i = 0; i < OUTPUT_DIM; i++)
113
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
114
  // Copy generated matrix values over to GPU.
115
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
116
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
117
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
118
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
119
0
  move.backend = CCV_NNC_BACKEND_MPS;
120
0
  assert(move.backend >= 0);
121
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
122
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
123
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
124
125
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
126
0
  transform.backend = CCV_NNC_BACKEND_MPS;
127
0
  assert(transform.backend >= 0);
128
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
129
0
  assert(cmd.backend >= 0);
130
0
  cmd.algorithm = -1;
131
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
132
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
133
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
134
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
135
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
136
0
  ccv_nnc_tensor_free(c);
137
0
  ccv_nnc_tensor_free(gc);
138
0
  ccv_nnc_tensor_free(bias);
139
0
  ccv_nnc_tensor_free(w);
140
0
  ccv_nnc_tensor_free(b);
141
0
  ccv_nnc_tensor_free(a);
142
0
  ccv_nnc_tensor_free(gbias);
143
0
  ccv_nnc_tensor_free(gw);
144
0
  ccv_nnc_tensor_free(ga);
145
0
}
146
147
TEST_CASE("mps forward convolution with 1x1 kernel")
148
1
{
149
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
150
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, INPUT_DIM), 0);
151
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
152
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
153
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
154
0
  assert(cmd.backend >= 0);
155
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
156
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
157
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 1, 1, INPUT_DIM), 0);
158
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
159
  // configure the inlets.
160
0
  dsfmt_t dsfmt;
161
0
  dsfmt_init_gen_rand(&dsfmt, 0);
162
0
  int i;
163
0
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
164
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
165
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
166
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
167
0
  for (i = 0; i < OUTPUT_DIM; i++)
168
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
169
  // Copy generated matrix values over to GPU.
170
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, INPUT_DIM), 0);
171
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 1, 1, INPUT_DIM), 0);
172
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
173
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
174
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
175
0
  move.backend = CCV_NNC_BACKEND_MPS;
176
0
  assert(move.backend >= 0);
177
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
178
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
179
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
180
181
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
182
0
  transform.backend = CCV_NNC_BACKEND_MPS;
183
0
  assert(transform.backend >= 0);
184
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
185
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
186
0
  ccv_nnc_stream_context_wait(stream_context);
187
0
  ccv_nnc_tensor_free(gw);
188
189
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
190
0
  assert(cmd.backend >= 0);
191
0
  cmd.algorithm = -1;
192
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
193
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
194
0
  ccv_nnc_stream_context_wait(stream_context);
195
0
  ccv_nnc_stream_context_free(stream_context);
196
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
197
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
198
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
199
0
  ccv_nnc_tensor_free(c);
200
0
  ccv_nnc_tensor_free(gc);
201
0
  ccv_nnc_tensor_free(bias);
202
0
  ccv_nnc_tensor_free(w);
203
0
  ccv_nnc_tensor_free(b);
204
0
  ccv_nnc_tensor_free(a);
205
0
  ccv_nnc_tensor_free(gbias);
206
0
  ccv_nnc_tensor_free(gwo);
207
0
  ccv_nnc_tensor_free(ga);
208
0
}
209
210
TEST_CASE("mps forward convolution in nchw format with 1x1 kernel")
211
1
{
212
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
213
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
214
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
215
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
216
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
217
0
  assert(cmd.backend >= 0);
218
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
219
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
220
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
221
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
222
  // configure the inlets.
223
0
  dsfmt_t dsfmt;
224
0
  dsfmt_init_gen_rand(&dsfmt, 0);
225
0
  int i;
226
0
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
227
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
228
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
229
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
230
0
  for (i = 0; i < OUTPUT_DIM; i++)
231
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
232
  // Copy generated matrix values over to GPU.
233
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
234
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
235
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
236
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
237
0
  move.backend = CCV_NNC_BACKEND_MPS;
238
0
  assert(move.backend >= 0);
239
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
240
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
241
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
242
243
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
244
0
  transform.backend = CCV_NNC_BACKEND_MPS;
245
0
  assert(transform.backend >= 0);
246
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
247
0
  assert(cmd.backend >= 0);
248
0
  cmd.algorithm = -1;
249
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
250
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
251
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
252
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
253
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
254
0
  ccv_nnc_tensor_free(c);
255
0
  ccv_nnc_tensor_free(gc);
256
0
  ccv_nnc_tensor_free(bias);
257
0
  ccv_nnc_tensor_free(w);
258
0
  ccv_nnc_tensor_free(b);
259
0
  ccv_nnc_tensor_free(a);
260
0
  ccv_nnc_tensor_free(gbias);
261
0
  ccv_nnc_tensor_free(gw);
262
0
  ccv_nnc_tensor_free(ga);
263
0
}
264
265
TEST_CASE("mps forward convolution in half precision")
266
1
{
267
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
268
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
269
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
270
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
271
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
272
0
  assert(cmd.backend >= 0);
273
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
274
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
275
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
276
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
277
  // configure the inlets.
278
0
  dsfmt_t dsfmt;
279
0
  dsfmt_init_gen_rand(&dsfmt, 0);
280
0
  int i;
281
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
282
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
283
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
284
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
285
0
  for (i = 0; i < OUTPUT_DIM; i++)
286
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
287
0
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
288
0
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
289
0
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
290
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
291
  // Copy generated matrix values over to GPU.
292
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
293
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
294
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
295
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
296
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
297
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
298
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
299
300
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
301
0
  transform.backend = CCV_NNC_BACKEND_MPS;
302
0
  assert(transform.backend >= 0);
303
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
304
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
305
0
  ccv_nnc_stream_context_wait(stream_context);
306
0
  ccv_nnc_tensor_free(gw);
307
308
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
309
0
  assert(cmd.backend >= 0);
310
0
  cmd.algorithm = -1;
311
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
312
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
313
0
  ccv_nnc_stream_context_wait(stream_context);
314
0
  ccv_nnc_stream_context_free(stream_context);
315
0
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
316
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
317
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
318
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
319
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from mps should match from CPU");
320
0
  ccv_nnc_tensor_free(c);
321
0
  ccv_nnc_tensor_free(gc);
322
0
  ccv_nnc_tensor_free(bias);
323
0
  ccv_nnc_tensor_free(w);
324
0
  ccv_nnc_tensor_free(b);
325
0
  ccv_nnc_tensor_free(a);
326
0
  ccv_nnc_tensor_free(c1);
327
0
  ccv_nnc_tensor_free(bias1);
328
0
  ccv_nnc_tensor_free(w1);
329
0
  ccv_nnc_tensor_free(a1);
330
0
  ccv_nnc_tensor_free(gbias);
331
0
  ccv_nnc_tensor_free(gwo);
332
0
  ccv_nnc_tensor_free(ga);
333
0
}
334
335
TEST_CASE("mps forward convolution with dilation 2, 3")
336
1
{
337
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
338
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
339
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
340
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
341
0
  cmd.info.convolution.dilation[0] = 2;
342
0
  cmd.info.convolution.dilation[1] = 3;
343
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
344
0
  assert(cmd.backend >= 0);
345
0
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
346
0
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
347
0
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
348
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, b->info);
349
0
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, b->info) == 0);
350
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
351
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
352
  // configure the inlets.
353
0
  dsfmt_t dsfmt;
354
0
  dsfmt_init_gen_rand(&dsfmt, 0);
355
0
  int i;
356
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
357
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
358
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
359
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
360
0
  for (i = 0; i < OUTPUT_DIM; i++)
361
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
362
  // Copy generated matrix values over to GPU.
363
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
364
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
365
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
366
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
367
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
368
0
  move.backend = CCV_NNC_BACKEND_MPS;
369
0
  assert(move.backend >= 0);
370
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
371
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
372
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
373
374
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
375
0
  transform.backend = CCV_NNC_BACKEND_MPS;
376
0
  assert(transform.backend >= 0);
377
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
378
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
379
0
  ccv_nnc_stream_context_wait(stream_context);
380
0
  ccv_nnc_tensor_free(gw);
381
382
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
383
0
  assert(cmd.backend >= 0);
384
0
  cmd.algorithm = -1;
385
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
386
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
387
0
  ccv_nnc_stream_context_wait(stream_context);
388
0
  ccv_nnc_stream_context_free(stream_context);
389
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
390
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
391
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
392
0
  ccv_nnc_tensor_free(c);
393
0
  ccv_nnc_tensor_free(gc);
394
0
  ccv_nnc_tensor_free(bias);
395
0
  ccv_nnc_tensor_free(w);
396
0
  ccv_nnc_tensor_free(b);
397
0
  ccv_nnc_tensor_free(a);
398
0
  ccv_nnc_tensor_free(gbias);
399
0
  ccv_nnc_tensor_free(gwo);
400
0
  ccv_nnc_tensor_free(ga);
401
0
}
402
403
TEST_CASE("mps forward convolution 3d")
404
1
{
405
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
406
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
407
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
408
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
409
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
410
0
  hint.stride.dim[0] = 2;
411
0
  hint.border.begin[0] = 1;
412
0
  hint.border.end[0] = 1;
413
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
414
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
415
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
416
  // configure the inlets.
417
0
  dsfmt_t dsfmt;
418
0
  dsfmt_init_gen_rand(&dsfmt, 0);
419
0
  int i;
420
0
  for (i = 0; i < INPUT_DIM * 3 * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
421
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
422
0
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
423
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
424
0
  for (i = 0; i < OUTPUT_DIM; i++)
425
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
426
  // Copy generated matrix values over to GPU.
427
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
428
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
429
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
430
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
431
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
432
0
  move.backend = CCV_NNC_BACKEND_MPS;
433
0
  assert(move.backend >= 0);
434
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
435
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
436
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
437
0
  transform.backend = CCV_NNC_BACKEND_MPS;
438
0
  assert(transform.backend >= 0);
439
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
440
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
441
0
  ccv_nnc_stream_context_wait(stream_context);
442
0
  ccv_nnc_tensor_free(gw);
443
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
444
0
  assert(cmd.backend >= 0);
445
0
  cmd.algorithm = -1;
446
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
447
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
448
0
  ccv_nnc_stream_context_wait(stream_context);
449
0
  ccv_nnc_stream_context_free(stream_context);
450
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
451
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
452
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
453
0
  assert(cmd.backend >= 0);
454
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
455
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from mps should match from CPU");
456
0
  ccv_nnc_tensor_free(c);
457
0
  ccv_nnc_tensor_free(gc);
458
0
  ccv_nnc_tensor_free(bias);
459
0
  ccv_nnc_tensor_free(w);
460
0
  ccv_nnc_tensor_free(b);
461
0
  ccv_nnc_tensor_free(a);
462
0
  ccv_nnc_tensor_free(gbias);
463
0
  ccv_nnc_tensor_free(gwo);
464
0
  ccv_nnc_tensor_free(ga);
465
0
}
466
467
TEST_CASE("mps forward convolution 3d in nchw format")
468
1
{
469
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
470
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
471
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
472
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
473
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
474
0
  hint.stride.dim[0] = 2;
475
0
  hint.border.begin[0] = 1;
476
0
  hint.border.end[0] = 1;
477
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
478
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
479
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM), 0);
480
  // configure the inlets.
481
0
  dsfmt_t dsfmt;
482
0
  dsfmt_init_gen_rand(&dsfmt, 0);
483
0
  int i;
484
0
  for (i = 0; i < 3 * INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
485
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
486
0
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
487
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
488
0
  for (i = 0; i < OUTPUT_DIM; i++)
489
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
490
  // Copy generated matrix values over to GPU.
491
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
492
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
493
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
494
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
495
0
  move.backend = CCV_NNC_BACKEND_MPS;
496
0
  assert(move.backend >= 0);
497
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
498
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
499
500
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
501
0
  transform.backend = CCV_NNC_BACKEND_MPS;
502
0
  assert(transform.backend >= 0);
503
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
504
0
  assert(cmd.backend >= 0);
505
0
  cmd.algorithm = -1;
506
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
507
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
508
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
509
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
510
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
511
0
  assert(cmd.backend >= 0);
512
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
513
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from mps should match from CPU");
514
0
  ccv_nnc_tensor_free(c);
515
0
  ccv_nnc_tensor_free(gc);
516
0
  ccv_nnc_tensor_free(bias);
517
0
  ccv_nnc_tensor_free(w);
518
0
  ccv_nnc_tensor_free(b);
519
0
  ccv_nnc_tensor_free(a);
520
0
  ccv_nnc_tensor_free(gbias);
521
0
  ccv_nnc_tensor_free(gw);
522
0
  ccv_nnc_tensor_free(ga);
523
0
}
524
525
TEST_CASE("compare softmax with mps")
526
1
{
527
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS));
528
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
529
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
530
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
531
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
532
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
533
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
534
0
  ccv_nnc_graph_t* graph = 0;
535
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
536
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
537
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
538
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
539
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
540
0
  dsfmt_t dsfmt;
541
0
  dsfmt_init_gen_rand(&dsfmt, 0);
542
0
  int i;
543
0
  for (i = 0; i < 20 * 10; i++)
544
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
545
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
546
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
547
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
548
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
549
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
550
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
551
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
552
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
553
0
  REQUIRE_TENSOR_EQ(ty, y_tensor, "softmax from mps should match from CPU");
554
0
  ccv_nnc_tensor_free(x_tensor);
555
0
  ccv_nnc_tensor_free(y_tensor);
556
0
  ccv_nnc_tensor_free(ty);
557
0
  ccv_nnc_graph_free(graph);
558
0
  ccv_nnc_tensor_arena_free(tensor_arena);
559
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
560
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
561
0
}
562
563
TEST_CASE("compare softmax with mps in half precision")
564
1
{
565
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS));
566
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
567
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
568
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
569
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
570
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
571
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
572
0
  ccv_nnc_graph_t* graph = 0;
573
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
574
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
575
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
576
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
577
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
578
0
  dsfmt_t dsfmt;
579
0
  dsfmt_init_gen_rand(&dsfmt, 0);
580
0
  int i;
581
0
  for (i = 0; i < 20 * 10; i++)
582
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
583
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
584
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
585
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
586
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
587
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
588
0
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
589
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
590
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
591
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
592
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
593
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
594
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
595
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "softmax from mps should match from CPU");
596
0
  ccv_nnc_tensor_free(x_tensor);
597
0
  ccv_nnc_tensor_free(x16_tensor);
598
0
  ccv_nnc_tensor_free(y16_tensor);
599
0
  ccv_nnc_tensor_free(y_tensor);
600
0
  ccv_nnc_tensor_free(ty);
601
0
  ccv_nnc_graph_free(graph);
602
0
  ccv_nnc_tensor_arena_free(tensor_arena);
603
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
604
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
605
0
}
606
607
TEST_CASE("compare softmax gradient with mps")
608
1
{
609
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS) &&
610
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_BACKWARD, CCV_NNC_BACKEND_MPS));
611
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
612
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
613
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
614
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "softmax");
615
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
616
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
617
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
618
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
619
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
620
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
621
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
622
0
  dsfmt_t dsfmt;
623
0
  dsfmt_init_gen_rand(&dsfmt, 0);
624
0
  int i;
625
0
  for (i = 0; i < 10 * 100; i++)
626
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
627
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
628
0
  for (i = 0; i < 10 * 100; i++)
629
0
    dy_tensor->data.f32[i] = 0;
630
0
  for (i = 0; i < 10; i++)
631
0
    dy_tensor->data.f32[i * 100 + i] = 1;
632
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
633
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
634
0
  ccv_nnc_graph_t* graph = 0;
635
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
636
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
637
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
638
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
639
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
640
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
641
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
642
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
643
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
644
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
645
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
646
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
647
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
648
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
649
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
650
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
651
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
652
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
653
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
654
0
  ccv_nnc_tensor_free(x_tensor);
655
0
  ccv_nnc_tensor_free(y_tensor);
656
0
  ccv_nnc_tensor_free(dx_tensor);
657
0
  ccv_nnc_tensor_free(dy_tensor);
658
0
  ccv_nnc_tensor_free(ty_tensor);
659
0
  ccv_nnc_tensor_free(tdx_tensor);
660
0
  ccv_nnc_tensor_free(dyt);
661
0
  ccv_nnc_graph_free(graph);
662
0
  ccv_nnc_tensor_arena_free(tensor_arena);
663
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
664
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
665
0
}
666
667
TEST_CASE("compare sigmoid with mps")
668
1
{
669
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
670
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
671
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
672
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
673
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
674
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
675
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
676
0
  ccv_nnc_graph_t* graph = 0;
677
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
678
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
679
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
680
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
681
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
682
0
  dsfmt_t dsfmt;
683
0
  dsfmt_init_gen_rand(&dsfmt, 0);
684
0
  int i;
685
0
  for (i = 0; i < 20 * 10; i++)
686
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
687
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
688
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
689
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
690
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
691
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
692
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
693
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
694
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
695
0
  REQUIRE_TENSOR_EQ(ty, y_tensor, "sigmoid from mps should match from CPU");
696
0
  ccv_nnc_tensor_free(x_tensor);
697
0
  ccv_nnc_tensor_free(y_tensor);
698
0
  ccv_nnc_tensor_free(ty);
699
0
  ccv_nnc_graph_free(graph);
700
0
  ccv_nnc_tensor_arena_free(tensor_arena);
701
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
702
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
703
0
}
704
705
TEST_CASE("compare sigmoid with mps in half precision")
706
1
{
707
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
708
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
709
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
710
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
711
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
712
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
713
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
714
0
  ccv_nnc_graph_t* graph = 0;
715
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
716
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
717
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
718
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
719
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
720
0
  dsfmt_t dsfmt;
721
0
  dsfmt_init_gen_rand(&dsfmt, 0);
722
0
  int i;
723
0
  for (i = 0; i < 20 * 10; i++)
724
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
725
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
726
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
727
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
728
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
729
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
730
0
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
731
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
732
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
733
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
734
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
735
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
736
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
737
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "sigmoid from mps should match from CPU");
738
0
  ccv_nnc_tensor_free(x_tensor);
739
0
  ccv_nnc_tensor_free(x16_tensor);
740
0
  ccv_nnc_tensor_free(y16_tensor);
741
0
  ccv_nnc_tensor_free(y_tensor);
742
0
  ccv_nnc_tensor_free(ty);
743
0
  ccv_nnc_graph_free(graph);
744
0
  ccv_nnc_tensor_arena_free(tensor_arena);
745
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
746
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
747
0
}
748
749
750
TEST_CASE("compare sigmoid gradient with mps")
751
1
{
752
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS) &&
753
1
    ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_BACKWARD, CCV_NNC_BACKEND_MPS));
754
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
755
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
756
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
757
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "sigmoid");
758
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
759
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
760
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
761
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
762
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
763
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
764
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
765
0
  dsfmt_t dsfmt;
766
0
  dsfmt_init_gen_rand(&dsfmt, 0);
767
0
  int i;
768
0
  for (i = 0; i < 10 * 100; i++)
769
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
770
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
771
0
  for (i = 0; i < 10 * 100; i++)
772
0
    dy_tensor->data.f32[i] = 0;
773
0
  for (i = 0; i < 10; i++)
774
0
    dy_tensor->data.f32[i * 100 + i] = 1;
775
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
776
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
777
0
  ccv_nnc_graph_t* graph = 0;
778
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
779
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
780
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
781
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
782
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
783
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
784
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
785
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
786
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
787
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
788
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
789
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
790
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
791
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
792
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
793
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
794
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
795
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
796
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
797
0
  ccv_nnc_tensor_free(x_tensor);
798
0
  ccv_nnc_tensor_free(y_tensor);
799
0
  ccv_nnc_tensor_free(dx_tensor);
800
0
  ccv_nnc_tensor_free(dy_tensor);
801
0
  ccv_nnc_tensor_free(ty_tensor);
802
0
  ccv_nnc_tensor_free(tdx_tensor);
803
0
  ccv_nnc_tensor_free(dyt);
804
0
  ccv_nnc_graph_free(graph);
805
0
  ccv_nnc_tensor_arena_free(tensor_arena);
806
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
807
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
808
0
}
809
810
TEST_CASE("compare relu with mps")
811
1
{
812
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_MPS));
813
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
814
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
815
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "y");
816
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
817
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
818
0
  ccv_nnc_graph_t* graph = 0;
819
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
820
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
821
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
822
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
823
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
824
0
  dsfmt_t dsfmt;
825
0
  dsfmt_init_gen_rand(&dsfmt, 0);
826
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
827
0
  int i;
828
0
  for (i = 0; i < 7 * 7 * 10; i++)
829
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
830
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
831
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
832
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
833
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
834
0
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
835
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
836
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
837
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
838
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
839
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
840
0
  ccv_nnc_tensor_arena_free(tensor_arena);
841
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
842
0
  ccv_nnc_graph_free(graph);
843
0
  ccv_nnc_tensor_free(x_tensor);
844
0
  ccv_nnc_tensor_free(y_tensor);
845
0
  ccv_nnc_tensor_free(cpu_y);
846
0
}
847
848
TEST_CASE("compare relu with mps in half precision")
849
1
{
850
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_MPS));
851
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
852
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
853
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "y");
854
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
855
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
856
0
  ccv_nnc_graph_t* graph = 0;
857
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
858
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
859
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
860
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
861
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
862
0
  dsfmt_t dsfmt;
863
0
  dsfmt_init_gen_rand(&dsfmt, 0);
864
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
865
0
  int i;
866
0
  for (i = 0; i < 7 * 7 * 10; i++)
867
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
868
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
869
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
870
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
871
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
872
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
873
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
874
0
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
875
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
876
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
877
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
878
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
879
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
880
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 7 * 7 * 10, 1e-3, "mps result should equal the cpu result");
881
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
882
0
  ccv_nnc_tensor_arena_free(tensor_arena);
883
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
884
0
  ccv_nnc_graph_free(graph);
885
0
  ccv_nnc_tensor_free(x_tensor);
886
0
  ccv_nnc_tensor_free(x16_tensor);
887
0
  ccv_nnc_tensor_free(y_tensor);
888
0
  ccv_nnc_tensor_free(cpu_y);
889
0
  ccv_nnc_tensor_free(cpu_y16);
890
0
}
891
892
TEST_CASE("compare layer norm with mps")
893
1
{
894
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
895
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
896
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
897
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
898
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
899
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
900
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
901
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
902
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "bias");
903
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
904
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
905
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
906
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
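  // Judging by the without-scale/bias variant later in the file, the arguments read as
  // (epsilon, elementwise affine flag, normalized axes...): eps = 1e-6, affine enabled,
  // normalizing over axes 1, 2, 3, hence the (2, 1, 1, 1) saved_mean / saved_inv_std shapes.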
907
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
908
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
909
0
  ccv_nnc_graph_t* graph = 0;
910
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
911
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
912
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
913
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
914
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
915
0
  dsfmt_t dsfmt;
916
0
  float xdata[2 * 2 * 2 * 10];
917
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
918
0
  int i;
919
0
  dsfmt_init_gen_rand(&dsfmt, 1);
920
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
921
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
922
0
  float scaledata[1 * 2 * 2 * 10];
923
0
  float biasdata[1 * 2 * 2 * 10];
924
0
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
925
0
  {
926
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
927
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
928
0
  }
929
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
930
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
931
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
932
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
933
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
934
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
935
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
936
0
  ccv_nnc_graph_free(graph);
937
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
938
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
939
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
940
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
941
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "bias");
942
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
943
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
944
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
945
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
946
0
  ccv_nnc_graph_t* cpu_graph = 0;
947
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
948
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
949
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
950
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
951
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
952
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
953
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
954
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
955
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * 10);
956
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
957
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
958
  // Note that MPS and my other implementations treat epsilon differently.
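  // Most likely one side computes (x - mean) / sqrt(var + eps) and the other
  // (x - mean) / (sqrt(var) + eps); with eps = 1e-6 that discrepancy is far smaller
  // than the 1e-4 tolerance used below.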
959
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "layer norm result from mps should match the one from the reference implementation");
960
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
961
0
  ccv_nnc_tensor_arena_free(tensor_arena);
962
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
963
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
964
0
  ccv_nnc_graph_free(cpu_graph);
965
0
}
966
967
TEST_CASE("compare layer norm with mps without scale / bias")
968
1
{
969
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
970
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
971
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
972
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
973
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
974
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
975
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
976
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
977
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
978
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
979
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
980
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
981
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
982
0
  ccv_nnc_graph_t* graph = 0;
983
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
984
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
985
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
986
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
987
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
988
0
  dsfmt_t dsfmt;
989
0
  float xdata[2 * 2 * 2 * 10];
990
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
991
0
  int i;
992
0
  dsfmt_init_gen_rand(&dsfmt, 1);
993
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
994
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
995
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
996
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
997
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
998
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
999
0
  ccv_nnc_graph_free(graph);
1000
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1001
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1002
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1003
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
1004
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1005
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
1006
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1007
0
  ccv_nnc_graph_t* cpu_graph = 0;
1008
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1009
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1010
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1011
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1012
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
1013
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1014
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1015
  // Note that MPS and my other implementations treat epsilon differently.
1016
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "layer norm result from mps should match the one from the reference implementation");
1017
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1018
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1019
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1020
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1021
0
  ccv_nnc_graph_free(cpu_graph);
1022
0
}
1023
1024
TEST_CASE("compare group norm with mps")
1025
1
{
1026
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1027
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1028
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1029
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
1030
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1031
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1032
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
1033
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "scale");
1034
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "bias");
1035
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1036
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1037
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1038
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
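  // The arguments appear to be (group axis, number of groups, epsilon, elementwise affine):
  // the 16 channels at axis 1 are normalized in 4 groups, matching the 4 in the
  // saved_mean / saved_inv_std shapes declared above.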
1039
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1040
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1041
0
  ccv_nnc_graph_t* graph = 0;
1042
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1043
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1044
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1045
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1046
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1047
0
  dsfmt_t dsfmt;
1048
0
  float xdata[2 * 16 * 2 * 10];
1049
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1050
0
  int i;
1051
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1052
0
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1053
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1054
0
  float scaledata[1 * 16 * 2 * 10];
1055
0
  float biasdata[1 * 16 * 2 * 10];
1056
0
  for (i = 0; i < 1 * 16 * 2 * 10; i++)
1057
0
  {
1058
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1059
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
1060
0
  }
1061
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1062
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0);
1063
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
1064
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1065
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1066
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1067
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1068
0
  ccv_nnc_graph_free(graph);
1069
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1070
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1071
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1072
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "scale");
1073
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "bias");
1074
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1075
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1076
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1077
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1078
0
  ccv_nnc_graph_t* cpu_graph = 0;
1079
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1080
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1081
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1082
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1083
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
1084
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1085
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16 * 2 * 10);
1086
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
1087
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16 * 2 * 10);
1088
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1089
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1090
  // Note that MPS and my other implementations treat epsilon differently.
1091
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-3, "group norm result from mps should match the one from the reference implementation");
1092
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1093
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1094
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1095
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1096
0
  ccv_nnc_graph_free(cpu_graph);
1097
0
}
1098
1099
TEST_CASE("compare group norm with mps without scale / bias")
1100
1
{
1101
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1102
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1103
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1104
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x");
1105
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x");
1106
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y");
1107
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y");
1108
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean");
1109
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std");
1110
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1111
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
1112
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1113
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1114
0
  ccv_nnc_graph_t* graph = 0;
1115
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1116
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1117
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1118
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1119
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1120
0
  dsfmt_t dsfmt;
1121
0
  float xdata[2 * 16 * 2 * 10];
1122
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1123
0
  int i;
1124
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1125
0
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
1126
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1127
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1128
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1129
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1130
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1131
0
  ccv_nnc_graph_free(graph);
1132
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1133
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
1134
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
1135
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
1136
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
1137
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
1138
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1139
0
  ccv_nnc_graph_t* cpu_graph = 0;
1140
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1141
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1142
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1143
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1144
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10);
1145
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1146
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1147
  // Note that MPS and my other implementations treat epsilon differently.
1148
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-3, "group norm result from mps should match the one from the reference implementation");
1149
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1150
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1151
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1152
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1153
0
  ccv_nnc_graph_free(cpu_graph);
1154
0
}
1155
1156
TEST_CASE("compare rmsnorm with mps")
1157
1
{
1158
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1159
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1160
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1161
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
1162
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
1163
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
1164
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
1165
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
1166
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
1167
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
1168
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
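  // RMSNorm skips the mean subtraction: y = x / sqrt(mean(x^2) + eps) * scale, which is
  // why only saved_inv_std (and no saved_mean) is produced; the trailing 1, 2, 3 presumably
  // name the normalized axes, consistent with the (2, 1, 1, 1) saved_inv_std shape.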
1169
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
1170
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1171
0
  ccv_nnc_graph_t* graph = 0;
1172
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1173
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1174
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1175
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1176
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1177
0
  dsfmt_t dsfmt;
1178
0
  float xdata[2 * 2 * 2 * 10];
1179
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1180
0
  int i;
1181
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1182
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
1183
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
1184
0
  float scaledata[1 * 2 * 2 * 10];
1185
0
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
1186
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
1187
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
1188
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
1189
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1190
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1191
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1192
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1193
0
  ccv_nnc_graph_free(graph);
1194
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
1195
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
1196
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
1197
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
1198
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
1199
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
1200
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1201
0
  ccv_nnc_graph_t* cpu_graph = 0;
1202
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
1203
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
1204
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
1205
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
1206
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
1207
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
1208
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
1209
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
1210
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
1211
  // Note that MPS and my other implementations treat epsilon differently.
1212
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "rmsnorm result from mps should match the one from the reference implementation");
1213
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
1214
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1215
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
1216
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
1217
0
  ccv_nnc_graph_free(cpu_graph);
1218
0
}
1219
1220
TEST_CASE("compare add with mps")
1221
1
{
1222
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1223
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1224
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1225
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1226
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
1227
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
1228
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
1229
0
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
1230
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
1231
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
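  // The two scalars act as per-input coefficients, presumably c = 0.5 * a + 0.2 * b,
  // with b's (10, 5, 1, 3) shape broadcast along the third axis to match a's (10, 5, 5, 3).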
1232
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z), "transfer");
1233
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1234
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1235
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1236
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1237
0
  ccv_nnc_graph_t* graph = 0;
1238
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1239
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1240
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1241
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1242
0
  dsfmt_t dsfmt;
1243
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1244
0
  int i;
1245
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1246
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1247
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1248
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1249
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1250
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1251
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1252
0
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
1253
0
  REQUIRE_TENSOR_EQ(zt, z_tensor, "add should match");
1254
0
  ccv_nnc_tensor_free(x_tensor);
1255
0
  ccv_nnc_tensor_free(y_tensor);
1256
0
  ccv_nnc_tensor_free(zt);
1257
0
  ccv_nnc_graph_free(graph);
1258
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1259
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1260
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1261
0
}
1262
1263
TEST_CASE("compare add with mps in half precision")
1264
1
{
1265
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1266
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1267
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1268
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1269
0
  ccv_nnc_tensor_symbol_t x16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "x 16");
1270
0
  ccv_nnc_tensor_symbol_t y16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 1, 3), "y 16");
1271
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "a");
1272
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 1, 3), "b");
1273
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "c");
1274
0
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
1275
0
  ccv_nnc_tensor_symbol_t z16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "z 16");
1276
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(x16, y16), "convert");
1277
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x16, y16), TENSOR_SYMBOL_LIST(a, b), "transfer");
1278
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1279
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z16), "transfer");
1280
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(z16), TENSOR_SYMBOL_LIST(z), "convert");
1281
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1282
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1283
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1284
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1285
0
  ccv_nnc_graph_t* graph = 0;
1286
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1287
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1288
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1289
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1290
0
  dsfmt_t dsfmt;
1291
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1292
0
  int i;
1293
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1294
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1295
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1296
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1297
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1298
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1299
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1300
0
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
1301
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, zt->data.f32, z_tensor->data.f32, 10 * 5 * 5 * 3, 1e-3, "add should match");
1302
0
  ccv_nnc_tensor_free(x_tensor);
1303
0
  ccv_nnc_tensor_free(y_tensor);
1304
0
  ccv_nnc_tensor_free(zt);
1305
0
  ccv_nnc_graph_free(graph);
1306
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1307
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1308
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1309
0
}
1310
1311
TEST_CASE("compare add gradient with mps")
1312
1
{
1313
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1314
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1315
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1316
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1317
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1318
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
1319
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
1320
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
1321
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
1322
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1323
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1324
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1325
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1326
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1327
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1328
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1329
0
  ccv_nnc_graph_t* graph = 0;
1330
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1331
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1332
0
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
1333
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
1334
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
1335
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1336
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1337
0
  dsfmt_t dsfmt;
1338
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1339
0
  int i;
1340
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1341
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1342
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1343
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1344
0
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1345
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1346
0
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1347
0
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
1348
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
1349
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1350
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1351
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1352
0
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1353
0
  ccv_nnc_tensor_t* dyt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1354
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, dyt), 0);
1355
0
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
1356
0
  ccv_nnc_tensor_t* dy_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
1357
0
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
1358
0
  REQUIRE_TENSOR_EQ(dyt, dy_tensor, "backward pass should match");
1359
0
  ccv_nnc_tensor_free(x_tensor);
1360
0
  ccv_nnc_tensor_free(y_tensor);
1361
0
  ccv_nnc_tensor_free(dct);
1362
0
  ccv_nnc_tensor_free(zt);
1363
0
  ccv_nnc_tensor_free(dxt);
1364
0
  ccv_nnc_tensor_free(dyt);
1365
0
  ccv_nnc_graph_free(graph);
1366
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1367
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1368
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1369
0
}
1370
1371
TEST_CASE("compare add gradient with mps no dyt ")
1372
1
{
1373
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1374
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1375
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
1376
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
1377
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
1378
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
1379
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
1380
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
1381
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
1382
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
1383
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1384
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
1385
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1386
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1387
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1388
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
1389
0
  ccv_nnc_graph_t* graph = 0;
1390
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1391
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1392
0
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
1393
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
1394
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
1395
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1396
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1397
0
  dsfmt_t dsfmt;
1398
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1399
0
  int i;
1400
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1401
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1402
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
1403
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1404
0
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1405
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
1406
0
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1407
0
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
1408
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
1409
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1410
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1411
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
1412
0
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
1413
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, 0), 0);
1414
0
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
1415
0
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
1416
0
  ccv_nnc_tensor_free(x_tensor);
1417
0
  ccv_nnc_tensor_free(y_tensor);
1418
0
  ccv_nnc_tensor_free(dct);
1419
0
  ccv_nnc_tensor_free(zt);
1420
0
  ccv_nnc_tensor_free(dxt);
1421
0
  ccv_nnc_graph_free(graph);
1422
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1423
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1424
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1425
0
}
1426
1427
TEST_CASE("broadcasting semantics for add backward mps (a,b)")
1428
1
{
1429
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1430
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1431
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1432
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1433
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1434
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1435
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1436
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1437
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1438
0
  a->data.f32[0] = 1;
1439
0
  a->data.f32[1] = 2;
1440
0
  a->data.f32[2] = 3;
1441
0
  a->data.f32[3] = 4;
1442
0
  b->data.f32[0] = 5;
1443
0
  b->data.f32[1] = 6;
1444
0
  float ctp[] = {
1445
0
    6, 7,
1446
0
    7, 8,
1447
0
    8, 9,
1448
0
    9, 10
1449
0
  };
1450
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
1451
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1452
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1453
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1454
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1455
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1456
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
1457
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
1458
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
1459
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
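  // Assuming the backward pass scales the incoming gradient (here c itself) by the forward
  // coefficients and sum-reduces over the broadcast axes, the expected values are
  // da = 0.5 * {13, 15, 17, 19} = {6.5, 7.5, 8.5, 9.5} and db = 0.2 * {30, 34} = {6.0, 6.8}.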
1460
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
1461
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
1462
0
  ccv_nnc_tensor_free(a);
1463
0
  ccv_nnc_tensor_free(b);
1464
0
  ccv_nnc_tensor_free(c);
1465
0
  ccv_nnc_tensor_free(da);
1466
0
  ccv_nnc_tensor_free(db);
1467
0
  ccv_nnc_tensor_free(dat);
1468
0
  ccv_nnc_tensor_free(dbt);
1469
0
  ccv_nnc_tensor_free(ga);
1470
0
  ccv_nnc_tensor_free(gb);
1471
0
  ccv_nnc_tensor_free(gc);
1472
0
  ccv_nnc_tensor_free(gda);
1473
0
  ccv_nnc_tensor_free(gdb);
1474
0
}
1475
1476
TEST_CASE("broadcasting semantics for add backward mps (a, nil)")
1477
1
{
1478
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1479
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1480
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1481
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1482
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1483
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1484
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1485
0
  a->data.f32[0] = 1;
1486
0
  a->data.f32[1] = 2;
1487
0
  a->data.f32[2] = 3;
1488
0
  a->data.f32[3] = 4;
1489
0
  b->data.f32[0] = 5;
1490
0
  b->data.f32[1] = 6;
1491
0
  float ctp[] = {
1492
0
    6, 7,
1493
0
    7, 8,
1494
0
    8, 9,
1495
0
    9, 10
1496
0
  };
1497
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
1498
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1499
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1500
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1501
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1502
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
1503
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, ), TENSOR_LIST(gda, ), 0);
1504
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, ), TENSOR_LIST(da, ), 0);
1505
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, ), TENSOR_LIST(dat, ), 0);
1506
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
1507
0
  ccv_nnc_tensor_free(a);
1508
0
  ccv_nnc_tensor_free(b);
1509
0
  ccv_nnc_tensor_free(c);
1510
0
  ccv_nnc_tensor_free(da);
1511
0
  ccv_nnc_tensor_free(dat);
1512
0
  ccv_nnc_tensor_free(ga);
1513
0
  ccv_nnc_tensor_free(gb);
1514
0
  ccv_nnc_tensor_free(gc);
1515
0
  ccv_nnc_tensor_free(gda);
1516
0
}
1517
1518
TEST_CASE("broadcasting semantics for add backward mps (nil,b)")
1519
1
{
1520
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
1521
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
1522
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1523
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1524
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1525
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1526
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1527
0
  a->data.f32[0] = 1;
1528
0
  a->data.f32[1] = 2;
1529
0
  a->data.f32[2] = 3;
1530
0
  a->data.f32[3] = 4;
1531
0
  b->data.f32[0] = 5;
1532
0
  b->data.f32[1] = 6;
1533
0
  float ctp[] = {
1534
0
    6, 7,
1535
0
    7, 8,
1536
0
    8, 9,
1537
0
    9, 10
1538
0
  };
1539
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
1540
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1541
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1542
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1543
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1544
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
1545
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(0, gdb), 0);
1546
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdb), TENSOR_LIST(db), 0);
1547
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, 0, b), TENSOR_LIST(0, dbt), 0);
1548
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
1549
0
  ccv_nnc_tensor_free(a);
1550
0
  ccv_nnc_tensor_free(b);
1551
0
  ccv_nnc_tensor_free(c);
1552
0
  ccv_nnc_tensor_free(db);
1553
0
  ccv_nnc_tensor_free(dbt);
1554
0
  ccv_nnc_tensor_free(ga);
1555
0
  ccv_nnc_tensor_free(gb);
1556
0
  ccv_nnc_tensor_free(gc);
1557
0
  ccv_nnc_tensor_free(gdb);
1558
0
}
1559
1560
TEST_CASE("compare ewsum with mps")
1561
1
{
1562
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_MPS));
1563
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1564
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1565
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1566
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
1567
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1568
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1569
0
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1570
0
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1571
0
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1572
0
  int i;
1573
0
  for (i = 0; i < 100; i++)
1574
0
  {
1575
0
    ha->data.f32[i] = 1;
1576
0
    hb->data.f32[i] = 0.5;
1577
0
    hc->data.f32[i] = 0.25;
1578
0
    gd->data.f32[i] = 1.75;
1579
0
  }
1580
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(a, b, c), 0);
1581
0
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
1582
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd), 0);
1583
0
  REQUIRE_TENSOR_EQ(hd, gd, "ewsum result should be the same");
1584
0
  ccv_nnc_tensor_free(a);
1585
0
  ccv_nnc_tensor_free(b);
1586
0
  ccv_nnc_tensor_free(c);
1587
0
  ccv_nnc_tensor_free(d);
1588
0
  ccv_nnc_tensor_free(ha);
1589
0
  ccv_nnc_tensor_free(hb);
1590
0
  ccv_nnc_tensor_free(hc);
1591
0
  ccv_nnc_tensor_free(hd);
1592
0
  ccv_nnc_tensor_free(gd);
1593
0
}
1594
1595
TEST_CASE("compare ewsum with mps in half precision")
1596
1
{
1597
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_MPS));
1598
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1599
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1600
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1601
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
1602
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1603
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1604
0
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1605
0
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1606
0
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1607
0
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1608
0
  ccv_nnc_tensor_t* const hc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1609
0
  ccv_nnc_tensor_t* const hd16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
1610
0
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
1611
0
  int i;
1612
0
  for (i = 0; i < 100; i++)
1613
0
  {
1614
0
    ha->data.f32[i] = 1;
1615
0
    hb->data.f32[i] = 0.5;
1616
0
    hc->data.f32[i] = 0.25;
1617
0
    gd->data.f32[i] = 1.75;
1618
0
  }
1619
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(ha16, hb16, hc16), 0);
1620
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16, hc16), TENSOR_LIST(a, b, c), 0);
1621
0
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
1622
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd16), 0);
1623
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hd16), TENSOR_LIST(hd), 0);
1624
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hd->data.f32, gd->data.f32, 100, 1e-3, "ewsum result should be the same");
1625
0
  ccv_nnc_tensor_free(a);
1626
0
  ccv_nnc_tensor_free(b);
1627
0
  ccv_nnc_tensor_free(c);
1628
0
  ccv_nnc_tensor_free(d);
1629
0
  ccv_nnc_tensor_free(ha);
1630
0
  ccv_nnc_tensor_free(hb);
1631
0
  ccv_nnc_tensor_free(hc);
1632
0
  ccv_nnc_tensor_free(hd);
1633
0
  ccv_nnc_tensor_free(ha16);
1634
0
  ccv_nnc_tensor_free(hb16);
1635
0
  ccv_nnc_tensor_free(hc16);
1636
0
  ccv_nnc_tensor_free(hd16);
1637
0
  ccv_nnc_tensor_free(gd);
1638
0
}
1639
1640
TEST_CASE("compare transpose two tensor views")
1641
1
{
1642
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
1643
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
1644
0
  memset(ha->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
1645
0
  ccv_nnc_tensor_view_t ha_view = ccv_nnc_tensor_view(ha, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1646
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
1647
0
  memset(hb->data.f32, 0, sizeof(float) * 8 * 7 * 6 * 5);
1648
0
  ccv_nnc_tensor_view_t hb_view = ccv_nnc_tensor_view(hb, CPU_TENSOR_NHWC(32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
1649
0
  int i, j, k, l;
1650
0
  for (i = 0; i < 4; i++)
1651
0
    for (j = 0; j < 3; j++)
1652
0
      for (k = 0; k < 2; k++)
1653
0
        for (l = 0; l < 2; l++)
1654
0
          ha->data.f32[(i + 3) * 6 * 5 * 4 + (j + 2) * 5 * 4 + (k + 1) * 4 + l] = i * 3 * 2 * 2 + j * 2 * 2 + k * 2 + l;
1655
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&ha_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), 0);
1656
0
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
1657
0
  memset(hd->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
1658
0
  ccv_nnc_tensor_view_t hd_view = ccv_nnc_tensor_view(hd, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1659
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hd_view), 0);
1660
0
  REQUIRE_TENSOR_EQ(hd, ha, "4x3x2x2 tensor should be exactly the same.");
1661
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
1662
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1663
0
  ccv_nnc_tensor_view_t a_view = ccv_nnc_tensor_view(a, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1664
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 8, 7, 6, 5), 0);
1665
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(b), 0);
1666
0
  ccv_nnc_tensor_view_t b_view = ccv_nnc_tensor_view(b, GPU_TENSOR_NHWC(000, 32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
1667
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&a_view), TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), 0);
1668
0
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
1669
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(d), 0);
1670
0
  ccv_nnc_tensor_view_t d_view = ccv_nnc_tensor_view(d, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
1671
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), TENSOR_LIST((ccv_nnc_tensor_t*)&d_view), 0);
1672
0
  ccv_nnc_tensor_t* const hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
1673
0
  ccv_nnc_tensor_t* const hdt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
1674
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, d), TENSOR_LIST(hbt, hdt), 0);
1675
0
  REQUIRE_TENSOR_EQ(hbt, hb, "4x2x2x3 tensor should be exactly the same.");
1676
0
  REQUIRE_TENSOR_EQ(hdt, hd, "4x3x2x2 tensor should be exactly the same.");
1677
0
  ccv_nnc_tensor_free(ha);
1678
0
  ccv_nnc_tensor_free(hb);
1679
0
  ccv_nnc_tensor_free(hd);
1680
0
  ccv_nnc_tensor_free(hbt);
1681
0
  ccv_nnc_tensor_free(hdt);
1682
0
  ccv_nnc_tensor_free(a);
1683
0
  ccv_nnc_tensor_free(b);
1684
0
  ccv_nnc_tensor_free(d);
1685
0
}
1686
1687
TEST_CASE("broadcasting semantics for add [[1, 2, 3], [4, 5, 6]] + [7, 8, 9]")
1688
1
{
1689
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1690
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1691
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1692
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1693
0
  a->data.f32[0] = 1;
1694
0
  a->data.f32[1] = 2;
1695
0
  a->data.f32[2] = 3;
1696
0
  a->data.f32[3] = 4;
1697
0
  a->data.f32[4] = 5;
1698
0
  a->data.f32[5] = 6;
1699
0
  b->data.f32[0] = 7;
1700
0
  b->data.f32[1] = 8;
1701
0
  b->data.f32[2] = 9;
1702
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1703
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1704
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1705
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1706
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1707
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1708
0
  float ctp[] = {
1709
0
    8, 10, 12,
1710
0
    11, 13, 15
1711
0
  };
1712
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1713
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1714
0
  ccv_nnc_tensor_free(a);
1715
0
  ccv_nnc_tensor_free(b);
1716
0
  ccv_nnc_tensor_free(c);
1717
0
  ccv_nnc_tensor_free(ga);
1718
0
  ccv_nnc_tensor_free(gb);
1719
0
  ccv_nnc_tensor_free(gc);
1720
0
}
1721
1722
TEST_CASE("broadcasting semantics for add [[1], [2], [3], [4]] + [5, 6]")
1723
1
{
1724
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
1725
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1726
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1727
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1728
0
  a->data.f32[0] = 1;
1729
0
  a->data.f32[1] = 2;
1730
0
  a->data.f32[2] = 3;
1731
0
  a->data.f32[3] = 4;
1732
0
  b->data.f32[0] = 5;
1733
0
  b->data.f32[1] = 6;
1734
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1735
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1736
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1737
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1738
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1739
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1740
0
  float ctp[] = {
1741
0
    6, 7,
1742
0
    7, 8,
1743
0
    8, 9,
1744
0
    9, 10
1745
0
  };
1746
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1747
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1748
0
  ccv_nnc_tensor_free(a);
1749
0
  ccv_nnc_tensor_free(b);
1750
0
  ccv_nnc_tensor_free(c);
1751
0
  ccv_nnc_tensor_free(ga);
1752
0
  ccv_nnc_tensor_free(gb);
1753
0
  ccv_nnc_tensor_free(gc);
1754
0
}
1755
1756
TEST_CASE("broadcasting semantics for mul [[1, 2, 3], [4, 5, 6]] * [7, 8, 9]")
1757
1
{
1758
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1759
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1760
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1761
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1762
0
  a->data.f32[0] = 1;
1763
0
  a->data.f32[1] = 2;
1764
0
  a->data.f32[2] = 3;
1765
0
  a->data.f32[3] = 4;
1766
0
  a->data.f32[4] = 5;
1767
0
  a->data.f32[5] = 6;
1768
0
  b->data.f32[0] = 7;
1769
0
  b->data.f32[1] = 8;
1770
0
  b->data.f32[2] = 9;
1771
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1772
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1773
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1774
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1775
0
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1776
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1777
0
  float ctp[] = {
1778
0
    7, 16, 27,
1779
0
    28, 40, 54
1780
0
  };
1781
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1782
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1783
0
  ccv_nnc_tensor_free(a);
1784
0
  ccv_nnc_tensor_free(b);
1785
0
  ccv_nnc_tensor_free(c);
1786
0
  ccv_nnc_tensor_free(ga);
1787
0
  ccv_nnc_tensor_free(gb);
1788
0
  ccv_nnc_tensor_free(gc);
1789
0
}
1790
1791
TEST_CASE("broadcasting semantics for mul [[1], [2], [3], [4]] * [5, 6]")
1792
1
{
1793
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1794
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
1795
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
1796
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1797
0
  a->data.f32[0] = 1;
1798
0
  a->data.f32[1] = 2;
1799
0
  a->data.f32[2] = 3;
1800
0
  a->data.f32[3] = 4;
1801
0
  b->data.f32[0] = 5;
1802
0
  b->data.f32[1] = 6;
1803
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
1804
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
1805
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1806
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1807
0
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1808
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1809
0
  float ctp[] = {
1810
0
    5, 6,
1811
0
    10, 12,
1812
0
    15, 18,
1813
0
    20, 24
1814
0
  };
1815
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1816
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1817
0
  ccv_nnc_tensor_free(a);
1818
0
  ccv_nnc_tensor_free(b);
1819
0
  ccv_nnc_tensor_free(c);
1820
0
  ccv_nnc_tensor_free(ga);
1821
0
  ccv_nnc_tensor_free(gb);
1822
0
  ccv_nnc_tensor_free(gc);
1823
0
}
1824
1825
TEST_CASE("scalar mul [[1, 2, 3], [4, 5, 6]] * 0.3")
1826
1
{
1827
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
1828
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1829
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1830
0
  a->data.f32[0] = 1;
1831
0
  a->data.f32[1] = 2;
1832
0
  a->data.f32[2] = 3;
1833
0
  a->data.f32[3] = 4;
1834
0
  a->data.f32[4] = 5;
1835
0
  a->data.f32[5] = 6;
1836
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1837
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1838
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ga), 0);
1839
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.3), ccv_nnc_no_hint, 0, TENSOR_LIST(ga), TENSOR_LIST(gc), 0);
1840
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1841
0
  float ctp[] = {
1842
0
    0.3, 0.6, 0.9,
1843
0
    1.2, 1.5, 1.8,
1844
0
  };
1845
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1846
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1847
0
  ccv_nnc_tensor_free(a);
1848
0
  ccv_nnc_tensor_free(c);
1849
0
  ccv_nnc_tensor_free(ga);
1850
0
  ccv_nnc_tensor_free(gc);
1851
0
}
1852
1853
TEST_CASE("compare average pooling with mps")
1854
1
{
1855
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1856
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1857
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
1858
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
1859
0
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "avg_pool");
1860
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
1861
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1862
0
  ccv_nnc_graph_t* graph = 0;
1863
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1864
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1865
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1866
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1867
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1868
0
  dsfmt_t dsfmt;
1869
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1870
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1871
0
  int i;
1872
0
  for (i = 0; i < 7 * 7 * 10; i++)
1873
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1874
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1875
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
1876
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1877
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1878
0
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
1879
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1880
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1881
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
1882
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
1883
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1884
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1885
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1886
0
  ccv_nnc_graph_free(graph);
1887
0
  ccv_nnc_tensor_free(x_tensor);
1888
0
  ccv_nnc_tensor_free(y_tensor);
1889
0
  ccv_nnc_tensor_free(cpu_y);
1890
0
}
1891
1892
TEST_CASE("compare average pooling with mps in half precision")
1893
1
{
1894
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1895
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1896
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
1897
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
1898
0
  ccv_nnc_graph_exec_symbol_t avg_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "avg_pool");
1899
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, avg_pool, HINT((2, 2), (1, 1)));
1900
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1901
0
  ccv_nnc_graph_t* graph = 0;
1902
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1903
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1904
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1905
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1906
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1907
0
  dsfmt_t dsfmt;
1908
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1909
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1910
0
  int i;
1911
0
  for (i = 0; i < 7 * 7 * 10; i++)
1912
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1913
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
1914
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1915
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
1916
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
1917
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1918
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1919
0
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
1920
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1921
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1922
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
1923
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
1924
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
1925
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "mps result should be equal to cpu result");
1926
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1927
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1928
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1929
0
  ccv_nnc_graph_free(graph);
1930
0
  ccv_nnc_tensor_free(x_tensor);
1931
0
  ccv_nnc_tensor_free(x16_tensor);
1932
0
  ccv_nnc_tensor_free(y_tensor);
1933
0
  ccv_nnc_tensor_free(cpu_y);
1934
0
  ccv_nnc_tensor_free(cpu_y16);
1935
0
}
1936
1937
TEST_CASE("compare max pooling with mps")
1938
1
{
1939
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1940
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1941
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
1942
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
1943
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
1944
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
1945
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1946
0
  ccv_nnc_graph_t* graph = 0;
1947
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1948
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1949
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1950
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1951
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1952
0
  dsfmt_t dsfmt;
1953
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1954
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1955
0
  int i;
1956
0
  for (i = 0; i < 7 * 7 * 10; i++)
1957
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1958
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1959
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
1960
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
1961
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1962
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
1963
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
1964
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
1965
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
1966
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
1967
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
1968
0
  ccv_nnc_tensor_arena_free(tensor_arena);
1969
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
1970
0
  ccv_nnc_graph_free(graph);
1971
0
  ccv_nnc_tensor_free(x_tensor);
1972
0
  ccv_nnc_tensor_free(y_tensor);
1973
0
  ccv_nnc_tensor_free(cpu_y);
1974
0
}
1975
1976
TEST_CASE("compare max pooling with mps in half precision")
1977
1
{
1978
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
1979
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
1980
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
1981
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
1982
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
1983
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (1, 1)));
1984
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1985
0
  ccv_nnc_graph_t* graph = 0;
1986
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
1987
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
1988
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
1989
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
1990
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
1991
0
  dsfmt_t dsfmt;
1992
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1993
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
1994
0
  int i;
1995
0
  for (i = 0; i < 7 * 7 * 10; i++)
1996
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1997
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
1998
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
1999
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2000
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2001
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2002
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2003
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2004
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2005
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
2006
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2007
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2008
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2009
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "mps result should be equal to cpu result");
2010
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2011
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2012
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2013
0
  ccv_nnc_graph_free(graph);
2014
0
  ccv_nnc_tensor_free(x_tensor);
2015
0
  ccv_nnc_tensor_free(x16_tensor);
2016
0
  ccv_nnc_tensor_free(y_tensor);
2017
0
  ccv_nnc_tensor_free(cpu_y);
2018
0
  ccv_nnc_tensor_free(cpu_y16);
2019
0
}
2020
2021
TEST_CASE("compare max pooling 2x2 with mps")
2022
1
{
2023
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
2024
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2025
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 6, 6), "x");
2026
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 3, 3), "y");
2027
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2028
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
2029
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2030
0
  ccv_nnc_graph_t* graph = 0;
2031
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2032
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2033
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2034
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2035
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2036
0
  dsfmt_t dsfmt;
2037
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2038
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
2039
0
  int i, j;
2040
0
  for (i = 0; i < 6 * 6 * 10; i++)
2041
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2042
0
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
2043
0
  for (i = 0; i < 10; i++)
2044
0
    for (j = 0; j < 6 * 6; j++)
2045
0
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
2046
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2047
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2048
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2049
0
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2050
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
2051
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2052
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2053
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
2054
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2055
0
  for (i = 0; i < 10; i++)
2056
0
    for (j = 0; j < 3 * 3; j++)
2057
0
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
2058
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should be equal to cpu result");
2059
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2060
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2061
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2062
0
  ccv_nnc_graph_free(graph);
2063
0
  ccv_nnc_tensor_free(x_tensor);
2064
0
  ccv_nnc_tensor_free(y_tensor);
2065
0
  ccv_nnc_tensor_free(cpu_y);
2066
0
}
2067
2068
TEST_CASE("compare max pooling 2x2 with mps in half precision")
2069
1
{
2070
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
2071
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2072
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 6, 6), "x");
2073
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 3, 3), "y");
2074
0
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
2075
0
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
2076
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2077
0
  ccv_nnc_graph_t* graph = 0;
2078
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2079
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2080
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2081
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2082
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2083
0
  dsfmt_t dsfmt;
2084
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2085
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
2086
0
  int i, j;
2087
0
  for (i = 0; i < 6 * 6 * 10; i++)
2088
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2089
0
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
2090
0
  for (i = 0; i < 10; i++)
2091
0
    for (j = 0; j < 6 * 6; j++)
2092
0
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
2093
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2094
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 6, 6), 0);
2095
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2096
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2097
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2098
0
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
2099
0
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
2100
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2101
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 3, 3), 0);
2102
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2103
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2104
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2105
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
2106
0
  for (i = 0; i < 10; i++)
2107
0
    for (j = 0; j < 3 * 3; j++)
2108
0
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
2109
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 10 * 3 * 3, 1e-3, "mps result should be equal to cpu result");
2110
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2111
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2112
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2113
0
  ccv_nnc_graph_free(graph);
2114
0
  ccv_nnc_tensor_free(x_tensor);
2115
0
  ccv_nnc_tensor_free(x16_tensor);
2116
0
  ccv_nnc_tensor_free(y_tensor);
2117
0
  ccv_nnc_tensor_free(cpu_y);
2118
0
  ccv_nnc_tensor_free(cpu_y16);
2119
0
}
2120
2121
2122
TEST_CASE("mps mse mean loss forward")
2123
1
{
2124
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS));
2125
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2126
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2127
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 1), 0);
2128
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2129
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2130
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2131
0
  dsfmt_t dsfmt;
2132
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2133
0
  int i;
2134
0
  for (i = 0; i < 1000; i++)
2135
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2136
0
  for (i = 0; i < 1000; i++)
2137
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2138
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
2139
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2140
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2141
0
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2142
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(tc), 0);
2143
0
  REQUIRE_TENSOR_EQ(tc, hc, "MPS computed output should be the same as CPU computed ones");
2144
0
  ccv_nnc_tensor_free(a);
2145
0
  ccv_nnc_tensor_free(b);
2146
0
  ccv_nnc_tensor_free(c);
2147
0
  ccv_nnc_tensor_free(ha);
2148
0
  ccv_nnc_tensor_free(hb);
2149
0
  ccv_nnc_tensor_free(hc);
2150
0
  ccv_nnc_tensor_free(tc);
2151
0
}
2152
2153
TEST_CASE("mps mse sum loss forward")
2154
1
{
2155
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS));
2156
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2157
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2158
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 1), 0);
2159
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2160
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2161
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2162
0
  dsfmt_t dsfmt;
2163
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2164
0
  int i;
2165
0
  for (i = 0; i < 1000; i++)
2166
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2167
0
  for (i = 0; i < 1000; i++)
2168
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2169
  
2170
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
2171
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2172
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2173
0
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
2174
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(tc), 0);
2175
2176
0
  REQUIRE_TENSOR_EQ(tc, hc, "MPS computed output should be the same as CPU computed ones");
2177
0
  ccv_nnc_tensor_free(a);
2178
0
  ccv_nnc_tensor_free(b);
2179
0
  ccv_nnc_tensor_free(c);
2180
0
  ccv_nnc_tensor_free(ha);
2181
0
  ccv_nnc_tensor_free(hb);
2182
0
  ccv_nnc_tensor_free(hc);
2183
0
  ccv_nnc_tensor_free(tc);
2184
0
}
2185
2186
TEST_CASE("mps mse mean loss backward")
2187
1
{
2188
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
2189
1
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
2190
2191
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2192
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2193
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2194
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2195
0
  ccv_nnc_tensor_t* db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2196
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2197
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2198
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2199
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2200
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2201
0
  ccv_nnc_tensor_t* hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2202
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2203
0
  dsfmt_t dsfmt;
2204
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2205
0
  int i;
2206
0
  for (i = 0; i < 1000; i++)
2207
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2208
0
  for (i = 0; i < 1000; i++)
2209
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2210
0
  for (i = 0; i < 10; i++)
2211
0
    hg->data.f32[i] = 1;
2212
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2213
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2214
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, hdb), 0);
2215
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2216
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, db), 0);
2217
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2218
0
  ccv_nnc_tensor_t* tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2219
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, db), TENSOR_LIST(tda, tdb), 0);
2220
2221
0
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
2222
0
  REQUIRE_TENSOR_EQ(tdb, hdb, "MPS computed output should be the same as CPU computed ones");
2223
2224
0
  ccv_nnc_tensor_free(a);
2225
0
  ccv_nnc_tensor_free(b);
2226
0
  ccv_nnc_tensor_free(c);
2227
0
  ccv_nnc_tensor_free(da);
2228
0
  ccv_nnc_tensor_free(db);
2229
0
  ccv_nnc_tensor_free(g);
2230
0
  ccv_nnc_tensor_free(ha);
2231
0
  ccv_nnc_tensor_free(hb);
2232
0
  ccv_nnc_tensor_free(hc);
2233
0
  ccv_nnc_tensor_free(hda);
2234
0
  ccv_nnc_tensor_free(hdb);
2235
0
  ccv_nnc_tensor_free(hg);
2236
0
  ccv_nnc_tensor_free(tda);
2237
0
  ccv_nnc_tensor_free(tdb);
2238
0
}
2239
2240
TEST_CASE("mps mse sum loss backward")
2241
1
{
2242
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
2243
1
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
2244
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2245
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2246
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2247
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2248
0
  ccv_nnc_tensor_t* db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2249
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2250
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2251
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2252
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2253
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2254
0
  ccv_nnc_tensor_t* hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2255
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2256
0
  dsfmt_t dsfmt;
2257
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2258
0
  int i;
2259
0
  for (i = 0; i < 1000; i++)
2260
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2261
0
  for (i = 0; i < 1000; i++)
2262
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2263
0
  for (i = 0; i < 10; i++)
2264
0
    hg->data.f32[i] = 1;
2265
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2266
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2267
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, hdb), 0);
2268
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2269
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, db), 0);
2270
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2271
0
  ccv_nnc_tensor_t* tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2272
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, db), TENSOR_LIST(tda, tdb), 0);
2273
0
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
2274
0
  REQUIRE_TENSOR_EQ(tdb, hdb, "MPS computed output should be the same as CPU computed ones");
2275
0
  ccv_nnc_tensor_free(a);
2276
0
  ccv_nnc_tensor_free(b);
2277
0
  ccv_nnc_tensor_free(c);
2278
0
  ccv_nnc_tensor_free(da);
2279
0
  ccv_nnc_tensor_free(db);
2280
0
  ccv_nnc_tensor_free(g);
2281
0
  ccv_nnc_tensor_free(ha);
2282
0
  ccv_nnc_tensor_free(hb);
2283
0
  ccv_nnc_tensor_free(hc);
2284
0
  ccv_nnc_tensor_free(hda);
2285
0
  ccv_nnc_tensor_free(hdb);
2286
0
  ccv_nnc_tensor_free(hg);
2287
0
  ccv_nnc_tensor_free(tda);
2288
0
  ccv_nnc_tensor_free(tdb);
2289
0
}
2290
2291
2292
TEST_CASE("mps mse sum loss backward (no output db)")
2293
1
{
2294
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
2295
1
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
2296
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2297
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2298
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2299
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2300
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
2301
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2302
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2303
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2304
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2305
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
2306
0
  dsfmt_t dsfmt;
2307
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2308
0
  int i;
2309
0
  for (i = 0; i < 1000; i++)
2310
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2311
0
  for (i = 0; i < 1000; i++)
2312
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2313
0
  for (i = 0; i < 10; i++)
2314
0
    hg->data.f32[i] = 1;
2315
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2316
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2317
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, 0), 0);
2318
0
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2319
0
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, 0), 0);
2320
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2321
0
  ccv_nnc_tensor_t* tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2322
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, 0), TENSOR_LIST(tda, 0), 0);
2323
0
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
2324
0
  ccv_nnc_tensor_free(a);
2325
0
  ccv_nnc_tensor_free(b);
2326
0
  ccv_nnc_tensor_free(c);
2327
0
  ccv_nnc_tensor_free(da);
2328
0
  ccv_nnc_tensor_free(g);
2329
0
  ccv_nnc_tensor_free(ha);
2330
0
  ccv_nnc_tensor_free(hb);
2331
0
  ccv_nnc_tensor_free(hc);
2332
0
  ccv_nnc_tensor_free(hda);
2333
0
  ccv_nnc_tensor_free(hg);
2334
0
  ccv_nnc_tensor_free(tda);
2335
0
  ccv_nnc_tensor_free(tdb);
2336
0
}
2337
2338
TEST_CASE("mps leaky relu gradient in float")
2339
1
{
2340
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LEAKY_RELU_FORWARD, CCV_NNC_BACKEND_MPS) &&
2341
1
    ccv_nnc_cmd_ok(CCV_NNC_LEAKY_RELU_BACKWARD, CCV_NNC_BACKEND_MPS));
2342
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2343
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
2344
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
2345
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LEAKY_RELU_FORWARD(0.2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "leaky relu");
2346
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2347
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2348
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2349
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2350
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
2351
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
2352
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2353
0
  dsfmt_t dsfmt;
2354
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2355
0
  int i;
2356
0
  for (i = 0; i < 10 * 100; i++)
2357
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2358
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2359
0
  for (i = 0; i < 10 * 100; i++)
2360
0
    dy_tensor->data.f32[i] = 0;
2361
0
  for (i = 0; i < 10; i++)
2362
0
    dy_tensor->data.f32[i * 100 + i] = 1;
2363
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2364
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
2365
0
  ccv_nnc_graph_t* graph = 0;
2366
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2367
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2368
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2369
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2370
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2371
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2372
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2373
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2374
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
2375
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2376
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2377
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
2378
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
2379
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2380
0
  ccv_nnc_cmd_exec(CMD_LEAKY_RELU_FORWARD(0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
2381
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
2382
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2383
0
  ccv_nnc_cmd_exec(CMD_LEAKY_RELU_BACKWARD(0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, y_tensor), TENSOR_LIST(tdx_tensor), 0);
2384
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
2385
0
  ccv_nnc_tensor_free(x_tensor);
2386
0
  ccv_nnc_tensor_free(y_tensor);
2387
0
  ccv_nnc_tensor_free(dx_tensor);
2388
0
  ccv_nnc_tensor_free(dy_tensor);
2389
0
  ccv_nnc_tensor_free(ty_tensor);
2390
0
  ccv_nnc_tensor_free(tdx_tensor);
2391
0
  ccv_nnc_tensor_free(dyt);
2392
0
  ccv_nnc_graph_free(graph);
2393
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2394
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2395
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2396
0
}
2397
2398
TEST_CASE("compare layer norm gradient with mps")
2399
1
{
2400
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2401
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2402
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2403
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2404
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2405
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2406
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
2407
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "bias");
2408
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2409
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2410
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2411
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2412
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2413
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2414
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2415
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2416
0
  ccv_nnc_graph_t* graph = 0;
2417
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2418
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2419
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2420
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2421
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2422
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2423
0
  dsfmt_t dsfmt;
2424
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2425
0
  int i;
2426
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2427
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2428
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2429
2430
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2431
0
  float scaledata[1 * 2 * 2 * LN_DIM];
2432
0
  float biasdata[1 * 2 * 2 * LN_DIM];
2433
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2434
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2435
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2436
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2437
2438
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2439
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2440
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
2441
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2442
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2443
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2444
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2445
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2446
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2447
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2448
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2449
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2450
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2451
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
2452
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2453
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2454
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
2455
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2456
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2457
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2458
0
  ccv_nnc_graph_free(graph);
2459
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2460
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2461
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2462
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
2463
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "bias");
2464
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2465
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2466
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2467
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2468
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2469
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2470
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2471
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2472
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2473
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
2474
0
  ccv_nnc_graph_t* cpu_graph = 0;
2475
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2476
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2477
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2478
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2479
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2480
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2481
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2482
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2483
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2484
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
2485
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2486
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2487
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2488
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
2489
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2490
2491
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2492
0
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from mps should match the one from reference implementation");
2493
0
  REQUIRE_TENSOR_EQ(dbias_tensor, dcbias_tensor, "layer norm bias gradient result from mps should match the one from reference implementation");
2494
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2495
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2496
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2497
0
  ccv_nnc_graph_free(cpu_graph);
2498
0
  ccv_nnc_tensor_free(x_tensor);
2499
0
  ccv_nnc_tensor_free(dy_tensor);
2500
0
  ccv_nnc_tensor_free(dx_tensor);
2501
0
  ccv_nnc_tensor_free(dscale_tensor);
2502
0
  ccv_nnc_tensor_free(dbias_tensor);
2503
0
}
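A minimal standalone sketch of the layer norm forward pass these graphs exercise (layer_norm_forward is a hypothetical helper, not ccv API), assuming CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3) means epsilon 1e-4, elementwise affine, and normalization over axes 1, 2 and 3; here rows would be 2 (the batch axis) and cols 2 * 2 * LN_DIM (the flattened normalized axes):

#include <math.h>

// One mean / inv_std per row; scale and bias are applied elementwise over the cols.
static void layer_norm_forward(const float* x, const float* scale, const float* bias,
  float* y, float* saved_mean, float* saved_inv_std, int rows, int cols, float epsilon)
{
  int i, j;
  for (i = 0; i < rows; i++)
  {
    double sum = 0;
    for (j = 0; j < cols; j++)
      sum += x[i * cols + j];
    const float mean = (float)(sum / cols);
    double varsum = 0;
    for (j = 0; j < cols; j++)
      varsum += ((double)x[i * cols + j] - mean) * ((double)x[i * cols + j] - mean);
    const float inv_std = (float)(1.0 / sqrt(varsum / cols + epsilon));
    saved_mean[i] = mean;
    saved_inv_std[i] = inv_std;
    for (j = 0; j < cols; j++)
      y[i * cols + j] = (x[i * cols + j] - mean) * inv_std * scale[j] + bias[j];
  }
}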
2504
2505
TEST_CASE("compare layer norm gradient with mps (no bias)")
2506
1
{
2507
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2508
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2509
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2510
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2511
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2512
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2513
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
2514
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "bias");
2515
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2516
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2517
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2518
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2519
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2520
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2521
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2522
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2523
0
  ccv_nnc_graph_t* graph = 0;
2524
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2525
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2526
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2527
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2528
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2529
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2530
0
  dsfmt_t dsfmt;
2531
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2532
0
  int i;
2533
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2534
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2535
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2536
2537
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2538
0
  float scaledata[1 * 2 * 2 * LN_DIM];
2539
0
  float biasdata[1 * 2 * 2 * LN_DIM];
2540
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2541
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2542
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2543
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2544
2545
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2546
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2547
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
2548
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2549
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2550
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2551
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2552
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2553
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2554
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2555
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2556
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2557
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2558
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2559
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
2560
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2561
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2562
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2563
0
  ccv_nnc_graph_free(graph);
2564
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2565
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2566
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2567
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
2568
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "bias");
2569
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2570
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2571
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2572
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2573
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2574
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2575
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2576
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2577
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2578
0
  ccv_nnc_graph_t* cpu_graph = 0;
2579
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2580
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2581
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2582
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2583
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2584
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2585
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2586
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2587
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2588
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2589
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2590
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2591
2592
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2593
0
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from mps should match the one from reference implementation");
2594
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2595
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2596
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2597
0
  ccv_nnc_graph_free(cpu_graph);
2598
0
  ccv_nnc_tensor_free(x_tensor);
2599
0
  ccv_nnc_tensor_free(dy_tensor);
2600
0
  ccv_nnc_tensor_free(dx_tensor);
2601
0
  ccv_nnc_tensor_free(dscale_tensor);
2602
0
}
2603
2604
TEST_CASE("compare layer norm gradient with mps without scale / bias")
2605
1
{
2606
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2607
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2608
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2609
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2610
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2611
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2612
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2613
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2614
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2615
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2616
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2617
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2618
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2619
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2620
0
  ccv_nnc_graph_t* graph = 0;
2621
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2622
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2623
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2624
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2625
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2626
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2627
0
  dsfmt_t dsfmt;
2628
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2629
0
  int i;
2630
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2631
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2632
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2633
2634
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2635
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2636
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2637
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2638
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2639
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2640
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2641
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2642
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2643
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2644
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2645
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2646
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2647
0
  ccv_nnc_graph_free(graph);
2648
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2649
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2650
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2651
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2652
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2653
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2654
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2655
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2656
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2657
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2658
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2659
0
  ccv_nnc_graph_t* cpu_graph = 0;
2660
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2661
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2662
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2663
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2664
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2665
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2666
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2667
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2668
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2669
2670
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2671
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2672
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2673
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2674
0
  ccv_nnc_graph_free(cpu_graph);
2675
0
  ccv_nnc_tensor_free(x_tensor);
2676
0
  ccv_nnc_tensor_free(dy_tensor);
2677
0
  ccv_nnc_tensor_free(dx_tensor);
2678
0
}
2679
2680
TEST_CASE("compare layer norm gradient with mps (no bias) without scale / bias")
2681
1
{
2682
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2683
1
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2684
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2685
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2686
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2687
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2688
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2689
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2690
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2691
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2692
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2693
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2694
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2695
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2696
0
  ccv_nnc_graph_t* graph = 0;
2697
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2698
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2699
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2700
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2701
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2702
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2703
0
  dsfmt_t dsfmt;
2704
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2705
0
  int i;
2706
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2707
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2708
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2709
2710
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2711
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2712
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2713
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2714
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2715
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2716
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2717
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2718
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2719
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2720
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2721
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2722
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2723
0
  ccv_nnc_graph_free(graph);
2724
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2725
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2726
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2727
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2728
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2729
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2730
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2731
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2732
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2733
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2734
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2735
0
  ccv_nnc_graph_t* cpu_graph = 0;
2736
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2737
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2738
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2739
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2740
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2741
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2742
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2743
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2744
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2745
2746
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
2747
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2748
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2749
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2750
0
  ccv_nnc_graph_free(cpu_graph);
2751
0
  ccv_nnc_tensor_free(x_tensor);
2752
0
  ccv_nnc_tensor_free(dy_tensor);
2753
0
  ccv_nnc_tensor_free(dx_tensor);
2754
0
}
2755
2756
TEST_CASE("compare rmsnorm gradient with mps")
2757
1
{
2758
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2759
1
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
2760
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2761
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2762
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
2763
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
2764
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
2765
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2766
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
2767
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2768
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2769
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2770
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
2771
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
2772
0
  ccv_nnc_graph_t* graph = 0;
2773
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2774
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2775
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2776
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2777
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2778
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
2779
0
  dsfmt_t dsfmt;
2780
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2781
0
  int i;
2782
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2783
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2784
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
2785
2786
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
2787
0
  float scaledata[1 * 2 * 2 * LN_DIM];
2788
0
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
2789
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2790
2791
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2792
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
2793
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2794
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
2795
0
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
2796
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
2797
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
2798
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2799
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
2800
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
2801
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
2802
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
2803
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
2804
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
2805
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2806
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2807
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2808
0
  ccv_nnc_graph_free(graph);
2809
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2810
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
2811
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
2812
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
2813
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2814
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
2815
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2816
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
2817
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2818
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
2819
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
2820
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
2821
0
  ccv_nnc_graph_t* cpu_graph = 0;
2822
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2823
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2824
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2825
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2826
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2827
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
2828
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
2829
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2830
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
2831
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2832
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
2833
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
2834
2835
0
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "rmsnorm gradient result from mps should match the one from reference implementation");
2836
0
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "rmsnorm scale gradient result from mps should match the one from reference implementation");
2837
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2838
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2839
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2840
0
  ccv_nnc_graph_free(cpu_graph);
2841
0
  ccv_nnc_tensor_free(x_tensor);
2842
0
  ccv_nnc_tensor_free(dy_tensor);
2843
0
  ccv_nnc_tensor_free(dx_tensor);
2844
0
  ccv_nnc_tensor_free(dscale_tensor);
2845
0
}
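A minimal standalone sketch of the RMSNorm forward pass checked above (rmsnorm_forward is a hypothetical helper, not ccv API). It differs from layer norm only in skipping the mean subtraction and the bias, which is why this test only tracks saved_inv_std and a scale gradient:

#include <math.h>

static void rmsnorm_forward(const float* x, const float* scale, float* y,
  float* saved_inv_std, int rows, int cols, float epsilon)
{
  int i, j;
  for (i = 0; i < rows; i++)
  {
    double sqsum = 0;
    for (j = 0; j < cols; j++)
      sqsum += (double)x[i * cols + j] * x[i * cols + j];
    const float inv_rms = (float)(1.0 / sqrt(sqsum / cols + epsilon));
    saved_inv_std[i] = inv_rms;
    for (j = 0; j < cols; j++)
      y[i * cols + j] = x[i * cols + j] * inv_rms * scale[j];
  }
}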
2846
2847
TEST_CASE("mps backward convolution in nchw format")
2848
1
{
2849
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
2850
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2851
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2852
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2853
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
2854
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2855
0
  assert(cmd.backend >= 0);
2856
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
2857
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
2858
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2859
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2860
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
2861
  // configure the inlets.
2862
0
  dsfmt_t dsfmt;
2863
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2864
0
  int i;
2865
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
2866
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
2867
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2868
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2869
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2870
0
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
2871
  // Copy generated matrix values over to GPU.
2872
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2873
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2874
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2875
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2876
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2877
0
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2878
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2879
0
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
2880
0
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
2881
0
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
2882
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2883
0
  ccv_nnc_tensor_t* gbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
2884
0
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2885
0
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
2886
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
2887
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
2888
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
2889
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2890
2891
0
  assert(cmd.backend >= 0);
2892
0
  cmd.algorithm = -1;
2893
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2894
2895
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
2896
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context));
2897
0
  ccv_nnc_stream_context_wait(stream_context);
2898
0
  ccv_nnc_stream_context_free(stream_context);
2899
0
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2900
0
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2901
0
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, OUTPUT_DIM, 1, 1), 0);
2902
2903
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
2904
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
2905
2906
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
2907
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
2908
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
2909
0
  ccv_nnc_tensor_free(gao);
2910
0
  ccv_nnc_tensor_free(ggo);
2911
0
  ccv_nnc_tensor_free(gho);
2912
0
  ccv_nnc_tensor_free(gwo);
2913
0
  ccv_nnc_tensor_free(gbiaso);
2914
0
  ccv_nnc_tensor_free(gdwo);
2915
0
  ccv_nnc_tensor_free(gdbiaso);
2916
0
  ccv_nnc_tensor_free(h);
2917
0
  ccv_nnc_tensor_free(gh);
2918
0
  ccv_nnc_tensor_free(w);
2919
0
  ccv_nnc_tensor_free(g);
2920
0
  ccv_nnc_tensor_free(a);
2921
0
  ccv_nnc_tensor_free(gbias);
2922
0
  ccv_nnc_tensor_free(gdbias);
2923
0
  ccv_nnc_tensor_free(gdw);
2924
0
  ccv_nnc_tensor_free(gw);
2925
0
  ccv_nnc_tensor_free(gg);
2926
0
  ccv_nnc_tensor_free(ga);
2927
0
  ccv_nnc_tensor_free(ch);
2928
0
  ccv_nnc_tensor_free(cdw);
2929
0
  ccv_nnc_tensor_free(cdbias);
2930
0
}
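A minimal standalone sketch of the easiest slice of the backward pass verified above, the bias gradient (conv_bias_gradient is a hypothetical helper, not ccv API). With an NHWC output gradient g of shape [n][h][w][c], each output channel's bias gradient is simply the sum of g over the batch and spatial positions:

static void conv_bias_gradient(const float* g, float* dbias, int n, int h, int w, int c)
{
  int i, j;
  for (j = 0; j < c; j++)
    dbias[j] = 0;
  for (i = 0; i < n * h * w; i++)
    for (j = 0; j < c; j++)
      dbias[j] += g[i * c + j];
}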
2931
2932
TEST_CASE("mps backward convolution in nhwc format")
2933
1
{
2934
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
2935
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2936
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2937
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2938
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
2939
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2940
0
  assert(cmd.backend >= 0);
2941
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
2942
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
2943
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2944
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2945
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
2946
  // configure the inlets.
2947
0
  dsfmt_t dsfmt;
2948
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2949
0
  int i;
2950
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
2951
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
2952
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2953
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2954
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2955
0
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
2956
  // Copy generated matrix values over to GPU.
2957
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2958
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
2959
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2960
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2961
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2962
0
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2963
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
2964
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2965
0
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
2966
2967
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
2968
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
2969
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), 0);
2970
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2971
2972
0
  assert(cmd.backend >= 0);
2973
0
  cmd.algorithm = -1;
2974
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2975
2976
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(gg, ga, gwo), TENSOR_LIST(gh, gdwo, gdbias), stream_context);
2977
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(gg, ga, gwo), TENSOR_LIST(gh, gdwo, gdbias), stream_context));
2978
0
  ccv_nnc_stream_context_wait(stream_context);
2979
0
  ccv_nnc_stream_context_free(stream_context);
2980
0
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
2981
0
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
2982
0
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
2983
  
2984
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdwo), TENSOR_LIST(gdw), 0);
2985
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
2986
2987
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
2988
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
2989
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
2990
2991
0
  ccv_nnc_tensor_free(gwo);
2992
0
  ccv_nnc_tensor_free(gdwo);
2993
0
  ccv_nnc_tensor_free(h);
2994
0
  ccv_nnc_tensor_free(gh);
2995
0
  ccv_nnc_tensor_free(w);
2996
0
  ccv_nnc_tensor_free(g);
2997
0
  ccv_nnc_tensor_free(a);
2998
0
  ccv_nnc_tensor_free(gbias);
2999
0
  ccv_nnc_tensor_free(gdbias);
3000
0
  ccv_nnc_tensor_free(gdw);
3001
0
  ccv_nnc_tensor_free(gw);
3002
0
  ccv_nnc_tensor_free(gg);
3003
0
  ccv_nnc_tensor_free(ga);
3004
0
  ccv_nnc_tensor_free(ch);
3005
0
  ccv_nnc_tensor_free(cdw);
3006
0
  ccv_nnc_tensor_free(cdbias);
3007
0
}
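A minimal standalone sketch of the weight layout change CMD_FORMAT_TRANSFORM_FORWARD is used for above, assuming it amounts to a pure permutation from NHWC [O][Kh][Kw][I] to NCHW [O][I][Kh][Kw] (weights_nhwc_to_nchw is a hypothetical helper, not ccv API):

static void weights_nhwc_to_nchw(const float* src, float* dst, int o, int kh, int kw, int in)
{
  int a, b, c, d;
  for (a = 0; a < o; a++)
    for (b = 0; b < kh; b++)
      for (c = 0; c < kw; c++)
        for (d = 0; d < in; d++)
          dst[((a * in + d) * kh + b) * kw + c] = src[((a * kh + b) * kw + c) * in + d];
}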
3008
3009
TEST_CASE("mps backward convolution in nchw format with dilation 2, 3")
3010
1
{
3011
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
3012
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3013
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3014
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
3015
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
3016
0
  cmd.info.convolution.dilation[0] = 2;
3017
0
  cmd.info.convolution.dilation[1] = 3;
3018
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
3019
0
  assert(cmd.backend >= 0);
3020
0
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
3021
0
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
3022
0
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
3023
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, g->info);
3024
0
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, g->info) == 0);
3025
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3026
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3027
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
3028
  // configure the inlets.
3029
0
  dsfmt_t dsfmt;
3030
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3031
0
  int i;
3032
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
3033
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
3034
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
3035
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3036
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
3037
0
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
3038
  // Copy generated matrix values over to GPU.
3039
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3040
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
3041
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3042
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3043
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
3044
0
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3045
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
3046
0
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
3047
0
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
3048
0
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
3049
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
3050
0
  ccv_nnc_tensor_t* gbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
3051
0
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
3052
0
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
3053
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
3054
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
3055
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
3056
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
3057
3058
0
  assert(cmd.backend >= 0);
3059
0
  cmd.algorithm = -1;
3060
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
3061
3062
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
3063
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context));
3064
0
  ccv_nnc_stream_context_wait(stream_context);
3065
0
  ccv_nnc_stream_context_free(stream_context);
3066
0
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
3067
0
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
3068
0
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, OUTPUT_DIM, 1, 1), 0);
3069
3070
0
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
3071
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);
3072
3073
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
3074
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
3075
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
3076
0
  ccv_nnc_tensor_free(gao);
3077
0
  ccv_nnc_tensor_free(ggo);
3078
0
  ccv_nnc_tensor_free(gho);
3079
0
  ccv_nnc_tensor_free(gwo);
3080
0
  ccv_nnc_tensor_free(gbiaso);
3081
0
  ccv_nnc_tensor_free(gdwo);
3082
0
  ccv_nnc_tensor_free(gdbiaso);
3083
0
  ccv_nnc_tensor_free(h);
3084
0
  ccv_nnc_tensor_free(gh);
3085
0
  ccv_nnc_tensor_free(w);
3086
0
  ccv_nnc_tensor_free(g);
3087
0
  ccv_nnc_tensor_free(a);
3088
0
  ccv_nnc_tensor_free(gbias);
3089
0
  ccv_nnc_tensor_free(gdbias);
3090
0
  ccv_nnc_tensor_free(gdw);
3091
0
  ccv_nnc_tensor_free(gw);
3092
0
  ccv_nnc_tensor_free(gg);
3093
0
  ccv_nnc_tensor_free(ga);
3094
0
  ccv_nnc_tensor_free(ch);
3095
0
  ccv_nnc_tensor_free(cdw);
3096
0
  ccv_nnc_tensor_free(cdbias);
3097
0
}
3098
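A note on the preceding backward-convolution test: the two modified_cmd.size.dim assignments at its top enlarge the kernel extent to account for dilation before ccv_nnc_hint_auto derives the padding/stride hint. They implement the standard effective-extent identity k_eff = (k - 1) * max(d, 1) + 1, so, for example, KERNEL_SIZE = 7 with dilation 2 spans 13 input positions, while dilation 0 or 1 leaves the extent at 7.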
3099
TEST_CASE("compare group norm gradient with mps")
3100
1
{
3101
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3102
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3103
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3104
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3105
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3106
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3107
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3108
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3109
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3110
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3111
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3112
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3113
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3114
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3115
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3116
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3117
0
  ccv_nnc_graph_t* graph = 0;
3118
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3119
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3120
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3121
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3122
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3123
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3124
0
  dsfmt_t dsfmt;
3125
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3126
0
  int i;
3127
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3128
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3129
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3130
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3131
0
  float scaledata[1 * GN_C_DIM * 2 * LN_DIM];
3132
0
  float biasdata[1 * GN_C_DIM * 2 * LN_DIM];
3133
0
  for (i = 0; i < 1 * GN_C_DIM * 2 * LN_DIM; i++)
3134
0
  {
3135
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3136
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
3137
0
  }
3138
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3139
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3140
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
3141
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3142
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3143
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3144
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3145
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3146
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3147
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3148
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3149
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3150
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3151
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
3152
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
3153
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3154
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3155
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
3156
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3157
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3158
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3159
0
  ccv_nnc_graph_free(graph);
3160
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3161
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3162
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3163
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3164
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3165
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3166
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3167
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3168
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3169
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3170
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3171
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3172
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3173
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
3174
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
3175
0
  ccv_nnc_graph_t* cpu_graph = 0;
3176
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3177
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3178
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3179
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3180
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3181
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3182
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3183
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3184
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
3185
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
3186
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
3187
3188
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3189
3190
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3191
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
3192
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
3193
3194
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3195
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias_tensor->data.f32, dcbias_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3196
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3197
3198
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3199
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3200
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3201
0
  ccv_nnc_graph_free(cpu_graph);
3202
0
  ccv_nnc_tensor_free(x_tensor);
3203
0
  ccv_nnc_tensor_free(dy_tensor);
3204
0
  ccv_nnc_tensor_free(dx_tensor);
3205
0
  ccv_nnc_tensor_free(dscale_tensor);
3206
0
  ccv_nnc_tensor_free(dbias_tensor);
3207
0
}
3208
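For orientation before the group-norm gradient tests that follow (including variant 1, the no-dbias case, and the without scale / bias cases): they all compare the MPS backward pass against the CPU reference produced by ccv_nnc_symbolic_graph_backward. The quantities being compared follow the textbook group-norm derivation sketched below, stated here as a reference under the saved mean / inv-std convention used above, not as ccv's internal kernel. With \(\hat{x} = (x - \mu)\,\sigma^{-1}\) per normalization group (the group extent is whatever the saved_mean / saved_inv_std shapes reduce over):

\[
d\beta = \sum dy, \qquad d\gamma = \sum dy \odot \hat{x}, \qquad
dx = \sigma^{-1}\Big(\gamma \odot dy - \operatorname{mean}_G(\gamma \odot dy) - \hat{x} \odot \operatorname{mean}_G\big(\gamma \odot dy \odot \hat{x}\big)\Big),
\]

where \(\operatorname{mean}_G\) averages over each group, and the \(d\beta\), \(d\gamma\) sums run over the axes on which bias and scale are broadcast: the batch axis alone for the (1, GN_C_DIM, 2, LN_DIM) shapes, batch plus the trailing axes for the (1, GN_C_DIM, 1, 1) shapes of variant 1. In the without scale / bias variants, \(\gamma = 1\) and only \(dx\) is checked.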
3209
TEST_CASE("compare group norm gradient with mps, variant 1")
3210
1
{
3211
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3212
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3213
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3214
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3215
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3216
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3217
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 1, 1), "scale");
3218
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 1, 1), "bias");
3219
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3220
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
3221
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3222
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3223
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3224
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3225
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3226
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3227
0
  ccv_nnc_graph_t* graph = 0;
3228
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3229
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3230
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3231
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3232
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3233
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3234
0
  dsfmt_t dsfmt;
3235
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3236
0
  int i;
3237
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3238
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3239
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3240
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3241
0
  float scaledata[1 * GN_C_DIM * 1 * 1];
3242
0
  float biasdata[1 * GN_C_DIM * 1 * 1];
3243
0
  for (i = 0; i < 1 * GN_C_DIM * 1 * 1; i++)
3244
0
  {
3245
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3246
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
3247
0
  }
3248
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3249
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3250
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
3251
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3252
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3253
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3254
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3255
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3256
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3257
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3258
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3259
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3260
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3261
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
3262
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
3263
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3264
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
3265
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
3266
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3267
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3268
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3269
0
  ccv_nnc_graph_free(graph);
3270
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3271
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3272
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3273
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), "scale");
3274
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), "bias");
3275
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3276
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
3277
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3278
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3279
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3280
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3281
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3282
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3283
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
3284
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
3285
0
  ccv_nnc_graph_t* cpu_graph = 0;
3286
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3287
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3288
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3289
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3290
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3291
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3292
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3293
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3294
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 1 * 1);
3295
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
3296
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * GN_C_DIM * 1 * 1);
3297
3298
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3299
3300
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3301
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
3302
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
3303
3304
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3305
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias_tensor->data.f32, dcbias_tensor->data.f32, 1 * GN_C_DIM * 1 * 1, 1e-5, "group norm output from mps should match from CPU");
3306
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 1 * 1, 1e-5, "group norm output from mps should match from CPU");
3307
3308
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3309
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3310
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3311
0
  ccv_nnc_graph_free(cpu_graph);
3312
0
  ccv_nnc_tensor_free(x_tensor);
3313
0
  ccv_nnc_tensor_free(dy_tensor);
3314
0
  ccv_nnc_tensor_free(dx_tensor);
3315
0
  ccv_nnc_tensor_free(dscale_tensor);
3316
0
  ccv_nnc_tensor_free(dbias_tensor);
3317
0
}
3318
3319
TEST_CASE("compare group norm gradient with mps (no dbias)")
3320
1
{
3321
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3322
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3323
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3324
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3325
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3326
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3327
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3328
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3329
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3330
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3331
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3332
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3333
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3334
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3335
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3336
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3337
0
  ccv_nnc_graph_t* graph = 0;
3338
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3339
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3340
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3341
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3342
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3343
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3344
0
  dsfmt_t dsfmt;
3345
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3346
0
  int i;
3347
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3348
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3349
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3350
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3351
0
  float scaledata[1 * GN_C_DIM * 2 * LN_DIM];
3352
0
  float biasdata[1 * GN_C_DIM * 2 * LN_DIM];
3353
0
  for (i = 0; i < 1 * GN_C_DIM * 2 * LN_DIM; i++)
3354
0
  {
3355
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3356
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
3357
0
  }
3358
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3359
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3360
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
3361
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3362
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3363
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3364
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3365
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3366
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3367
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3368
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3369
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3370
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3371
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
3372
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
3373
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
3374
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3375
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3376
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3377
0
  ccv_nnc_graph_free(graph);
3378
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3379
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3380
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3381
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
3382
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
3383
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3384
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3385
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3386
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3387
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3388
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3389
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3390
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3391
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
3392
0
  ccv_nnc_graph_t* cpu_graph = 0;
3393
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3394
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3395
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3396
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3397
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3398
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3399
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3400
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3401
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
3402
3403
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3404
3405
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3406
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
3407
3408
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3409
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3410
3411
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3412
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3413
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3414
0
  ccv_nnc_graph_free(cpu_graph);
3415
0
  ccv_nnc_tensor_free(x_tensor);
3416
0
  ccv_nnc_tensor_free(dy_tensor);
3417
0
  ccv_nnc_tensor_free(dx_tensor);
3418
0
  ccv_nnc_tensor_free(dscale_tensor);
3419
0
}
3420
3421
TEST_CASE("compare group norm gradient with mps without scale / bias")
3422
1
{
3423
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3424
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3425
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3426
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3427
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3428
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3429
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3430
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3431
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3432
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3433
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3434
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3435
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3436
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3437
0
  ccv_nnc_graph_t* graph = 0;
3438
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3439
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3440
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3441
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3442
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3443
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3444
0
  dsfmt_t dsfmt;
3445
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3446
0
  int i;
3447
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3448
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3449
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3450
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3451
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3452
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3453
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3454
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3455
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3456
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3457
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3458
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3459
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3460
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3461
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3462
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3463
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3464
0
  ccv_nnc_graph_free(graph);
3465
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3466
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3467
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3468
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3469
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3470
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3471
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3472
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3473
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3474
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3475
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3476
0
  ccv_nnc_graph_t* cpu_graph = 0;
3477
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3478
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3479
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3480
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3481
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3482
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3483
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3484
3485
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3486
3487
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3488
3489
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3490
3491
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3492
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3493
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3494
0
  ccv_nnc_graph_free(cpu_graph);
3495
0
  ccv_nnc_tensor_free(x_tensor);
3496
0
  ccv_nnc_tensor_free(dy_tensor);
3497
0
  ccv_nnc_tensor_free(dx_tensor);
3498
0
}
3499
3500
TEST_CASE("compare group norm gradient with mps, variant 1 without scale / bias")
3501
1
{
3502
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3503
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3504
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3505
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3506
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3507
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3508
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3509
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
3510
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3511
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3512
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3513
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3514
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3515
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3516
0
  ccv_nnc_graph_t* graph = 0;
3517
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3518
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3519
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3520
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3521
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3522
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3523
0
  dsfmt_t dsfmt;
3524
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3525
0
  int i;
3526
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3527
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3528
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3529
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3530
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3531
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3532
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3533
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3534
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3535
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3536
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3537
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3538
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3539
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3540
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3541
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3542
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3543
0
  ccv_nnc_graph_free(graph);
3544
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3545
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3546
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3547
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
3548
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
3549
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3550
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3551
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3552
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3553
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3554
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3555
0
  ccv_nnc_graph_t* cpu_graph = 0;
3556
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3557
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3558
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3559
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3560
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3561
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3562
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3563
3564
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3565
3566
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3567
3568
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3569
3570
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3571
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3572
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3573
0
  ccv_nnc_graph_free(cpu_graph);
3574
0
  ccv_nnc_tensor_free(x_tensor);
3575
0
  ccv_nnc_tensor_free(dy_tensor);
3576
0
  ccv_nnc_tensor_free(dx_tensor);
3577
0
}
3578
3579
TEST_CASE("compare group norm gradient with mps (no dbias) without scale / bias")
3580
1
{
3581
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3582
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3583
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
3584
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3585
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3586
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3587
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3588
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3589
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
3590
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3591
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3592
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3593
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
3594
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
3595
0
  ccv_nnc_graph_t* graph = 0;
3596
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3597
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3598
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3599
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3600
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3601
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
3602
0
  dsfmt_t dsfmt;
3603
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3604
0
  int i;
3605
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3606
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3607
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
3608
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
3609
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3610
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3611
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
3612
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
3613
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
3614
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
3615
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3616
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
3617
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
3618
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
3619
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3620
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3621
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3622
0
  ccv_nnc_graph_free(graph);
3623
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
3624
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
3625
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
3626
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
3627
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
3628
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
3629
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3630
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
3631
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3632
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
3633
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
3634
0
  ccv_nnc_graph_t* cpu_graph = 0;
3635
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3636
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3637
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3638
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3639
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3640
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
3641
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
3642
3643
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3644
3645
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
3646
3647
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
3648
3649
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3650
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3651
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3652
0
  ccv_nnc_graph_free(cpu_graph);
3653
0
  ccv_nnc_tensor_free(x_tensor);
3654
0
  ccv_nnc_tensor_free(dy_tensor);
3655
0
  ccv_nnc_tensor_free(dx_tensor);
3656
0
}
3657
3658
TEST_CASE("broadcasting semantics for mul backward (a,b)")
3659
1
{
3660
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3661
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3662
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3663
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3664
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3665
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3666
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3667
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3668
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3669
0
  a->data.f32[0] = 1;
3670
0
  a->data.f32[1] = 2;
3671
0
  a->data.f32[2] = 3;
3672
0
  a->data.f32[3] = 4;
3673
0
  b->data.f32[0] = 5;
3674
0
  b->data.f32[1] = 6;
3675
0
  float ctp[] = {
3676
0
    6, 7,
3677
0
    7, 8,
3678
0
    8, 9,
3679
0
    9, 10
3680
0
  };
3681
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3682
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3683
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3684
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3685
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3686
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3687
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3688
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
3689
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3690
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
3691
3692
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3693
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3694
0
  ccv_nnc_tensor_free(a);
3695
0
  ccv_nnc_tensor_free(b);
3696
0
  ccv_nnc_tensor_free(c);
3697
0
  ccv_nnc_tensor_free(da);
3698
0
  ccv_nnc_tensor_free(db);
3699
0
  ccv_nnc_tensor_free(dat);
3700
0
  ccv_nnc_tensor_free(dbt);
3701
0
  ccv_nnc_tensor_free(ga);
3702
0
  ccv_nnc_tensor_free(gb);
3703
0
  ccv_nnc_tensor_free(gc);
3704
0
  ccv_nnc_tensor_free(gda);
3705
0
  ccv_nnc_tensor_free(gdb);
3706
0
}
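// A worked check of the broadcasting gradients above (analytic only; the test itself just
// asserts that MPS and the CPU reference agree). Assuming MUL_FORWARD(0.5) computes
// c = 0.5 * a * b, with c passed back in as the incoming gradient g:
//   da[i] = 0.5 * (g[i][0] * b[0] + g[i][1] * b[1]),  e.g. da[0] = 0.5 * (6 * 5 + 7 * 6) = 36,
//   db[j] = 0.5 * sum_i g[i][j] * a[i],               e.g. db[0] = 0.5 * (6 + 14 + 24 + 36) = 40.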
3707
3708
TEST_CASE("broadcasting semantics for mul backward (a, nil)")
3709
1
{
3710
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3711
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3712
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3713
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3714
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3715
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3716
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3717
0
  a->data.f32[0] = 1;
3718
0
  a->data.f32[1] = 2;
3719
0
  a->data.f32[2] = 3;
3720
0
  a->data.f32[3] = 4;
3721
0
  b->data.f32[0] = 5;
3722
0
  b->data.f32[1] = 6;
3723
0
  float ctp[] = {
3724
0
    6, 7,
3725
0
    7, 8,
3726
0
    8, 9,
3727
0
    9, 10
3728
0
  };
3729
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3730
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3731
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3732
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3733
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3734
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3735
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, 0), 0);
3736
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, 0), TENSOR_LIST(da, 0), 0);
3737
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, 0), 0);
3738
3739
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3740
0
  ccv_nnc_tensor_free(a);
3741
0
  ccv_nnc_tensor_free(b);
3742
0
  ccv_nnc_tensor_free(c);
3743
0
  ccv_nnc_tensor_free(da);
3744
0
  ccv_nnc_tensor_free(dat);
3745
0
  ccv_nnc_tensor_free(ga);
3746
0
  ccv_nnc_tensor_free(gb);
3747
0
  ccv_nnc_tensor_free(gc);
3748
0
  ccv_nnc_tensor_free(gda);
3749
0
}
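// Same setup as the (a,b) case above, but only the gradient for a is requested: the db
// slot in the output TENSOR_LIST is 0, so that output is expected to be skipped.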
3750
3751
TEST_CASE("broadcasting semantics for mul backward (nil,b)")
3752
1
{
3753
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3754
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3755
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3756
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3757
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3758
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3759
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3760
0
  a->data.f32[0] = 1;
3761
0
  a->data.f32[1] = 2;
3762
0
  a->data.f32[2] = 3;
3763
0
  a->data.f32[3] = 4;
3764
0
  b->data.f32[0] = 5;
3765
0
  b->data.f32[1] = 6;
3766
0
  float ctp[] = {
3767
0
    6, 7,
3768
0
    7, 8,
3769
0
    8, 9,
3770
0
    9, 10
3771
0
  };
3772
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3773
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3774
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3775
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3776
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3777
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3778
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(0, gdb), 0);
3779
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, gdb), TENSOR_LIST(0, db), 0);
3780
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(0, dbt), 0);
3781
3782
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3783
0
  ccv_nnc_tensor_free(a);
3784
0
  ccv_nnc_tensor_free(b);
3785
0
  ccv_nnc_tensor_free(c);
3786
0
  ccv_nnc_tensor_free(db);
3787
0
  ccv_nnc_tensor_free(dbt);
3788
0
  ccv_nnc_tensor_free(ga);
3789
0
  ccv_nnc_tensor_free(gb);
3790
0
  ccv_nnc_tensor_free(gc);
3791
0
  ccv_nnc_tensor_free(gdb);
3792
0
}
3793
3794
TEST_CASE("broadcasting semantics for mul backward (no output db)")
3795
1
{
3796
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3797
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3798
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3799
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3800
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3801
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3802
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3803
0
  a->data.f32[0] = 1;
3804
0
  a->data.f32[1] = 2;
3805
0
  a->data.f32[2] = 3;
3806
0
  a->data.f32[3] = 4;
3807
0
  b->data.f32[0] = 5;
3808
0
  b->data.f32[1] = 6;
3809
0
  float ctp[] = {
3810
0
    6, 7,
3811
0
    7, 8,
3812
0
    8, 9,
3813
0
    9, 10
3814
0
  };
3815
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3816
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3817
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3818
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3819
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3820
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3821
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, 0), 0);
3822
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, 0), TENSOR_LIST(da, 0), 0);
3823
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, 0), 0);
3824
3825
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3826
0
  ccv_nnc_tensor_free(a);
3827
0
  ccv_nnc_tensor_free(b);
3828
0
  ccv_nnc_tensor_free(c);
3829
0
  ccv_nnc_tensor_free(da);
3830
0
  ccv_nnc_tensor_free(dat);
3831
0
  ccv_nnc_tensor_free(ga);
3832
0
  ccv_nnc_tensor_free(gb);
3833
0
  ccv_nnc_tensor_free(gc);
3834
0
  ccv_nnc_tensor_free(gda);
3835
0
}
3836
3837
TEST_CASE("broadcasting semantics for mul backward (no input grad)")
3838
1
{
3839
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3840
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3841
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3842
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3843
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3844
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3845
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3846
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3847
0
  a->data.f32[0] = 1;
3848
0
  a->data.f32[1] = 2;
3849
0
  a->data.f32[2] = 3;
3850
0
  a->data.f32[3] = 4;
3851
0
  b->data.f32[0] = 5;
3852
0
  b->data.f32[1] = 6;
3853
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3854
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3855
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3856
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3857
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3858
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
3859
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3860
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
3861
3862
3863
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3864
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3865
0
  ccv_nnc_tensor_free(a);
3866
0
  ccv_nnc_tensor_free(b);
3867
0
  ccv_nnc_tensor_free(da);
3868
0
  ccv_nnc_tensor_free(db);
3869
0
  ccv_nnc_tensor_free(dat);
3870
0
  ccv_nnc_tensor_free(dbt);
3871
0
  ccv_nnc_tensor_free(ga);
3872
0
  ccv_nnc_tensor_free(gb);
3873
0
  ccv_nnc_tensor_free(gda);
3874
0
  ccv_nnc_tensor_free(gdb);
3875
0
}
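// Here the incoming gradient slot is 0, which the backward op presumably treats as an
// all-ones gradient. Under that assumption (and c = 0.5 * a * b), the analytic result is
// da[i] = 0.5 * (b[0] + b[1]) = 5.5 for every i and db[j] = 0.5 * (1 + 2 + 3 + 4) = 5;
// again the test only asserts that MPS matches the CPU reference.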
3876
3877
3878
TEST_CASE("broadcasting semantics for mul backward (no input grad) for b")
3879
1
{
3880
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3881
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3882
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3883
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3884
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3885
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3886
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3887
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3888
0
  a->data.f32[0] = 1;
3889
0
  a->data.f32[1] = 2;
3890
0
  a->data.f32[2] = 3;
3891
0
  a->data.f32[3] = 4;
3892
0
  a->data.f32[4] = 5;
3893
0
  a->data.f32[5] = 6;
3894
0
  b->data.f32[0] = 7;
3895
0
  b->data.f32[1] = 8;
3896
0
  b->data.f32[2] = 9;
3897
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3898
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3899
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3900
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3901
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3902
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
3903
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3904
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
3905
3906
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3907
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3908
0
  ccv_nnc_tensor_free(a);
3909
0
  ccv_nnc_tensor_free(b);
3910
0
  ccv_nnc_tensor_free(da);
3911
0
  ccv_nnc_tensor_free(db);
3912
0
  ccv_nnc_tensor_free(dat);
3913
0
  ccv_nnc_tensor_free(dbt);
3914
0
  ccv_nnc_tensor_free(ga);
3915
0
  ccv_nnc_tensor_free(gb);
3916
0
  ccv_nnc_tensor_free(gda);
3917
0
  ccv_nnc_tensor_free(gdb);
3918
0
}
3919
3920
TEST_CASE("broadcasting semantics for mul backward (no input grad) for a")
3921
1
{
3922
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
3923
1
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3924
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3925
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3926
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3927
0
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3928
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3929
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3930
0
  b->data.f32[0] = 1;
3931
0
  b->data.f32[1] = 2;
3932
0
  b->data.f32[2] = 3;
3933
0
  b->data.f32[3] = 4;
3934
0
  b->data.f32[4] = 5;
3935
0
  b->data.f32[5] = 6;
3936
0
  a->data.f32[0] = 7;
3937
0
  a->data.f32[1] = 8;
3938
0
  a->data.f32[2] = 9;
3939
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3940
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3941
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3942
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3943
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3944
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
3945
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3946
0
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);
3947
3948
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3949
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3950
0
  ccv_nnc_tensor_free(a);
3951
0
  ccv_nnc_tensor_free(b);
3952
0
  ccv_nnc_tensor_free(da);
3953
0
  ccv_nnc_tensor_free(db);
3954
0
  ccv_nnc_tensor_free(dat);
3955
0
  ccv_nnc_tensor_free(dbt);
3956
0
  ccv_nnc_tensor_free(ga);
3957
0
  ccv_nnc_tensor_free(gb);
3958
0
  ccv_nnc_tensor_free(gda);
3959
0
  ccv_nnc_tensor_free(gdb);
3960
0
}
3961
3962
TEST_CASE("mps scalar mul forward")
3963
1
{
3964
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS) &&
3965
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
3966
3967
0
  ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
3968
0
  ccv_nnc_tensor_t* const gx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
3969
  
3970
0
  dsfmt_t dsfmt;
3971
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3972
0
  int i;
3973
0
  for (i = 0; i < 4; i++)
3974
0
    x->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3975
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x), TENSOR_LIST(gx), 0);
3976
3977
0
  ccv_nnc_tensor_t* const gy = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
3978
3979
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(gx), TENSOR_LIST(gy), 0);
3980
3981
0
  ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
3982
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gy), TENSOR_LIST(y), 0);
3983
0
  for (i = 0; i < 4; i++) {
3984
0
    REQUIRE_EQ_WITH_TOLERANCE(x->data.f32[i] * 1.1, y->data.f32[i], 1e-5, "scalar mul forward y has to be 1.1 * x");
3985
0
  }
3986
3987
0
  ccv_nnc_tensor_free(x);
3988
0
  ccv_nnc_tensor_free(gx);
3989
0
  ccv_nnc_tensor_free(gy);
3990
0
  ccv_nnc_tensor_free(y);
3991
0
}
3992
3993
TEST_CASE("mps scalar mul backward")
3994
1
{
3995
1
  GUARD_ELSE_RETURN(
3996
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3997
3998
0
  ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
3999
4000
0
  dsfmt_t dsfmt;
4001
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4002
0
  int i;
4003
0
  for (i = 0; i < 4; i++)
4004
0
    y->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4005
0
  ccv_nnc_tensor_t* const gy = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4006
0
  ccv_nnc_tensor_t* const gdx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4007
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y), TENSOR_LIST(gy), 0);
4008
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_BACKWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(gy), TENSOR_LIST(gdx), 0);
4009
  
4010
0
  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
4011
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdx), TENSOR_LIST(dx), 0);
4012
4013
0
  for (i = 0; i < 4; i++) {
4014
0
    REQUIRE_EQ_WITH_TOLERANCE(dx->data.f32[i], y->data.f32[i] * 1.1, 1e-5, "scalar mul backward dx has to be 1.1 * dy");
4015
0
  }
4016
4017
0
  ccv_nnc_tensor_free(y);
4018
0
  ccv_nnc_tensor_free(gy);
4019
0
  ccv_nnc_tensor_free(gdx);
4020
0
  ccv_nnc_tensor_free(dx);
4021
0
}
4022
4023
TEST_CASE("mps scalar mul backward, no input")
4024
1
{
4025
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
4026
4027
0
  ccv_nnc_tensor_t* const gdx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
4028
0
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_BACKWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(0), TENSOR_LIST(gdx), 0);
4029
0
  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
4030
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdx), TENSOR_LIST(dx), 0);
4031
4032
0
  for (int i = 0; i < 4; i++)
4033
0
    REQUIRE_EQ_WITH_TOLERANCE(dx->data.f32[i], 1.1, 1e-5, "scalar mul backward without input should be 1.1");
4034
0
  ccv_nnc_tensor_free(gdx);
4035
0
  ccv_nnc_tensor_free(dx);
4036
0
}
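// With no incoming gradient, dx comes back as the scalar 1.1 itself, i.e. the same result
// as scaling an all-ones dy, which is exactly what the tolerance check above asserts.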
4037
4038
TEST_CASE("mps forward convolution transpose")
4039
1
{
4040
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
4041
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4042
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4043
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
4044
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
4045
0
  assert(cmd.backend >= 0);
4046
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
4047
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4048
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
4049
  // configure the inlets.
4050
0
  dsfmt_t dsfmt;
4051
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4052
0
  int i;
4053
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
4054
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
4055
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
4056
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4057
0
  for (i = 0; i < INPUT_DIM; i++)
4058
0
    bias->data.f32[i] = (float)i / INPUT_DIM;
4059
  // Copy generated matrix values over to GPU.
4060
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4061
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4062
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4063
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, INPUT_DIM), 0);
4064
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
4065
0
  move.backend = CCV_NNC_BACKEND_MPS;
4066
0
  assert(move.backend >= 0);
4067
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
4068
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
4069
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4070
4071
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
4072
0
  transform.backend = CCV_NNC_BACKEND_MPS;
4073
0
  assert(transform.backend >= 0);
4074
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
4075
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
4076
0
  ccv_nnc_stream_context_wait(stream_context);
4077
0
  ccv_nnc_tensor_free(gw);
4078
4079
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
4080
0
  assert(cmd.backend >= 0);
4081
0
  cmd.algorithm = -1;
4082
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
4083
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
4084
0
  ccv_nnc_stream_context_wait(stream_context);
4085
0
  ccv_nnc_stream_context_free(stream_context);
4086
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4087
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
4088
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
4089
0
  ccv_nnc_tensor_free(c);
4090
0
  ccv_nnc_tensor_free(gc);
4091
0
  ccv_nnc_tensor_free(bias);
4092
0
  ccv_nnc_tensor_free(w);
4093
0
  ccv_nnc_tensor_free(b);
4094
0
  ccv_nnc_tensor_free(a);
4095
0
  ccv_nnc_tensor_free(gbias);
4096
0
  ccv_nnc_tensor_free(gwo);
4097
0
  ccv_nnc_tensor_free(ga);
4098
0
}
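// Flow of the transpose-convolution test above: the CPU reference output is computed into b,
// the NHWC weights gw are reformatted to NCHW (gwo) on a GPU stream via FORMAT_TRANSFORM,
// the MPS command is autotuned with a 1 GiB workspace limit, and the device result is
// copied back and compared against b at 1e-4 tolerance.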
4099
4100
TEST_CASE("mps forward convolution transpose in nchw format")
4101
1
{
4102
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
4103
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
4104
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
4105
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
4106
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
4107
0
  assert(cmd.backend >= 0);
4108
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
4109
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4110
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, INPUT_DIM), 0);
4111
  // configure the inlets.
4112
0
  dsfmt_t dsfmt;
4113
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4114
0
  int i;
4115
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
4116
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
4117
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
4118
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4119
0
  for (i = 0; i < INPUT_DIM; i++)
4120
0
    bias->data.f32[i] = (float)i / INPUT_DIM;
4121
  // Copy generated matrix values over to GPU.
4122
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
4123
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4124
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, INPUT_DIM), 0);
4125
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
4126
0
  move.backend = CCV_NNC_BACKEND_MPS;
4127
0
  assert(move.backend >= 0);
4128
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
4129
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
4130
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
4131
4132
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
4133
0
  transform.backend = CCV_NNC_BACKEND_MPS;
4134
0
  assert(transform.backend >= 0);
4135
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
4136
0
  assert(cmd.backend >= 0);
4137
0
  cmd.algorithm = -1;
4138
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
4139
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
4140
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
4141
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
4142
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-5, "output from mps should match from CPU");
4143
0
  ccv_nnc_tensor_free(c);
4144
0
  ccv_nnc_tensor_free(gc);
4145
0
  ccv_nnc_tensor_free(bias);
4146
0
  ccv_nnc_tensor_free(w);
4147
0
  ccv_nnc_tensor_free(b);
4148
0
  ccv_nnc_tensor_free(a);
4149
0
  ccv_nnc_tensor_free(gbias);
4150
0
  ccv_nnc_tensor_free(gw);
4151
0
  ccv_nnc_tensor_free(ga);
4152
0
}
4153
4154
TEST_CASE("mps forward convolution transpose in half precision")
4155
1
{
4156
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
4157
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4158
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4159
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
4160
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
4161
0
  assert(cmd.backend >= 0);
4162
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
4163
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4164
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
4165
  // configure the inlets.
4166
0
  dsfmt_t dsfmt;
4167
0
  dsfmt_init_gen_rand(&dsfmt, 0);
4168
0
  int i;
4169
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
4170
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
4171
0
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
4172
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4173
0
  for (i = 0; i < INPUT_DIM; i++)
4174
0
    bias->data.f32[i] = (float)i / INPUT_DIM;
4175
0
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4176
0
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4177
0
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, INPUT_DIM), 0);
4178
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
4179
  // Copy generated matrix values over to GPU.
4180
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
4181
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
4182
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
4183
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, INPUT_DIM), 0);
4184
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
4185
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
4186
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4187
4188
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
4189
0
  transform.backend = CCV_NNC_BACKEND_MPS;
4190
0
  assert(transform.backend >= 0);
4191
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
4192
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
4193
0
  ccv_nnc_stream_context_wait(stream_context);
4194
0
  ccv_nnc_tensor_free(gw);
4195
4196
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
4197
0
  assert(cmd.backend >= 0);
4198
0
  cmd.algorithm = -1;
4199
0
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
4200
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
4201
0
  ccv_nnc_stream_context_wait(stream_context);
4202
0
  ccv_nnc_stream_context_free(stream_context);
4203
0
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4204
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
4205
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
4206
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
4207
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 5e-3, "output from mps should match from CPU");
4208
0
  ccv_nnc_tensor_free(c);
4209
0
  ccv_nnc_tensor_free(gc);
4210
0
  ccv_nnc_tensor_free(bias);
4211
0
  ccv_nnc_tensor_free(w);
4212
0
  ccv_nnc_tensor_free(b);
4213
0
  ccv_nnc_tensor_free(a);
4214
0
  ccv_nnc_tensor_free(c1);
4215
0
  ccv_nnc_tensor_free(bias1);
4216
0
  ccv_nnc_tensor_free(w1);
4217
0
  ccv_nnc_tensor_free(a1);
4218
0
  ccv_nnc_tensor_free(gbias);
4219
0
  ccv_nnc_tensor_free(gwo);
4220
0
  ccv_nnc_tensor_free(ga);
4221
0
}
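// The half-precision variant converts a, w and bias to 16F before upload but still compares
// against the 32F CPU reference, hence the looser 5e-3 tolerance.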
4222
4223
#include "case_main.h"