Coverage Report

Created: 2026-04-03 17:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsdnn.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
#include <nnc/ccv_nnc_internal.h>
9
10
// Test setup hook for this file: calls ccv_nnc_init() before the cases below use nnc commands.
TEST_SETUP()
{
  ccv_nnc_init();
}
14
15
0
// Channel counts of the convolution input / output used by the tests below.
#define INPUT_DIM (3)
#define OUTPUT_DIM (96)

// Spatial extent (height == width) of the convolution input / output.
#define INPUT_SIZE (224)
#define OUTPUT_SIZE (112)

// Side length of the square convolution kernel.
#define KERNEL_SIZE (7)

// Number of samples per batch.
#define BATCH_SIZE (16)

// Not referenced in this section of the file; presumably consumed by
// layer-norm / group-norm tests further down -- confirm before changing.
#define LN_DIM (10)
#define GN_C_DIM (16)
#define GN_RC_DIM (4)
28
29
// Compares the MPS convolution forward result against the CPU reference backend
// for NHWC activations and a KERNEL_SIZE x KERNEL_SIZE filter. The weights are
// uploaded as NHWC and format-transformed into an NCHW GPU tensor before use.
TEST_CASE("mps forward convolution")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_DIM; i++)
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);

  // Re-layout the weights from NHWC (gw) into the NCHW tensor (gwo) on the GPU stream.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // NOTE(review): only OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE elements are compared, while b and c
  // were allocated with BATCH_SIZE times as many -- confirm checking a single batch item is intended.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
91
92
// Compares the MPS convolution forward result against the CPU reference backend
// for NCHW activations and a KERNEL_SIZE x KERNEL_SIZE filter.
TEST_CASE("mps forward convolution in nchw format")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  // NOTE(review): w is tagged NHWC but its dims are listed in the same order as the NCHW
  // GPU tensor gw it is copied into -- confirm the format tag is intended.
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_DIM; i++)
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);

  // The format-transform command the original declared here was never executed, so it is dropped.
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // NOTE(review): only OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE elements are compared, while b and c
  // were allocated with BATCH_SIZE times as many -- confirm checking a single batch item is intended.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(ga);
}
146
147
// Compares the MPS convolution forward result against the CPU reference backend
// for NHWC activations and a 1x1 filter. The weights are uploaded as NHWC and
// format-transformed into an NCHW GPU tensor before use.
TEST_CASE("mps forward convolution with 1x1 kernel")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 1, 1, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_DIM; i++)
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 1, 1, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);

  // Re-layout the weights from NHWC (gw) into the NCHW tensor (gwo) on the GPU stream.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // NOTE(review): only OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE elements are compared, while b and c
  // were allocated with BATCH_SIZE times as many -- confirm checking a single batch item is intended.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
209
210
// Compares the MPS convolution forward result against the CPU reference backend
// for NCHW activations and a 1x1 filter with bias.
TEST_CASE("mps forward convolution in nchw format with 1x1 kernel")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  // NOTE(review): w is tagged NHWC but its dims are listed in the same order as the NCHW
  // GPU tensor gw it is copied into -- confirm the format tag is intended.
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_DIM; i++)
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);

  // The format-transform command the original declared here was never executed, so it is dropped.
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // NOTE(review): only OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE elements are compared, while b and c
  // were allocated with BATCH_SIZE times as many -- confirm checking a single batch item is intended.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(ga);
}
264
265
// Compares the MPS 1x1 convolution fed with row-wise 8i quantized weights against
// the CPU reference backend run on the original 32F weights.
TEST_CASE("mps forward convolution in nchw format with row-wise 8i weight")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_dim = 8;
  const int output_dim = 12;
  const int spatial = 9;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, input_dim, spatial, spatial), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, output_dim, spatial, spatial), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_dim, 1, 1, input_dim);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  // w holds the dense 32F weights; wq receives their row-wise 8i quantized form.
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, output_dim, input_dim, 1, 1), 0);
  ccv_nnc_tensor_t* wq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NCHW(32F, output_dim, input_dim, 1, 1)), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_dim), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < batch_size * input_dim * spatial * spatial; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Shift the random weights by -0.5 so both signs occur.
  for (i = 0; i < output_dim * input_dim; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5f;
  for (i = 0; i < output_dim; i++)
    bias->data.f32[i] = (float)i / output_dim;
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w->data.f32, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_dim * input_dim, 1, wq->data.u8, ccv_nnc_tensor_data_size_without_padding(wq->info));
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(wq->info), "row-wise 8i convolution weight should fit the tensor exactly");
  // CPU reference result (unquantized weights) goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, batch_size, input_dim, spatial, spatial), 0);
  ccv_nnc_tensor_t* gwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NCHW(000, 32F, output_dim, input_dim, 1, 1)), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_dim), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, batch_size, output_dim, spatial, spatial), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq, bias), TENSOR_LIST(ga, gwq, gbias), 0);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwq, gbias), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwq, gbias), TENSOR_LIST(gc), 0);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, output_dim, spatial, spatial), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_dim * spatial * spatial, 2e-3, "output from mps should match CPU when row-wise 8i weights are dequantized to dense scratch");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwq);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(wq);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
321
322
// Compares the MPS convolution forward result against the CPU reference backend
// for NCHW activations and a 1x1 filter when no bias tensor is supplied.
TEST_CASE("mps forward convolution in nchw format with 1x1 kernel and no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 1, 1, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  // NOTE(review): w is tagged NHWC but its dims are listed in the same order as the NCHW
  // GPU tensor gw it is copied into -- confirm the format tag is intended.
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 1);
  int i;
  for (i = 0; i < INPUT_DIM * 1 * 1 * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * 1 * 1);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the input and weights to the GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 1, 1), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw), TENSOR_LIST(gc), 0);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // NOTE(review): only OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE elements are compared, while b and c
  // were allocated with BATCH_SIZE times as many -- confirm checking a single batch item is intended.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(ga);
}
365
366
// Compares the MPS 1x1 convolution (with bias) against the CPU reference backend
// on small odd-sized dimensions (17 input channels, 77 output channels, 17x19 spatial).
TEST_CASE("mps forward convolution in nchw format with 1x1 kernel on edge tiles")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_dim = 17;
  const int output_dim = 77;
  const int output_h = 17;
  const int output_w = 19;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, input_dim, output_h, output_w), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, output_dim, output_h, output_w), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_dim, 1, 1, input_dim);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  // NOTE(review): w is tagged NHWC but its dims are listed in the same order as the NCHW
  // GPU tensor gw it is copied into -- confirm the format tag is intended.
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_dim, input_dim, 1, 1), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_dim), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 2);
  int i;
  for (i = 0; i < input_dim * output_dim; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / input_dim;
  for (i = 0; i < batch_size * input_dim * output_h * output_w; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < output_dim; i++)
    bias->data.f32[i] = (float)i / output_dim;
  // Upload the input, weights and bias to the GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, batch_size, input_dim, output_h, output_w), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_dim, input_dim, 1, 1), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_dim), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, batch_size, output_dim, output_h, output_w), 0);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, output_dim, output_h, output_w), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_dim * output_h * output_w, 1e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(ga);
}
420
421
// Compares the MPS 1x1 convolution (no bias tensor) against the CPU reference backend
// on small odd-sized dimensions (17 input channels, 77 output channels, 17x19 spatial).
TEST_CASE("mps forward convolution in nchw format with 1x1 kernel on edge tiles and no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_dim = 17;
  const int output_dim = 77;
  const int output_h = 17;
  const int output_w = 19;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, input_dim, output_h, output_w), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, output_dim, output_h, output_w), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_dim, 1, 1, input_dim);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  // NOTE(review): w is tagged NHWC but its dims are listed in the same order as the NCHW
  // GPU tensor gw it is copied into -- confirm the format tag is intended.
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_dim, input_dim, 1, 1), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 3);
  int i;
  for (i = 0; i < input_dim * output_dim; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / input_dim;
  for (i = 0; i < batch_size * input_dim * output_h * output_w; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the input and weights to the GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, batch_size, input_dim, output_h, output_w), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_dim, input_dim, 1, 1), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, batch_size, output_dim, output_h, output_w), 0);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the MPS run is not compiled away when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw), TENSOR_LIST(gc), 0);
  REQUIRE_EQ(exec_status, CCV_NNC_EXEC_SUCCESS, "mps convolution forward should execute successfully");
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, output_dim, output_h, output_w), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_dim * output_h * output_w, 1e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(ga);
}
469
470
TEST_CASE("mps forward convolution in half precision")
471
1
{
472
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
473
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
474
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
475
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
476
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
477
0
  assert(cmd.backend >= 0);
478
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
479
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
480
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
481
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
482
  // configure the inlets.
483
0
  dsfmt_t dsfmt;
484
0
  dsfmt_init_gen_rand(&dsfmt, 0);
485
0
  int i;
486
0
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
487
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
488
0
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
489
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
490
0
  for (i = 0; i < OUTPUT_DIM; i++)
491
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
492
0
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
493
0
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
494
0
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
495
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
496
  // Copy generated matrix values over to GPU.
497
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
498
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
499
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
500
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
501
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
502
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
503
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
504
505
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
506
0
  transform.backend = CCV_NNC_BACKEND_MPS;
507
0
  assert(transform.backend >= 0);
508
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
509
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
510
0
  ccv_nnc_stream_context_wait(stream_context);
511
0
  ccv_nnc_tensor_free(gw);
512
513
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
514
0
  assert(cmd.backend >= 0);
515
0
  cmd.algorithm = -1;
516
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
517
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
518
0
  ccv_nnc_stream_context_wait(stream_context);
519
0
  ccv_nnc_stream_context_free(stream_context);
520
0
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
521
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
522
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
523
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
524
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from mps should match from CPU");
525
0
  ccv_nnc_tensor_free(c);
526
0
  ccv_nnc_tensor_free(gc);
527
0
  ccv_nnc_tensor_free(bias);
528
0
  ccv_nnc_tensor_free(w);
529
0
  ccv_nnc_tensor_free(b);
530
0
  ccv_nnc_tensor_free(a);
531
0
  ccv_nnc_tensor_free(c1);
532
0
  ccv_nnc_tensor_free(bias1);
533
0
  ccv_nnc_tensor_free(w1);
534
0
  ccv_nnc_tensor_free(a1);
535
0
  ccv_nnc_tensor_free(gbias);
536
0
  ccv_nnc_tensor_free(gwo);
537
0
  ccv_nnc_tensor_free(ga);
538
0
}
539
540
// Runs a 2D forward convolution with dilation 2 and 3 on the MPS backend and
// checks the result against the CPU reference backend on identical inputs.
TEST_CASE("mps forward convolution with dilation 2, 3")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  cmd.info.convolution.dilation[0] = 2;
  cmd.info.convolution.dilation[1] = 3;
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  // The hint is derived from the dilated kernel extent, (k - 1) * dilation + 1,
  // instead of the raw kernel size stored in cmd.info.
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  // Weights are scaled down by the number of taps per output value.
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_DIM; i++)
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // CPU reference result goes into b.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);

  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // NOTE(review): only OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE elements are compared, which
  // looks like a single batch item rather than all BATCH_SIZE of them - confirm this is intended.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-5, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
607
608
// Runs a forward convolution with a 3 x KERNEL_SIZE x KERNEL_SIZE kernel over a
// 5-deep input on the MPS backend and checks every output element against the
// CPU reference backend.
TEST_CASE("mps forward convolution 3d")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  // Override axis 0 of the auto-derived hint (stride 2, border 1 on both ends) before verifying it.
  hint.stride.dim[0] = 2;
  hint.border.begin[0] = 1;
  hint.border.end[0] = 1;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * 3 * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_DIM; i++)
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 5, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, 3, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Same command, now on the CPU reference backend, writes the expected values into b.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
671
672
// Runs a bias-free forward convolution with a 3x3x3 kernel (5x10x10 input,
// 3x8x8 output, 16 -> 32 channels, batch of 2) on the MPS backend and checks
// every output element against the CPU reference backend.
TEST_CASE("mps forward convolution 3d via mfa conv3d")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 10;
  const int input_width = 10;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int output_depth = 3;
  const int output_height = 8;
  const int output_width = 8;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the generated inputs reproducible from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 1);
  int i;
  // The generator output is shifted by -0.5 for the input, and additionally scaled by 0.1 for the weights.
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the generated input and weights over to the GPU tensors.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Same command, now on the CPU reference backend, writes the expected values into b.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
736
737
// Runs a forward convolution with a 3x3x3 kernel and a bias term (5x10x10
// input, 3x8x8 output, 16 -> 32 channels, batch of 2) on the MPS backend and
// checks every output element against the CPU reference backend.
TEST_CASE("mps forward convolution 3d via mfa conv3d with bias")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 10;
  const int input_width = 10;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int output_depth = 3;
  const int output_height = 8;
  const int output_width = 8;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels), 0);
  // Fixed seed keeps the generated inputs reproducible from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 2);
  int i;
  // The generator output is shifted by -0.5 for the input, and additionally scaled by 0.1 for weights and bias.
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  for (i = 0; i < output_channels; i++)
    bias->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the generated input, weights and bias over to the GPU tensors.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Same command, now on the CPU reference backend, writes the expected values into b.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
807
808
// Runs a bias-free forward convolution with a 3x3x3 kernel on a 5x17x17 input
// (3x15x15 output, 16 -> 32 channels, batch of 2) on the MPS backend and checks
// every output element against the CPU reference backend.
TEST_CASE("mps forward convolution 3d via mfa conv3d with no padding on 17x17 spatial dimensions")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 17;
  const int input_width = 17;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int output_depth = 3;
  const int output_height = 15;
  const int output_width = 15;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the generated inputs reproducible from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 21);
  int i;
  // The generator output is shifted by -0.5 for the input, and additionally scaled by 0.1 for the weights.
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the generated input and weights over to the GPU tensors.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Same command, now on the CPU reference backend, writes the expected values into b.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
872
873
// Runs a bias-free forward convolution with a 3x3x3 kernel on a 5x65x65 input
// (3x63x63 output, 16 -> 32 channels, batch of 2) on the MPS backend and checks
// every output element against the CPU reference backend.
TEST_CASE("mps forward convolution 3d via mfa conv3d with no padding on 65x65 spatial dimensions")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 65;
  const int input_width = 65;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int output_depth = 3;
  const int output_height = 63;
  const int output_width = 63;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the generated inputs reproducible from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 22);
  int i;
  // The generator output is shifted by -0.5 for the input, and additionally scaled by 0.1 for the weights.
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the generated input and weights over to the GPU tensors.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Same command, now on the CPU reference backend, writes the expected values into b.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
937
938
// Runs a bias-free forward convolution with a 3x3x3 kernel on a 5x129x129 input
// (3x127x127 output, 16 -> 32 channels, batch of 2) on the MPS backend and
// checks every output element against the CPU reference backend.
TEST_CASE("mps forward convolution 3d via mfa conv3d with no padding on 129x129 spatial dimensions")
{
  // Nothing to test unless forward convolution is reported as usable on the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 129;
  const int input_width = 129;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int output_depth = 3;
  const int output_height = 127;
  const int output_width = 127;
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the generated inputs reproducible from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 23);
  int i;
  // The generator output is shifted by -0.5 for the input, and additionally scaled by 0.1 for the weights.
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the generated input and weights over to the GPU tensors.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the NHWC weights into the NCHW tensor gwo that is fed to the MPS convolution.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  // The NHWC weight copy is not used again once gwo has been produced.
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the MPS run is not compiled out when NDEBUG is defined.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == exec_status);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Same command, now on the CPU reference backend, writes the expected values into b.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1002
1003
// Runs a 3D convolution with a 3x5x5 (depth x height x width) kernel on the MPS backend and
// checks the result against the CPU reference backend within a 1e-4 tolerance.
// The test returns early when the MPS backend does not offer CCV_NNC_CONVOLUTION_FORWARD.
TEST_CASE("mps forward convolution 3d via mfa conv3d 5x5")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 12;
  const int input_width = 12;
  const int kernel_depth = 3;
  const int kernel_height = 5;
  const int kernel_width = 5;
  // Output extents are input - kernel + 1 on each axis: 5-3+1, 12-5+1, 12-5+1.
  const int output_depth = 3;
  const int output_height = 8;
  const int output_width = 8;
  // a is the host input, b receives the CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // The hint is derived from the command and the input/output shapes, then sanity-checked.
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the inputs reproducible; values are shifted by -0.5 and the weights
  // are additionally scaled by 0.1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 3);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: ga/gw mirror a/w, gwo holds the weights in NCHW order
  // (output, input, depth, height, width), gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the host input and weights into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the weights from gw into gwo on the GPU stream; gw is not needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably marks the algorithm as unselected before autotune - confirm.
  cmd.algorithm = -1;
  // NOTE(review): the second argument looks like a 1 GiB workspace limit - confirm.
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Keep the execution outside assert() so it still runs when NDEBUG compiles assertions away.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host as c.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Produce the reference result with the same command and hint on the CPU backend.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1067
1068
// Runs a 3D convolution with a 3x7x7 (depth x height x width) kernel on the MPS backend and
// checks the result against the CPU reference backend within a 1e-4 tolerance.
// The test returns early when the MPS backend does not offer CCV_NNC_CONVOLUTION_FORWARD.
TEST_CASE("mps forward convolution 3d via mfa conv3d 7x7")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 14;
  const int input_width = 14;
  const int kernel_depth = 3;
  const int kernel_height = 7;
  const int kernel_width = 7;
  // Output extents are input - kernel + 1 on each axis: 5-3+1, 14-7+1, 14-7+1.
  const int output_depth = 3;
  const int output_height = 8;
  const int output_width = 8;
  // a is the host input, b receives the CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // The hint is derived from the command and the input/output shapes, then sanity-checked.
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the inputs reproducible; values are shifted by -0.5 and the weights
  // are additionally scaled by 0.1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 4);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: ga/gw mirror a/w, gwo holds the weights in NCHW order
  // (output, input, depth, height, width), gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the host input and weights into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the weights from gw into gwo on the GPU stream; gw is not needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably marks the algorithm as unselected before autotune - confirm.
  cmd.algorithm = -1;
  // NOTE(review): the second argument looks like a 1 GiB workspace limit - confirm.
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Keep the execution outside assert() so it still runs when NDEBUG compiles assertions away.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host as c.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Produce the reference result with the same command and hint on the CPU backend.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1132
1133
// Runs a 3x3x3 3D convolution with one element of padding on every height/width edge on the
// MPS backend and checks the result against the CPU reference backend within 1e-4.
// The test returns early when the MPS backend does not offer CCV_NNC_CONVOLUTION_FORWARD.
TEST_CASE("mps forward convolution 3d via mfa conv3d with padding")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 10;
  const int input_width = 10;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Depth is unpadded; height and width grow by their padding before the kernel is applied.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a is the host input, b receives the CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hand-built hint: stride 1 on all three axes, zero border on axis 0 and the padding
  // constants on axes 1 (top/bottom) and 2 (left/right).
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the inputs reproducible; values are shifted by -0.5 and the weights
  // are additionally scaled by 0.1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 5);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: ga/gw mirror a/w, gwo holds the weights in NCHW order
  // (output, input, depth, height, width), gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the host input and weights into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the weights from gw into gwo on the GPU stream; gw is not needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably marks the algorithm as unselected before autotune - confirm.
  cmd.algorithm = -1;
  // NOTE(review): the second argument looks like a 1 GiB workspace limit - confirm.
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Keep the execution outside assert() so it still runs when NDEBUG compiles assertions away.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host as c.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Produce the reference result with the same command and hint on the CPU backend.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1210
1211
// Runs a 3x5x5 3D convolution whose height/width padding differs per edge (top 1, bottom 0,
// left 2, right 1) and whose input is non-square (10x11) on the MPS backend, then checks the
// result against the CPU reference backend within 1e-4.
// The test returns early when the MPS backend does not offer CCV_NNC_CONVOLUTION_FORWARD.
TEST_CASE("mps forward convolution 3d via mfa conv3d with asymmetric padding")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 10;
  const int input_width = 11;
  const int kernel_depth = 3;
  const int kernel_height = 5;
  const int kernel_width = 5;
  const int padding_top = 1;
  const int padding_bottom = 0;
  const int padding_left = 2;
  const int padding_right = 1;
  // Depth is unpadded; height and width grow by their padding before the kernel is applied.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a is the host input, b receives the CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hand-built hint: stride 1 on all three axes, zero border on axis 0 and the padding
  // constants on axes 1 (top/bottom) and 2 (left/right).
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the inputs reproducible; values are shifted by -0.5 and the weights
  // are additionally scaled by 0.1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 6);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: ga/gw mirror a/w, gwo holds the weights in NCHW order
  // (output, input, depth, height, width), gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the host input and weights into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the weights from gw into gwo on the GPU stream; gw is not needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably marks the algorithm as unselected before autotune - confirm.
  cmd.algorithm = -1;
  // NOTE(review): the second argument looks like a 1 GiB workspace limit - confirm.
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Keep the execution outside assert() so it still runs when NDEBUG compiles assertions away.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host as c.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Produce the reference result with the same command and hint on the CPU backend.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1288
1289
// Runs a padded 3x3x3 3D convolution on a small 5x5x5 spatial input on the MPS backend and
// checks the result against the CPU reference backend within 1e-4.
// The test returns early when the MPS backend does not offer CCV_NNC_CONVOLUTION_FORWARD.
TEST_CASE("mps forward convolution 3d via mfa conv3d with padding on small spatial dimensions")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 5;
  const int input_width = 5;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Depth is unpadded; height and width grow by their padding before the kernel is applied.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a is the host input, b receives the CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hand-built hint: stride 1 on all three axes, zero border on axis 0 and the padding
  // constants on axes 1 (top/bottom) and 2 (left/right).
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // Fixed seed keeps the inputs reproducible; values are shifted by -0.5 and the weights
  // are additionally scaled by 0.1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 7);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: ga/gw mirror a/w, gwo holds the weights in NCHW order
  // (output, input, depth, height, width), gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the host input and weights into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-lay the weights from gw into gwo on the GPU stream; gw is not needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably marks the algorithm as unselected before autotune - confirm.
  cmd.algorithm = -1;
  // NOTE(review): the second argument looks like a 1 GiB workspace limit - confirm.
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Keep the execution outside assert() so it still runs when NDEBUG compiles assertions away.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host as c.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Produce the reference result with the same command and hint on the CPU backend.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1366
1367
// Runs a padded 3x3x3 3D convolution with a per-output-channel bias on the MPS backend and
// checks the result against the CPU reference backend within 1e-4.
// The test returns early when the MPS backend does not offer CCV_NNC_CONVOLUTION_FORWARD.
TEST_CASE("mps forward convolution 3d via mfa conv3d with padding and bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 9;
  const int input_width = 9;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Depth is unpadded; height and width grow by their padding before the kernel is applied.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a is the host input, b receives the CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hand-built hint: stride 1 on all three axes, zero border on axis 0 and the padding
  // constants on axes 1 (top/bottom) and 2 (left/right).
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels), 0);
  // Fixed seed keeps the inputs reproducible; values are shifted by -0.5 and the weights
  // and bias are additionally scaled by 0.1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 19);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  for (i = 0; i < output_channels; i++)
    bias->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: ga/gw/gbias mirror a/w/bias, gwo holds the weights in NCHW order
  // (output, input, depth, height, width), gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy the host input, weights and bias into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // Re-lay the weights from gw into gwo on the GPU stream; gw is not needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably marks the algorithm as unselected before autotune - confirm.
  cmd.algorithm = -1;
  // NOTE(review): the second argument looks like a 1 GiB workspace limit - confirm.
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Keep the execution outside assert() so it still runs when NDEBUG compiles assertions away.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host as c.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  // Produce the reference result with the same command and hint on the CPU backend.
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1450
1451
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on tile boundary and bias")
{
  // Convolves a 5x8x8 (depth x height x width) input with a 3x3x3 kernel plus bias on the MPS
  // backend and requires the output to match the CPU reference backend within 1e-4.
  // Returns early when ccv_nnc_cmd_ok does not report MPS support for convolution forward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 8;
  const int input_width = 8;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Stride is 1 on every axis; depth is unpadded, height and width get one point on each side.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a: CPU input, b: CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hint indices 0 / 1 / 2 carry the depth / height / width settings.
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels), 0);
  // A fixed seed keeps the generated input, weights and bias identical from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 20);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  for (i = 0; i < output_channels; i++)
    bias->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: gw receives the weights as uploaded (NHWC), gwo is the NCHW copy that is
  // handed to the convolution, gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy input, weights and bias from the CPU tensors into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // Re-layout the weights on the GPU; gw is no longer needed once gwo is filled.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably means "no algorithm chosen yet" so that autotune selects one;
  // the 1 GiB value is the size budget argument passed to autotune -- confirm against the NNC API.
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): when NDEBUG is defined the assert expression is discarded, and
  // the convolution must still run for the comparison below to be meaningful.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // avoids an unused-variable warning when assert() compiles to nothing
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS result back to the CPU, then compute the reference with the same cmd and hint.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1534
1535
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on tile boundary")
{
  // Convolves a 5x8x8 (depth x height x width) input with a 3x3x3 kernel, without bias, on the
  // MPS backend and requires the output to match the CPU reference backend within 1e-4.
  // Returns early when ccv_nnc_cmd_ok does not report MPS support for convolution forward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 8;
  const int input_width = 8;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Stride is 1 on every axis; depth is unpadded, height and width get one point on each side.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a: CPU input, b: CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hint indices 0 / 1 / 2 carry the depth / height / width settings.
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // A fixed seed keeps the generated input and weights identical from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 9);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: gw receives the weights as uploaded (NHWC), gwo is the NCHW copy that is
  // handed to the convolution, gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy input and weights from the CPU tensors into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-layout the weights on the GPU; gw is no longer needed once gwo is filled.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably means "no algorithm chosen yet" so that autotune selects one;
  // the 1 GiB value is the size budget argument passed to autotune -- confirm against the NNC API.
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): when NDEBUG is defined the assert expression is discarded, and
  // the convolution must still run for the comparison below to be meaningful.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // avoids an unused-variable warning when assert() compiles to nothing
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS result back to the CPU, then compute the reference with the same cmd and hint.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1612
1613
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on small rectangular spatial dimensions")
{
  // Convolves a 5x6x7 (depth x height x width) input with a 3x3x3 kernel, without bias, on the
  // MPS backend and requires the output to match the CPU reference backend within 1e-4.
  // Returns early when ccv_nnc_cmd_ok does not report MPS support for convolution forward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 6;
  const int input_width = 7;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Stride is 1 on every axis; depth is unpadded, height and width get one point on each side.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a: CPU input, b: CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hint indices 0 / 1 / 2 carry the depth / height / width settings.
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // A fixed seed keeps the generated input and weights identical from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 10);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: gw receives the weights as uploaded (NHWC), gwo is the NCHW copy that is
  // handed to the convolution, gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy input and weights from the CPU tensors into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-layout the weights on the GPU; gw is no longer needed once gwo is filled.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably means "no algorithm chosen yet" so that autotune selects one;
  // the 1 GiB value is the size budget argument passed to autotune -- confirm against the NNC API.
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): when NDEBUG is defined the assert expression is discarded, and
  // the convolution must still run for the comparison below to be meaningful.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // avoids an unused-variable warning when assert() compiles to nothing
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS result back to the CPU, then compute the reference with the same cmd and hint.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1690
1691
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on 9x9 spatial dimensions")
{
  // Convolves a 5x9x9 (depth x height x width) input with a 3x3x3 kernel, without bias, on the
  // MPS backend and requires the output to match the CPU reference backend within 1e-4.
  // Returns early when ccv_nnc_cmd_ok does not report MPS support for convolution forward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 9;
  const int input_width = 9;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Stride is 1 on every axis; depth is unpadded, height and width get one point on each side.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a: CPU input, b: CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hint indices 0 / 1 / 2 carry the depth / height / width settings.
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // A fixed seed keeps the generated input and weights identical from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 11);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: gw receives the weights as uploaded (NHWC), gwo is the NCHW copy that is
  // handed to the convolution, gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy input and weights from the CPU tensors into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-layout the weights on the GPU; gw is no longer needed once gwo is filled.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably means "no algorithm chosen yet" so that autotune selects one;
  // the 1 GiB value is the size budget argument passed to autotune -- confirm against the NNC API.
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): when NDEBUG is defined the assert expression is discarded, and
  // the convolution must still run for the comparison below to be meaningful.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // avoids an unused-variable warning when assert() compiles to nothing
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS result back to the CPU, then compute the reference with the same cmd and hint.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1768
1769
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on 8x9 spatial dimensions")
{
  // Convolves a 5x8x9 (depth x height x width) input with a 3x3x3 kernel, without bias, on the
  // MPS backend and requires the output to match the CPU reference backend within 1e-4.
  // Returns early when ccv_nnc_cmd_ok does not report MPS support for convolution forward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 8;
  const int input_width = 9;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Stride is 1 on every axis; depth is unpadded, height and width get one point on each side.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a: CPU input, b: CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hint indices 0 / 1 / 2 carry the depth / height / width settings.
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // A fixed seed keeps the generated input and weights identical from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 12);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: gw receives the weights as uploaded (NHWC), gwo is the NCHW copy that is
  // handed to the convolution, gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy input and weights from the CPU tensors into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-layout the weights on the GPU; gw is no longer needed once gwo is filled.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably means "no algorithm chosen yet" so that autotune selects one;
  // the 1 GiB value is the size budget argument passed to autotune -- confirm against the NNC API.
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): when NDEBUG is defined the assert expression is discarded, and
  // the convolution must still run for the comparison below to be meaningful.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // avoids an unused-variable warning when assert() compiles to nothing
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS result back to the CPU, then compute the reference with the same cmd and hint.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1846
1847
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on 9x8 spatial dimensions")
{
  // Convolves a 5x9x8 (depth x height x width) input with a 3x3x3 kernel, without bias, on the
  // MPS backend and requires the output to match the CPU reference backend within 1e-4.
  // Returns early when ccv_nnc_cmd_ok does not report MPS support for convolution forward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int batch_size = 2;
  const int input_channels = 16;
  const int output_channels = 32;
  const int input_depth = 5;
  const int input_height = 9;
  const int input_width = 8;
  const int kernel_depth = 3;
  const int kernel_height = 3;
  const int kernel_width = 3;
  const int padding_top = 1;
  const int padding_bottom = 1;
  const int padding_left = 1;
  const int padding_right = 1;
  // Stride is 1 on every axis; depth is unpadded, height and width get one point on each side.
  const int output_depth = input_depth - kernel_depth + 1;
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
  // a: CPU input, b: CPU reference output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
  // Hint indices 0 / 1 / 2 carry the depth / height / width settings.
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
  hint.stride.dim[0] = 1;
  hint.stride.dim[1] = 1;
  hint.stride.dim[2] = 1;
  hint.border.begin[0] = 0;
  hint.border.end[0] = 0;
  hint.border.begin[1] = padding_top;
  hint.border.end[1] = padding_bottom;
  hint.border.begin[2] = padding_left;
  hint.border.end[2] = padding_right;
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  // A fixed seed keeps the generated input and weights identical from run to run.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 13);
  int i;
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
  // GPU-side tensors: gw receives the weights as uploaded (NHWC), gwo is the NCHW copy that is
  // handed to the convolution, gc receives the MPS output.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  // Copy input and weights from the CPU tensors into their GPU counterparts.
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
  // Re-layout the weights on the GPU; gw is no longer needed once gwo is filled.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  // NOTE(review): -1 presumably means "no algorithm chosen yet" so that autotune selects one;
  // the 1 GiB value is the size budget argument passed to autotune -- confirm against the NNC API.
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): when NDEBUG is defined the assert expression is discarded, and
  // the convolution must still run for the comparison below to be meaningful.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // avoids an unused-variable warning when assert() compiles to nothing
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS result back to the CPU, then compute the reference with the same cmd and hint.
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
1924
1925
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on 17x19 spatial dimensions")
1926
1
{
1927
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
1928
0
  const int batch_size = 2;
1929
0
  const int input_channels = 16;
1930
0
  const int output_channels = 32;
1931
0
  const int input_depth = 5;
1932
0
  const int input_height = 17;
1933
0
  const int input_width = 19;
1934
0
  const int kernel_depth = 3;
1935
0
  const int kernel_height = 3;
1936
0
  const int kernel_width = 3;
1937
0
  const int padding_top = 1;
1938
0
  const int padding_bottom = 1;
1939
0
  const int padding_left = 1;
1940
0
  const int padding_right = 1;
1941
0
  const int output_depth = input_depth - kernel_depth + 1;
1942
0
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
1943
0
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
1944
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
1945
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
1946
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
1947
0
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
1948
0
  hint.stride.dim[0] = 1;
1949
0
  hint.stride.dim[1] = 1;
1950
0
  hint.stride.dim[2] = 1;
1951
0
  hint.border.begin[0] = 0;
1952
0
  hint.border.end[0] = 0;
1953
0
  hint.border.begin[1] = padding_top;
1954
0
  hint.border.end[1] = padding_bottom;
1955
0
  hint.border.begin[2] = padding_left;
1956
0
  hint.border.end[2] = padding_right;
1957
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
1958
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
1959
0
  dsfmt_t dsfmt;
1960
0
  dsfmt_init_gen_rand(&dsfmt, 17);
1961
0
  int i;
1962
0
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
1963
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
1964
0
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
1965
0
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
1966
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
1967
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
1968
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
1969
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
1970
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
1971
0
  move.backend = CCV_NNC_BACKEND_MPS;
1972
0
  assert(move.backend >= 0);
1973
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
1974
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
1975
0
  transform.backend = CCV_NNC_BACKEND_MPS;
1976
0
  assert(transform.backend >= 0);
1977
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
1978
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
1979
0
  ccv_nnc_stream_context_wait(stream_context);
1980
0
  ccv_nnc_tensor_free(gw);
1981
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
1982
0
  assert(cmd.backend >= 0);
1983
0
  cmd.algorithm = -1;
1984
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
1985
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context));
1986
0
  ccv_nnc_stream_context_wait(stream_context);
1987
0
  ccv_nnc_stream_context_free(stream_context);
1988
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
1989
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1990
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
1991
0
  assert(cmd.backend >= 0);
1992
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1993
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
1994
0
  ccv_nnc_tensor_free(c);
1995
0
  ccv_nnc_tensor_free(gc);
1996
0
  ccv_nnc_tensor_free(w);
1997
0
  ccv_nnc_tensor_free(b);
1998
0
  ccv_nnc_tensor_free(a);
1999
0
  ccv_nnc_tensor_free(gwo);
2000
0
  ccv_nnc_tensor_free(ga);
2001
0
}
2002
2003
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on 33x35 spatial dimensions")
2004
1
{
2005
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
2006
0
  const int batch_size = 2;
2007
0
  const int input_channels = 16;
2008
0
  const int output_channels = 32;
2009
0
  const int input_depth = 4;
2010
0
  const int input_height = 33;
2011
0
  const int input_width = 35;
2012
0
  const int kernel_depth = 3;
2013
0
  const int kernel_height = 3;
2014
0
  const int kernel_width = 3;
2015
0
  const int padding_top = 1;
2016
0
  const int padding_bottom = 1;
2017
0
  const int padding_left = 1;
2018
0
  const int padding_right = 1;
2019
0
  const int output_depth = input_depth - kernel_depth + 1;
2020
0
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
2021
0
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
2022
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2023
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2024
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
2025
0
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
2026
0
  hint.stride.dim[0] = 1;
2027
0
  hint.stride.dim[1] = 1;
2028
0
  hint.stride.dim[2] = 1;
2029
0
  hint.border.begin[0] = 0;
2030
0
  hint.border.end[0] = 0;
2031
0
  hint.border.begin[1] = padding_top;
2032
0
  hint.border.end[1] = padding_bottom;
2033
0
  hint.border.begin[2] = padding_left;
2034
0
  hint.border.end[2] = padding_right;
2035
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
2036
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2037
0
  dsfmt_t dsfmt;
2038
0
  dsfmt_init_gen_rand(&dsfmt, 18);
2039
0
  int i;
2040
0
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
2041
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
2042
0
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
2043
0
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
2044
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2045
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2046
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
2047
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2048
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
2049
0
  move.backend = CCV_NNC_BACKEND_MPS;
2050
0
  assert(move.backend >= 0);
2051
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
2052
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
2053
0
  transform.backend = CCV_NNC_BACKEND_MPS;
2054
0
  assert(transform.backend >= 0);
2055
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2056
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
2057
0
  ccv_nnc_stream_context_wait(stream_context);
2058
0
  ccv_nnc_tensor_free(gw);
2059
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2060
0
  assert(cmd.backend >= 0);
2061
0
  cmd.algorithm = -1;
2062
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
2063
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context));
2064
0
  ccv_nnc_stream_context_wait(stream_context);
2065
0
  ccv_nnc_stream_context_free(stream_context);
2066
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2067
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
2068
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2069
0
  assert(cmd.backend >= 0);
2070
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2071
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
2072
0
  ccv_nnc_tensor_free(c);
2073
0
  ccv_nnc_tensor_free(gc);
2074
0
  ccv_nnc_tensor_free(w);
2075
0
  ccv_nnc_tensor_free(b);
2076
0
  ccv_nnc_tensor_free(a);
2077
0
  ccv_nnc_tensor_free(gwo);
2078
0
  ccv_nnc_tensor_free(ga);
2079
0
}
2080
2081
TEST_CASE("mps forward convolution 3d via mfa conv3d with one point padding on 9x65 spatial dimensions")
2082
1
{
2083
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
2084
0
  const int batch_size = 2;
2085
0
  const int input_channels = 16;
2086
0
  const int output_channels = 32;
2087
0
  const int input_depth = 4;
2088
0
  const int input_height = 9;
2089
0
  const int input_width = 65;
2090
0
  const int kernel_depth = 3;
2091
0
  const int kernel_height = 3;
2092
0
  const int kernel_width = 3;
2093
0
  const int padding_top = 1;
2094
0
  const int padding_bottom = 1;
2095
0
  const int padding_left = 1;
2096
0
  const int padding_right = 1;
2097
0
  const int output_depth = input_depth - kernel_depth + 1;
2098
0
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
2099
0
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
2100
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2101
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2102
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
2103
0
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
2104
0
  hint.stride.dim[0] = 1;
2105
0
  hint.stride.dim[1] = 1;
2106
0
  hint.stride.dim[2] = 1;
2107
0
  hint.border.begin[0] = 0;
2108
0
  hint.border.end[0] = 0;
2109
0
  hint.border.begin[1] = padding_top;
2110
0
  hint.border.end[1] = padding_bottom;
2111
0
  hint.border.begin[2] = padding_left;
2112
0
  hint.border.end[2] = padding_right;
2113
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
2114
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2115
0
  dsfmt_t dsfmt;
2116
0
  dsfmt_init_gen_rand(&dsfmt, 19);
2117
0
  int i;
2118
0
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
2119
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
2120
0
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
2121
0
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
2122
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2123
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2124
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
2125
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2126
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
2127
0
  move.backend = CCV_NNC_BACKEND_MPS;
2128
0
  assert(move.backend >= 0);
2129
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(ga, gw), 0);
2130
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
2131
0
  transform.backend = CCV_NNC_BACKEND_MPS;
2132
0
  assert(transform.backend >= 0);
2133
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2134
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
2135
0
  ccv_nnc_stream_context_wait(stream_context);
2136
0
  ccv_nnc_tensor_free(gw);
2137
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2138
0
  assert(cmd.backend >= 0);
2139
0
  cmd.algorithm = -1;
2140
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context);
2141
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo), TENSOR_LIST(gc), stream_context));
2142
0
  ccv_nnc_stream_context_wait(stream_context);
2143
0
  ccv_nnc_stream_context_free(stream_context);
2144
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2145
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
2146
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2147
0
  assert(cmd.backend >= 0);
2148
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2149
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
2150
0
  ccv_nnc_tensor_free(c);
2151
0
  ccv_nnc_tensor_free(gc);
2152
0
  ccv_nnc_tensor_free(w);
2153
0
  ccv_nnc_tensor_free(b);
2154
0
  ccv_nnc_tensor_free(a);
2155
0
  ccv_nnc_tensor_free(gwo);
2156
0
  ccv_nnc_tensor_free(ga);
2157
0
}
2158
2159
TEST_CASE("mps forward convolution 3d via mfa conv3d with partial output channel tile and bias")
2160
1
{
2161
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
2162
0
  const int batch_size = 2;
2163
0
  const int input_channels = 16;
2164
0
  const int output_channels = 16;
2165
0
  const int input_depth = 4;
2166
0
  const int input_height = 33;
2167
0
  const int input_width = 35;
2168
0
  const int kernel_depth = 3;
2169
0
  const int kernel_height = 3;
2170
0
  const int kernel_width = 3;
2171
0
  const int padding_top = 1;
2172
0
  const int padding_bottom = 1;
2173
0
  const int padding_left = 1;
2174
0
  const int padding_right = 1;
2175
0
  const int output_depth = input_depth - kernel_depth + 1;
2176
0
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
2177
0
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
2178
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2179
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2180
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
2181
0
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
2182
0
  hint.stride.dim[0] = 1;
2183
0
  hint.stride.dim[1] = 1;
2184
0
  hint.stride.dim[2] = 1;
2185
0
  hint.border.begin[0] = 0;
2186
0
  hint.border.end[0] = 0;
2187
0
  hint.border.begin[1] = padding_top;
2188
0
  hint.border.end[1] = padding_bottom;
2189
0
  hint.border.begin[2] = padding_left;
2190
0
  hint.border.end[2] = padding_right;
2191
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
2192
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2193
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels), 0);
2194
0
  dsfmt_t dsfmt;
2195
0
  dsfmt_init_gen_rand(&dsfmt, 21);
2196
0
  int i;
2197
0
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
2198
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
2199
0
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
2200
0
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
2201
0
  for (i = 0; i < output_channels; i++)
2202
0
    bias->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
2203
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2204
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2205
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
2206
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels), 0);
2207
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2208
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
2209
0
  move.backend = CCV_NNC_BACKEND_MPS;
2210
0
  assert(move.backend >= 0);
2211
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
2212
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
2213
0
  transform.backend = CCV_NNC_BACKEND_MPS;
2214
0
  assert(transform.backend >= 0);
2215
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2216
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
2217
0
  ccv_nnc_stream_context_wait(stream_context);
2218
0
  ccv_nnc_tensor_free(gw);
2219
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2220
0
  assert(cmd.backend >= 0);
2221
0
  cmd.algorithm = -1;
2222
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
2223
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
2224
0
  ccv_nnc_stream_context_wait(stream_context);
2225
0
  ccv_nnc_stream_context_free(stream_context);
2226
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2227
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
2228
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2229
0
  assert(cmd.backend >= 0);
2230
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2231
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
2232
0
  ccv_nnc_tensor_free(c);
2233
0
  ccv_nnc_tensor_free(gc);
2234
0
  ccv_nnc_tensor_free(bias);
2235
0
  ccv_nnc_tensor_free(w);
2236
0
  ccv_nnc_tensor_free(b);
2237
0
  ccv_nnc_tensor_free(a);
2238
0
  ccv_nnc_tensor_free(gbias);
2239
0
  ccv_nnc_tensor_free(gwo);
2240
0
  ccv_nnc_tensor_free(ga);
2241
0
}
2242
2243
TEST_CASE("mps forward convolution 3d via mfa conv3d with 48 channels and partial output channel tile")
2244
1
{
2245
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
2246
0
  const int batch_size = 2;
2247
0
  const int input_channels = 48;
2248
0
  const int output_channels = 48;
2249
0
  const int input_depth = 4;
2250
0
  const int input_height = 33;
2251
0
  const int input_width = 35;
2252
0
  const int kernel_depth = 3;
2253
0
  const int kernel_height = 3;
2254
0
  const int kernel_width = 3;
2255
0
  const int padding_top = 1;
2256
0
  const int padding_bottom = 1;
2257
0
  const int padding_left = 1;
2258
0
  const int padding_right = 1;
2259
0
  const int output_depth = input_depth - kernel_depth + 1;
2260
0
  const int output_height = input_height + padding_top + padding_bottom - kernel_height + 1;
2261
0
  const int output_width = input_width + padding_left + padding_right - kernel_width + 1;
2262
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2263
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2264
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, output_channels, kernel_depth, kernel_height, kernel_width, input_channels);
2265
0
  ccv_nnc_hint_t hint = ccv_nnc_no_hint;
2266
0
  hint.stride.dim[0] = 1;
2267
0
  hint.stride.dim[1] = 1;
2268
0
  hint.stride.dim[2] = 1;
2269
0
  hint.border.begin[0] = 0;
2270
0
  hint.border.end[0] = 0;
2271
0
  hint.border.begin[1] = padding_top;
2272
0
  hint.border.end[1] = padding_bottom;
2273
0
  hint.border.begin[2] = padding_left;
2274
0
  hint.border.end[2] = padding_right;
2275
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
2276
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2277
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, output_channels), 0);
2278
0
  dsfmt_t dsfmt;
2279
0
  dsfmt_init_gen_rand(&dsfmt, 23);
2280
0
  int i;
2281
0
  for (i = 0; i < batch_size * input_depth * input_height * input_width * input_channels; i++)
2282
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
2283
0
  for (i = 0; i < output_channels * kernel_depth * kernel_height * kernel_width * input_channels; i++)
2284
0
    w->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
2285
0
  for (i = 0; i < output_channels; i++)
2286
0
    bias->data.f32[i] = (dsfmt_genrand_open_close(&dsfmt) - 0.5) * 0.1;
2287
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, input_depth, input_height, input_width, input_channels), 0);
2288
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels, kernel_depth, kernel_height, kernel_width, input_channels), 0);
2289
0
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, output_channels, input_channels, kernel_depth, kernel_height, kernel_width), 0);
2290
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, output_channels), 0);
2291
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2292
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
2293
0
  move.backend = CCV_NNC_BACKEND_MPS;
2294
0
  assert(move.backend >= 0);
2295
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
2296
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
2297
0
  transform.backend = CCV_NNC_BACKEND_MPS;
2298
0
  assert(transform.backend >= 0);
2299
0
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
2300
0
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
2301
0
  ccv_nnc_stream_context_wait(stream_context);
2302
0
  ccv_nnc_tensor_free(gw);
2303
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2304
0
  assert(cmd.backend >= 0);
2305
0
  cmd.algorithm = -1;
2306
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
2307
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
2308
0
  ccv_nnc_stream_context_wait(stream_context);
2309
0
  ccv_nnc_stream_context_free(stream_context);
2310
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, batch_size, output_depth, output_height, output_width, output_channels), 0);
2311
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
2312
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2313
0
  assert(cmd.backend >= 0);
2314
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2315
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, batch_size * output_depth * output_height * output_width * output_channels, 1e-4, "output from mps should match from CPU");
2316
0
  ccv_nnc_tensor_free(c);
2317
0
  ccv_nnc_tensor_free(gc);
2318
0
  ccv_nnc_tensor_free(bias);
2319
0
  ccv_nnc_tensor_free(w);
2320
0
  ccv_nnc_tensor_free(b);
2321
0
  ccv_nnc_tensor_free(a);
2322
0
  ccv_nnc_tensor_free(gbias);
2323
0
  ccv_nnc_tensor_free(gwo);
2324
0
  ccv_nnc_tensor_free(ga);
2325
0
}
2326
2327
TEST_CASE("mps forward convolution 3d in nchw format")
2328
1
{
2329
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_MPS));
2330
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
2331
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
2332
0
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
2333
0
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
2334
0
  hint.stride.dim[0] = 2;
2335
0
  hint.border.begin[0] = 1;
2336
0
  hint.border.end[0] = 1;
2337
0
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
2338
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
2339
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM), 0);
2340
  // configure the inlets.
2341
0
  dsfmt_t dsfmt;
2342
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2343
0
  int i;
2344
0
  for (i = 0; i < 3 * INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
2345
0
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
2346
0
  for (i = 0; i < 5 * INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
2347
0
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2348
0
  for (i = 0; i < OUTPUT_DIM; i++)
2349
0
    bias->data.f32[i] = (float)i / OUTPUT_DIM;
2350
  // Copy generated matrix values over to GPU.
2351
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, 5, INPUT_SIZE, INPUT_SIZE), 0);
2352
0
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, 3, KERNEL_SIZE, KERNEL_SIZE), 0);
2353
0
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM), 0);
2354
0
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
2355
0
  move.backend = CCV_NNC_BACKEND_MPS;
2356
0
  assert(move.backend >= 0);
2357
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
2358
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
2359
2360
0
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
2361
0
  transform.backend = CCV_NNC_BACKEND_MPS;
2362
0
  assert(transform.backend >= 0);
2363
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
2364
0
  assert(cmd.backend >= 0);
2365
0
  cmd.algorithm = -1;
2366
0
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
2367
0
  assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0));
2368
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, 3, OUTPUT_SIZE, OUTPUT_SIZE), 0);
2369
0
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
2370
0
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
2371
0
  assert(cmd.backend >= 0);
2372
0
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2373
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * 3 * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 1e-4, "output from mps should match from CPU");
2374
0
  ccv_nnc_tensor_free(c);
2375
0
  ccv_nnc_tensor_free(gc);
2376
0
  ccv_nnc_tensor_free(bias);
2377
0
  ccv_nnc_tensor_free(w);
2378
0
  ccv_nnc_tensor_free(b);
2379
0
  ccv_nnc_tensor_free(a);
2380
0
  ccv_nnc_tensor_free(gbias);
2381
0
  ccv_nnc_tensor_free(gw);
2382
0
  ccv_nnc_tensor_free(ga);
2383
0
}
2384
2385
TEST_CASE("compare softmax with mps")
2386
1
{
2387
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS));
2388
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2389
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
2390
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
2391
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
2392
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2393
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2394
0
  ccv_nnc_graph_t* graph = 0;
2395
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2396
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2397
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2398
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2399
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2400
0
  dsfmt_t dsfmt;
2401
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2402
0
  int i;
2403
0
  for (i = 0; i < 20 * 10; i++)
2404
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2405
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
2406
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
2407
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2408
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2409
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
2410
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
2411
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2412
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
2413
0
  REQUIRE_TENSOR_EQ(ty, y_tensor, "softmax from mps should match from CPU");
2414
0
  ccv_nnc_tensor_free(x_tensor);
2415
0
  ccv_nnc_tensor_free(y_tensor);
2416
0
  ccv_nnc_tensor_free(ty);
2417
0
  ccv_nnc_graph_free(graph);
2418
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2419
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2420
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2421
0
}
2422
2423
// Compares softmax forward computed through a compiled graph on half-precision
// (16F) GPU tensors against CMD_SOFTMAX_FORWARD executed on 32F CPU tensors,
// allowing a 1e-3 difference over all 20 * 10 elements.
// The guard checks ccv_nnc_cmd_ok() for softmax forward on CCV_NNC_BACKEND_MPS
// (presumably returning early when it is unavailable); the hit counts recorded
// here are 1 for the guard line and 0 for every statement after it.
TEST_CASE("compare softmax with mps in half precision")
2424
1
{
2425
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with a single softmax node reading `a` and writing `b`, both declared as 16F 20x10 GPU tensors.
2426
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2427
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
2428
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
2429
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "softmax");
2430
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2431
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  // Compile into a concrete graph plus the arenas that hold its tensors and execs.
2432
0
  ccv_nnc_graph_t* graph = 0;
2433
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2434
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2435
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2436
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // 32F host input filled from a dSFMT generator seeded with 0.
2437
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2438
0
  dsfmt_t dsfmt;
2439
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2440
0
  int i;
2441
0
  for (i = 0; i < 20 * 10; i++)
2442
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Convert the input to 16F on the host, then copy it into the tensor backing symbol `a`.
2443
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
2444
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
2445
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2446
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
2447
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Read `b` back as 16F and convert it to 32F so it can be compared with the 32F reference.
2448
0
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
2449
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2450
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
2451
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
2452
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
  // Reference result: softmax on the original 32F input (x_tensor, not the 16F copy).
2453
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2454
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
2455
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "softmax from mps should match from CPU");
  // Release the host tensors created here, then the graph, its arenas and the symbolic graph.
2456
0
  ccv_nnc_tensor_free(x_tensor);
2457
0
  ccv_nnc_tensor_free(x16_tensor);
2458
0
  ccv_nnc_tensor_free(y16_tensor);
2459
0
  ccv_nnc_tensor_free(y_tensor);
2460
0
  ccv_nnc_tensor_free(ty);
2461
0
  ccv_nnc_graph_free(graph);
2462
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2463
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2464
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2465
0
}
2466
2467
// Compares softmax forward and backward results from a compiled graph on 32F
// 10x100 GPU tensors against CMD_SOFTMAX_FORWARD / CMD_SOFTMAX_BACKWARD executed
// on CPU tensors. The guard checks ccv_nnc_cmd_ok() for both softmax forward and
// backward on CCV_NNC_BACKEND_MPS (presumably returning early otherwise); the
// hit counts recorded here are 1 for the guard lines and 0 for everything after.
TEST_CASE("compare softmax gradient with mps")
2468
1
{
2469
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_MPS) &&
2470
1
    ccv_nnc_cmd_ok(CCV_NNC_SOFTMAX_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Forward graph: y = softmax(x).
2471
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2472
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
2473
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
2474
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SOFTMAX_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "softmax");
2475
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Add the backward pass (gradient of y with respect to x) to the same symbolic graph.
2476
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2477
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2478
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  // Gradient symbols that the backward pass associated with y and x.
2479
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
2480
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
  // 32F host input filled from a dSFMT generator seeded with 0.
2481
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2482
0
  dsfmt_t dsfmt;
2483
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2484
0
  int i;
2485
0
  for (i = 0; i < 10 * 100; i++)
2486
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Incoming gradient: all zeros except a 1 at flat index i * 100 + i for i in 0..9.
2487
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2488
0
  for (i = 0; i < 10 * 100; i++)
2489
0
    dy_tensor->data.f32[i] = 0;
2490
0
  for (i = 0; i < 10; i++)
2491
0
    dy_tensor->data.f32[i * 100 + i] = 1;
  // GPU copy of the incoming gradient; it is bound to the dy symbol at compile time through TENSOR_BIND_MAP.
2492
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2493
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
2494
0
  ccv_nnc_graph_t* graph = 0;
2495
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2496
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2497
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2498
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Upload x into the tensor backing symbol `x`, then run forward and backward in one graph run.
2499
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2500
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2501
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the graph's dx and y back into host tensors.
2502
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2503
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
2504
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2505
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2506
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
2507
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
  // Reference forward pass on the host input.
2508
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2509
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
2510
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
  // Reference backward pass; its inputs are (dy, 0, y) with the reference forward output used as y.
2511
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2512
0
  ccv_nnc_cmd_exec(CMD_SOFTMAX_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
2513
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
  // Release the tensors created here (including the bound GPU tensor dyt), then the graph objects.
2514
0
  ccv_nnc_tensor_free(x_tensor);
2515
0
  ccv_nnc_tensor_free(y_tensor);
2516
0
  ccv_nnc_tensor_free(dx_tensor);
2517
0
  ccv_nnc_tensor_free(dy_tensor);
2518
0
  ccv_nnc_tensor_free(ty_tensor);
2519
0
  ccv_nnc_tensor_free(tdx_tensor);
2520
0
  ccv_nnc_tensor_free(dyt);
2521
0
  ccv_nnc_graph_free(graph);
2522
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2523
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2524
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2525
0
}
2526
2527
// Compares sigmoid forward computed through a compiled graph on 32F 20x10 GPU
// tensors against CMD_SIGMOID_FORWARD executed on CPU tensors, using
// REQUIRE_TENSOR_EQ. The guard checks ccv_nnc_cmd_ok() for sigmoid forward on
// CCV_NNC_BACKEND_MPS (presumably returning early otherwise); the hit counts
// recorded here are 1 for the guard line and 0 for everything after it.
TEST_CASE("compare sigmoid with mps")
2528
1
{
2529
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with a single sigmoid node reading `a` and writing `b`.
2530
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2531
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
2532
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
2533
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
2534
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2535
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  // Compile into a concrete graph plus the arenas that hold its tensors and execs.
2536
0
  ccv_nnc_graph_t* graph = 0;
2537
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2538
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2539
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2540
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // 32F host input filled from a dSFMT generator seeded with 0.
2541
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2542
0
  dsfmt_t dsfmt;
2543
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2544
0
  int i;
2545
0
  for (i = 0; i < 20 * 10; i++)
2546
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Copy the input into the tensor backing symbol `a`, run the graph, and copy `b` back to the host.
2547
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
2548
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
2549
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2550
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2551
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
2552
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y_tensor), 0);
  // Reference result: sigmoid executed directly on the host tensors.
2553
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2554
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
2555
0
  REQUIRE_TENSOR_EQ(ty, y_tensor, "sigmoid from mps should match from CPU");
  // Release the host tensors created here, then the graph, its arenas and the symbolic graph.
2556
0
  ccv_nnc_tensor_free(x_tensor);
2557
0
  ccv_nnc_tensor_free(y_tensor);
2558
0
  ccv_nnc_tensor_free(ty);
2559
0
  ccv_nnc_graph_free(graph);
2560
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2561
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2562
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2563
0
}
2564
2565
// Compares sigmoid forward computed through a compiled graph on half-precision
// (16F) GPU tensors against CMD_SIGMOID_FORWARD executed on 32F CPU tensors,
// allowing a 1e-3 difference over all 20 * 10 elements.
// The guard checks ccv_nnc_cmd_ok() for sigmoid forward on CCV_NNC_BACKEND_MPS
// (presumably returning early otherwise); the hit counts recorded here are 1
// for the guard line and 0 for every statement after it.
TEST_CASE("compare sigmoid with mps in half precision")
2566
1
{
2567
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with a single sigmoid node reading `a` and writing `b`, both declared as 16F 20x10 GPU tensors.
2568
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2569
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
2570
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
2571
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(a), TENSOR_SYMBOL_LIST(b), "sigmoid");
2572
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2573
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  // Compile into a concrete graph plus the arenas that hold its tensors and execs.
2574
0
  ccv_nnc_graph_t* graph = 0;
2575
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2576
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2577
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2578
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // 32F host input filled from a dSFMT generator seeded with 0.
2579
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2580
0
  dsfmt_t dsfmt;
2581
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2582
0
  int i;
2583
0
  for (i = 0; i < 20 * 10; i++)
2584
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Convert the input to 16F on the host, then copy it into the tensor backing symbol `a`.
2585
0
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
2586
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
2587
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2588
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
2589
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Read `b` back as 16F and convert it to 32F so it can be compared with the 32F reference.
2590
0
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
2591
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2592
0
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
2593
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b_tensor), TENSOR_LIST(y16_tensor), 0);
2594
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(y_tensor), 0);
  // Reference result: sigmoid on the original 32F input (x_tensor, not the 16F copy).
2595
0
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
2596
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty), 0);
2597
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "sigmoid from mps should match from CPU");
  // Release the host tensors created here, then the graph, its arenas and the symbolic graph.
2598
0
  ccv_nnc_tensor_free(x_tensor);
2599
0
  ccv_nnc_tensor_free(x16_tensor);
2600
0
  ccv_nnc_tensor_free(y16_tensor);
2601
0
  ccv_nnc_tensor_free(y_tensor);
2602
0
  ccv_nnc_tensor_free(ty);
2603
0
  ccv_nnc_graph_free(graph);
2604
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2605
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2606
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2607
0
}
2608
2609
// Runs CMD_SIGMOID_FORWARD directly (no graph) on GPU tensors and on CPU
// tensors for two element counts, 32 * 32 = 1024 and 17 * 61 = 1037, and
// requires the results to agree within 1e-6. The assertion messages label
// these as the "length % 1024 == 0" case and the "tail" case.
// Inputs cycle through -80, 80, -20, 20 for i % 6 in 0..3 and a random value
// (generator output * 12 - 6) for the remaining two residues.
// The guard checks ccv_nnc_cmd_ok() for sigmoid forward on CCV_NNC_BACKEND_MPS
// (presumably returning early otherwise); the hit counts recorded here are 1
// for the guard line and 0 for every statement after it.
TEST_CASE("compare sigmoid with mps and more vectorization cases")
2610
1
{
2611
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS));
  // One generator, seeded with 0, is shared by both cases below.
2612
0
  dsfmt_t dsfmt;
2613
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2614
0
  int i;
  // Case 0: 32x32. a0/b0 are GPU input/output, ha0 is the host input, hb0 receives the GPU result, tb0 the CPU reference.
2615
0
  ccv_nnc_tensor_t* const a0 = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 32, 32), 0);
2616
0
  ccv_nnc_tensor_t* const b0 = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 32, 32), 0);
2617
0
  ccv_nnc_tensor_t* const ha0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 32, 32), 0);
2618
0
  ccv_nnc_tensor_t* const hb0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 32, 32), 0);
2619
0
  ccv_nnc_tensor_t* const tb0 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 32, 32), 0);
2620
0
  for (i = 0; i < 32 * 32; i++)
2621
0
    switch (i % 6)
2622
0
    {
2623
0
      case 0:
2624
0
        ha0->data.f32[i] = -80;
2625
0
        break;
2626
0
      case 1:
2627
0
        ha0->data.f32[i] = 80;
2628
0
        break;
2629
0
      case 2:
2630
0
        ha0->data.f32[i] = -20;
2631
0
        break;
2632
0
      case 3:
2633
0
        ha0->data.f32[i] = 20;
2634
0
        break;
2635
0
      default:
2636
0
        ha0->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 12 - 6;
2637
0
        break;
2638
0
    }
  // Upload, run sigmoid on the GPU tensors and on the host tensors, download the GPU result, compare.
2639
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha0), TENSOR_LIST(a0), 0);
2640
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a0), TENSOR_LIST(b0), 0);
2641
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha0), TENSOR_LIST(tb0), 0);
2642
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b0), TENSOR_LIST(hb0), 0);
  // NOTE(review): "%%" suggests the message is used as part of a printf-style format inside the macro — confirm against case.h.
2643
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb0->data.f32, hb0->data.f32, 32 * 32, 1e-6, "sigmoid from mps should match from CPU for the length %% 1024 == 0 case");
2644
0
  ccv_nnc_tensor_free(a0);
2645
0
  ccv_nnc_tensor_free(b0);
2646
0
  ccv_nnc_tensor_free(ha0);
2647
0
  ccv_nnc_tensor_free(hb0);
2648
0
  ccv_nnc_tensor_free(tb0);
2649
  // Case 1: 17x61 (1037 elements, not a multiple of 1024), same roles as case 0 and the same input pattern.
2650
0
  ccv_nnc_tensor_t* const a1 = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 17, 61), 0);
2651
0
  ccv_nnc_tensor_t* const b1 = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 17, 61), 0);
2652
0
  ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 17, 61), 0);
2653
0
  ccv_nnc_tensor_t* const hb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 17, 61), 0);
2654
0
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 17, 61), 0);
2655
0
  for (i = 0; i < 17 * 61; i++)
2656
0
    switch (i % 6)
2657
0
    {
2658
0
      case 0:
2659
0
        ha1->data.f32[i] = -80;
2660
0
        break;
2661
0
      case 1:
2662
0
        ha1->data.f32[i] = 80;
2663
0
        break;
2664
0
      case 2:
2665
0
        ha1->data.f32[i] = -20;
2666
0
        break;
2667
0
      case 3:
2668
0
        ha1->data.f32[i] = 20;
2669
0
        break;
2670
0
      default:
2671
0
        ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 12 - 6;
2672
0
        break;
2673
0
    }
  // Upload, run sigmoid on the GPU tensors and on the host tensors, download the GPU result, compare.
2674
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1), TENSOR_LIST(a1), 0);
2675
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1), TENSOR_LIST(b1), 0);
2676
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1), TENSOR_LIST(tb1), 0);
2677
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b1), TENSOR_LIST(hb1), 0);
2678
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb1->data.f32, 17 * 61, 1e-6, "sigmoid from mps should match from CPU for the tail case");
2679
0
  ccv_nnc_tensor_free(a1);
2680
0
  ccv_nnc_tensor_free(b1);
2681
0
  ccv_nnc_tensor_free(ha1);
2682
0
  ccv_nnc_tensor_free(hb1);
2683
0
  ccv_nnc_tensor_free(tb1);
2684
0
}
2685
2686
2687
// Compares sigmoid forward and backward results from a compiled graph on 32F
// 10x100 GPU tensors against CMD_SIGMOID_FORWARD / CMD_SIGMOID_BACKWARD executed
// on CPU tensors. The guard checks ccv_nnc_cmd_ok() for both sigmoid forward and
// backward on CCV_NNC_BACKEND_MPS (presumably returning early otherwise); the
// hit counts recorded here are 1 for the guard lines and 0 for everything after.
TEST_CASE("compare sigmoid gradient with mps")
2688
1
{
2689
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_FORWARD, CCV_NNC_BACKEND_MPS) &&
2690
1
    ccv_nnc_cmd_ok(CCV_NNC_SIGMOID_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Forward graph: y = sigmoid(x).
2691
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2692
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
2693
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
2694
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_SIGMOID_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "sigmoid");
2695
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Add the backward pass (gradient of y with respect to x) to the same symbolic graph.
2696
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
2697
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2698
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  // Gradient symbols that the backward pass associated with y and x.
2699
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
2700
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
  // 32F host input filled from a dSFMT generator seeded with 0.
2701
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2702
0
  dsfmt_t dsfmt;
2703
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2704
0
  int i;
2705
0
  for (i = 0; i < 10 * 100; i++)
2706
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Incoming gradient: all zeros except a 1 at flat index i * 100 + i for i in 0..9.
2707
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2708
0
  for (i = 0; i < 10 * 100; i++)
2709
0
    dy_tensor->data.f32[i] = 0;
2710
0
  for (i = 0; i < 10; i++)
2711
0
    dy_tensor->data.f32[i * 100 + i] = 1;
  // GPU copy of the incoming gradient; it is bound to the dy symbol at compile time through TENSOR_BIND_MAP.
2712
0
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
2713
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);
2714
0
  ccv_nnc_graph_t* graph = 0;
2715
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2716
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  // NOTE(review): y is additionally passed to compile via TENSOR_SYMBOL_LIST(y), presumably to mark it as an output that must stay readable after the run — confirm against the compile API.
2717
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2718
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Upload x into the tensor backing symbol `x`, then run forward and backward in one graph run.
2719
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2720
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2721
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the graph's dx and y back into host tensors.
2722
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2723
0
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
2724
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2725
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2726
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
2727
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);
  // Reference forward pass on the host input.
2728
0
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2729
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
2730
0
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");
  // Reference backward pass; its inputs are (dy, 0, y) with the reference forward output used as y.
2731
0
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
2732
0
  ccv_nnc_cmd_exec(CMD_SIGMOID_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
2733
0
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");
  // Release the tensors created here (including the bound GPU tensor dyt), then the graph objects.
2734
0
  ccv_nnc_tensor_free(x_tensor);
2735
0
  ccv_nnc_tensor_free(y_tensor);
2736
0
  ccv_nnc_tensor_free(dx_tensor);
2737
0
  ccv_nnc_tensor_free(dy_tensor);
2738
0
  ccv_nnc_tensor_free(ty_tensor);
2739
0
  ccv_nnc_tensor_free(tdx_tensor);
2740
0
  ccv_nnc_tensor_free(dyt);
2741
0
  ccv_nnc_graph_free(graph);
2742
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2743
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2744
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2745
0
}
2746
2747
// Compares ReLU forward computed through a compiled graph on 32F 7x7x10 GPU
// tensors against CMD_RELU_FORWARD executed on CPU tensors, using
// REQUIRE_TENSOR_EQ. The guard checks ccv_nnc_cmd_ok() for ReLU forward on
// CCV_NNC_BACKEND_MPS (presumably returning early otherwise); the hit counts
// recorded here are 1 for the guard line and 0 for everything after it.
TEST_CASE("compare relu with mps")
2748
1
{
2749
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with a single relu node reading `x` and writing `y`.
2750
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2751
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
2752
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "y");
2753
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
2754
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph plus the arenas that hold its tensors and execs.
2755
0
  ccv_nnc_graph_t* graph = 0;
2756
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2757
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2758
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2759
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2760
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // 32F host input: generator output (seed 0) scaled by 2 and shifted by -1, so that both signs are presumably exercised.
2761
0
  dsfmt_t dsfmt;
2762
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2763
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2764
0
  int i;
2765
0
  for (i = 0; i < 7 * 7 * 10; i++)
2766
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  // Copy the input into the tensor backing symbol `x` and run the graph.
2767
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2768
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
2769
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // y_tensor holds the reference computed on host tensors; cpu_y receives the graph's output copied back from `y`.
2770
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2771
0
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
2772
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2773
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2774
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
2775
0
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should equal to cpu result");
  // Release the graph objects first, then the host tensors created here.
2776
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2777
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2778
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2779
0
  ccv_nnc_graph_free(graph);
2780
0
  ccv_nnc_tensor_free(x_tensor);
2781
0
  ccv_nnc_tensor_free(y_tensor);
2782
0
  ccv_nnc_tensor_free(cpu_y);
2783
0
}
2784
2785
// Compares ReLU forward computed through a compiled graph on half-precision
// (16F) 7x7x10 GPU tensors against CMD_RELU_FORWARD executed on 32F CPU
// tensors, allowing a 1e-3 difference over all 7 * 7 * 10 elements.
// The guard checks ccv_nnc_cmd_ok() for ReLU forward on CCV_NNC_BACKEND_MPS
// (presumably returning early otherwise); the hit counts recorded here are 1
// for the guard line and 0 for every statement after it.
TEST_CASE("compare relu with mps in half precision")
2786
1
{
2787
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RELU_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with a single relu node reading `x` and writing `y`, both declared as 16F GPU tensors.
2788
0
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
2789
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
2790
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "y");
2791
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RELU_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "relu");
2792
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph plus the arenas that hold its tensors and execs.
2793
0
  ccv_nnc_graph_t* graph = 0;
2794
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2795
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2796
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2797
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2798
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // 32F host input: generator output (seed 0) scaled by 2 and shifted by -1, so that both signs are presumably exercised.
2799
0
  dsfmt_t dsfmt;
2800
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2801
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2802
0
  int i;
2803
0
  for (i = 0; i < 7 * 7 * 10; i++)
2804
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  // Convert the input to 16F on the host, copy it into the tensor backing symbol `x`, and run the graph.
2805
0
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
2806
0
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2807
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
2808
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
2809
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference result: relu on the original 32F input (x_tensor, not the 16F copy).
2810
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2811
0
  ccv_nnc_cmd_exec(CMD_RELU_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(y_tensor), 0);
  // Read `y` back as 16F and convert it to 32F so it can be compared with the 32F reference.
2812
0
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2813
0
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
2814
0
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
2815
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
2816
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
2817
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 7 * 7 * 10, 1e-3, "mps result should equal to cpu result");
  // Release the graph objects first, then the host tensors created here.
2818
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2819
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2820
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2821
0
  ccv_nnc_graph_free(graph);
2822
0
  ccv_nnc_tensor_free(x_tensor);
2823
0
  ccv_nnc_tensor_free(x16_tensor);
2824
0
  ccv_nnc_tensor_free(y_tensor);
2825
0
  ccv_nnc_tensor_free(cpu_y);
2826
0
  ccv_nnc_tensor_free(cpu_y16);
2827
0
}
2828
2829
TEST_CASE("compare layer norm with mps")
2830
1
{
2831
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2832
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
2833
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2834
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x");
2835
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x");
2836
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y");
2837
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y");
2838
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale");
2839
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "bias");
2840
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
2841
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
2842
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
2843
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
2844
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
2845
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2846
0
  ccv_nnc_graph_t* graph = 0;
2847
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2848
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2849
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
2850
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2851
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2852
0
  dsfmt_t dsfmt;
2853
0
  float xdata[2 * 2 * 2 * 10];
2854
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2855
0
  int i;
2856
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2857
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2858
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
2859
0
  float scaledata[1 * 2 * 2 * 10];
2860
0
  float biasdata[1 * 2 * 2 * 10];
2861
0
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
2862
0
  {
2863
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2864
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2865
0
  }
2866
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
2867
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
2868
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
2869
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2870
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y);
2871
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2872
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2873
0
  ccv_nnc_graph_free(graph);
2874
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
2875
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
2876
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
2877
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
2878
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "bias");
2879
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2880
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2881
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
2882
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2883
0
  ccv_nnc_graph_t* cpu_graph = 0;
2884
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2885
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2886
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2887
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2888
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10);
2889
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
2890
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
2891
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
2892
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * 10);
2893
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2894
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
2895
  // Note that MPS and my other implementations treat epsilon differently.
2896
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "layer norm result from mps should match the one from reference implementation");
2897
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2898
0
  ccv_nnc_tensor_arena_free(tensor_arena);
2899
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2900
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2901
0
  ccv_nnc_graph_free(cpu_graph);
2902
0
}
2903
2904
TEST_CASE("compare layer norm with mps without scale / bias") // Layer norm with no scale / bias inputs: run once on GPU_TENSOR_NHWC tensors, once on a CPU-only graph with the same input, and compare the outputs.
2905
1
{
2906
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) && // presumably returns early when the MPS backend lacks the command; in this report only the guard lines have a non-zero count
2907
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS)); // NOTE(review): SET_FORWARD is required here, yet no set command appears in this test body -- confirm it is needed
2908
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2909
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x"); // CPU-side input, filled with random values below
2910
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x"); // GPU-side copy of x that the layer norm reads
2911
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y"); // GPU-side layer norm output
2912
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y"); // CPU-side copy of by, read back for the comparison
2913
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean"); // extra layer norm output; never read by this test
2914
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std"); // extra layer norm output; never read by this test
2915
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
2916
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm"); // only bx is passed as input; the second argument is 0 here (1 in the variant with scale / bias) -- presumably the affine flag, confirm
2917
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
2918
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2919
0
  ccv_nnc_graph_t* graph = 0;
2920
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2921
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2922
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena); // fills graph, tensor_arena and graph_exec_arena
2923
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2924
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2925
0
  dsfmt_t dsfmt;
2926
0
  float xdata[2 * 2 * 2 * 10]; // keeps a copy of the random input so the CPU reference graph below receives identical values
2927
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2928
0
  int i;
2929
0
  dsfmt_init_gen_rand(&dsfmt, 1); // fixed seed argument (1)
2930
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
2931
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
2932
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
2933
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y); // obtained from tensor_arena, which is only freed at the end of the test
2934
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
2935
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
2936
0
  ccv_nnc_graph_free(graph);
2937
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new(); // reference graph: same command, every symbol declared with CPU_TENSOR_NHWC
2938
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
2939
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
2940
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
2941
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
2942
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm"); // same command arguments as the graph above
2943
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2944
0
  ccv_nnc_graph_t* cpu_graph = 0;
2945
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
2946
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
2947
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
2948
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
2949
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10); // feed the reference graph the exact input used above
2950
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
2951
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
2952
  // Note that MPS and my other implementations treat epsilon differently.
2953
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "layer norm result from mps should match the one from reference implementation"); // compares all 80 floats with tolerance 1e-4; presumably loose because of the epsilon difference noted above -- confirm
2954
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
2955
0
  ccv_nnc_tensor_arena_free(tensor_arena); // first graph's arena is released only now, after y_tensor has been compared
2956
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
2957
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
2958
0
  ccv_nnc_graph_free(cpu_graph);
2959
0
}
2960
2961
TEST_CASE("compare group norm with mps") // Group norm with scale and bias: run once on GPU_TENSOR_NHWC tensors, once on a CPU-only graph with the same input / scale / bias, and compare the outputs.
2962
1
{
2963
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) && // presumably returns early when the MPS backend lacks the command; in this report only the guard lines have a non-zero count
2964
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS)); // NOTE(review): SET_FORWARD is required here, yet no set command appears in this test body -- confirm it is needed
2965
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
2966
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x"); // CPU-side input, filled with random values below
2967
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x"); // GPU-side copy of x that the group norm reads
2968
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y"); // GPU-side group norm output
2969
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y"); // CPU-side copy of by, read back for the comparison
2970
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "scale"); // populated below by a direct data transfer, not by a graph node
2971
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 16, 2, 10), "bias"); // populated below by a direct data transfer, not by a graph node
2972
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean"); // extra group norm output; never read by this test
2973
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std"); // extra group norm output; never read by this test
2974
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
2975
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm"); // presumably (group axis 1, 4 groups, epsilon 1e-7, affine on) -- confirm against the macro definition
2976
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
2977
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2978
0
  ccv_nnc_graph_t* graph = 0;
2979
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
2980
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
2981
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena); // fills graph, tensor_arena and graph_exec_arena
2982
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
2983
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
2984
0
  dsfmt_t dsfmt;
2985
0
  float xdata[2 * 16 * 2 * 10]; // keeps a copy of the random input so the CPU reference graph below receives identical values
2986
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
2987
0
  int i;
2988
0
  dsfmt_init_gen_rand(&dsfmt, 1); // fixed seed argument (1)
2989
0
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
2990
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
2991
0
  float scaledata[1 * 16 * 2 * 10];
2992
0
  float biasdata[1 * 16 * 2 * 10];
2993
0
  for (i = 0; i < 1 * 16 * 2 * 10; i++)
2994
0
  {
2995
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
2996
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
2997
0
  }
2998
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0); // tensor built over the scaledata array
2999
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), 0); // tensor built over the biasdata array
3000
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0); // copies scale / bias into the arena tensors before the graph is run
3001
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3002
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y); // obtained from tensor_arena, which is only freed at the end of the test
3003
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3004
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3005
0
  ccv_nnc_graph_free(graph);
3006
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new(); // reference graph: same command, every symbol declared with CPU_TENSOR_NHWC
3007
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
3008
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
3009
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "scale");
3010
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 16, 2, 10), "bias");
3011
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
3012
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
3013
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm"); // same command arguments as the graph above
3014
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3015
0
  ccv_nnc_graph_t* cpu_graph = 0;
3016
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3017
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3018
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3019
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3020
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10); // feed the reference graph the exact input used above
3021
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3022
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 16 * 2 * 10);
3023
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
3024
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 16 * 2 * 10);
3025
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3026
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
3027
  // Note that MPS and my other implementations treat epsilon differently.
3028
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-3, "group norm result from mps should match the one from reference implementation"); // compares all 640 floats with tolerance 1e-3; presumably loose because of the epsilon difference noted above -- confirm
3029
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3030
0
  ccv_nnc_tensor_arena_free(tensor_arena); // first graph's arena is released only now, after y_tensor has been compared
3031
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3032
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3033
0
  ccv_nnc_graph_free(cpu_graph);
3034
0
}
3035
3036
TEST_CASE("compare group norm with mps without scale / bias") // Group norm with no scale / bias inputs: run once on GPU_TENSOR_NHWC tensors, once on a CPU-only graph with the same input, and compare the outputs.
3037
1
{
3038
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) && // presumably returns early when the MPS backend lacks the command; in this report only the guard lines have a non-zero count
3039
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS)); // NOTE(review): SET_FORWARD is required here, yet no set command appears in this test body -- confirm it is needed
3040
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3041
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host x"); // CPU-side input, filled with random values below
3042
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "x"); // GPU-side copy of x that the group norm reads
3043
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 16, 2, 10), "y"); // GPU-side group norm output
3044
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "host y"); // CPU-side copy of by, read back for the comparison
3045
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_mean"); // extra group norm output; never read by this test
3046
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2, 10), "saved_inv_std"); // extra group norm output; never read by this test
3047
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
3048
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm"); // only bx is passed as input; the last argument is 0 here (1 in the variant with scale / bias) -- presumably the affine flag, confirm
3049
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
3050
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3051
0
  ccv_nnc_graph_t* graph = 0;
3052
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3053
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3054
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena); // fills graph, tensor_arena and graph_exec_arena
3055
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3056
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3057
0
  dsfmt_t dsfmt;
3058
0
  float xdata[2 * 16 * 2 * 10]; // keeps a copy of the random input so the CPU reference graph below receives identical values
3059
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3060
0
  int i;
3061
0
  dsfmt_init_gen_rand(&dsfmt, 1); // fixed seed argument (1)
3062
0
  for (i = 0; i < 2 * 16 * 2 * 10; i++)
3063
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
3064
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3065
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y); // obtained from tensor_arena, which is only freed at the end of the test
3066
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3067
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3068
0
  ccv_nnc_graph_free(graph);
3069
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new(); // reference graph: same command, every symbol declared with CPU_TENSOR_NHWC
3070
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "x");
3071
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 16, 2, 10), "y");
3072
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_mean");
3073
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 4, 2, 10), "saved_inv_std");
3074
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-7, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm"); // same command arguments as the graph above
3075
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3076
0
  ccv_nnc_graph_t* cpu_graph = 0;
3077
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3078
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3079
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3080
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3081
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 16 * 2 * 10); // feed the reference graph the exact input used above
3082
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3083
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
3084
  // Note that MPS and my other implementations treat epsilon differently.
3085
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 16 * 2 * 10, 1e-3, "group norm result from mps should match the one from reference implementation"); // compares all 640 floats with tolerance 1e-3; presumably loose because of the epsilon difference noted above -- confirm
3086
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3087
0
  ccv_nnc_tensor_arena_free(tensor_arena); // first graph's arena is released only now, after y_tensor has been compared
3088
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3089
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3090
0
  ccv_nnc_graph_free(cpu_graph);
3091
0
}
3092
3093
TEST_CASE("compare rmsnorm with mps") // RMSNorm with a scale input: run once on GPU_TENSOR_NHWC tensors, once on a CPU-only graph with the same input / scale, and compare the outputs.
3094
1
{
3095
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) && // presumably returns early when the MPS backend lacks the command; in this report only the guard lines have a non-zero count
3096
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS)); // NOTE(review): SET_FORWARD is required here, yet no set command appears in this test body -- confirm it is needed
3097
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3098
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x"); // CPU-side input, filled with random values below
3099
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x"); // GPU-side copy of x that the rmsnorm reads
3100
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y"); // GPU-side rmsnorm output
3101
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y"); // CPU-side copy of by, read back for the comparison
3102
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, 10), "scale"); // populated below by a direct data transfer, not by a graph node
3103
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std"); // extra rmsnorm output; never read by this test
3104
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
3105
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm"); // presumably (epsilon 1e-6, affine on, axes 1, 2, 3) -- confirm against the macro definition
3106
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
3107
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3108
0
  ccv_nnc_graph_t* graph = 0;
3109
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3110
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3111
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena); // fills graph, tensor_arena and graph_exec_arena
3112
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3113
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3114
0
  dsfmt_t dsfmt;
3115
0
  float xdata[2 * 2 * 2 * 10]; // keeps a copy of the random input so the CPU reference graph below receives identical values
3116
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3117
0
  int i;
3118
0
  dsfmt_init_gen_rand(&dsfmt, 1); // fixed seed argument (1)
3119
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
3120
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
3121
0
  float scaledata[1 * 2 * 2 * 10];
3122
0
  for (i = 0; i < 1 * 2 * 2 * 10; i++)
3123
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
3124
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0); // tensor built over the scaledata array
3125
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0); // copies scale into the arena tensor before the graph is run
3126
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3127
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y); // obtained from tensor_arena, which is only freed at the end of the test
3128
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3129
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3130
0
  ccv_nnc_graph_free(graph);
3131
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new(); // reference graph: same command, every symbol declared with CPU_TENSOR_NHWC
3132
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
3133
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
3134
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), "scale");
3135
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
3136
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm"); // same command arguments as the graph above
3137
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3138
0
  ccv_nnc_graph_t* cpu_graph = 0;
3139
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3140
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3141
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3142
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3143
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10); // feed the reference graph the exact input used above
3144
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
3145
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * 10);
3146
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3147
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
3148
  // Note that MPS and my other implementations treat epsilon differently.
3149
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "rmsnorm result from mps should match the one from reference implementation"); // compares all 80 floats with tolerance 1e-4; presumably loose because of the epsilon difference noted above -- confirm
3150
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3151
0
  ccv_nnc_tensor_arena_free(tensor_arena); // first graph's arena is released only now, after y_tensor has been compared
3152
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3153
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3154
0
  ccv_nnc_graph_free(cpu_graph);
3155
0
}
3156
3157
TEST_CASE("compare rmsnorm with mps without scale") // RMSNorm with no scale input: run once on GPU_TENSOR_NHWC tensors, once on a CPU-only graph with the same input, and compare the outputs.
3158
1
{
3159
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) && // presumably returns early when the MPS backend lacks the command; in this report only the guard lines have a non-zero count
3160
1
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS)); // NOTE(review): SET_FORWARD is required here, yet no set command appears in this test body -- confirm it is needed
3161
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3162
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host x"); // CPU-side input, filled with random values below
3163
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "x"); // GPU-side copy of x that the rmsnorm reads
3164
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, 10), "y"); // GPU-side rmsnorm output
3165
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "host y"); // CPU-side copy of by, read back for the comparison
3166
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std"); // extra rmsnorm output; never read by this test
3167
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(bx), "transfer x");
3168
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm"); // only bx is passed as input; the second argument is 0 here (1 in the variant with scale) -- presumably the affine flag, confirm
3169
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(y), "transfer y");
3170
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3171
0
  ccv_nnc_graph_t* graph = 0;
3172
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3173
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3174
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena); // fills graph, tensor_arena and graph_exec_arena
3175
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3176
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3177
0
  dsfmt_t dsfmt;
3178
0
  float xdata[2 * 2 * 2 * 10]; // keeps a copy of the random input so the CPU reference graph below receives identical values
3179
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
3180
0
  int i;
3181
0
  dsfmt_init_gen_rand(&dsfmt, 1); // fixed seed argument (1)
3182
0
  for (i = 0; i < 2 * 2 * 2 * 10; i++)
3183
0
    x_tensor->data.f32[i] = xdata[i] = dsfmt_genrand_open_close(&dsfmt);
3184
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3185
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, y); // obtained from tensor_arena, which is only freed at the end of the test
3186
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3187
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3188
0
  ccv_nnc_graph_free(graph);
3189
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new(); // reference graph: same command, every symbol declared with CPU_TENSOR_NHWC
3190
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "x");
3191
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), "y");
3192
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
3193
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-6, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm"); // same command arguments as the graph above
3194
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3195
0
  ccv_nnc_graph_t* cpu_graph = 0;
3196
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
3197
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
3198
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
3199
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
3200
0
  memcpy(cx_tensor->data.f32, xdata, sizeof(float) * 2 * 2 * 2 * 10); // feed the reference graph the exact input used above
3201
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
3202
0
  ccv_nnc_tensor_t* const cy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cy);
3203
  // Note that MPS and my other implementations treat epsilon differently.
3204
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cy_tensor->data.f32, 2 * 2 * 2 * 10, 1e-4, "rmsnorm result from mps should match the one from reference implementation"); // compares all 80 floats with tolerance 1e-4; presumably loose because of the epsilon difference noted above -- confirm
3205
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
3206
0
  ccv_nnc_tensor_arena_free(tensor_arena); // first graph's arena is released only now, after y_tensor has been compared
3207
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
3208
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
3209
0
  ccv_nnc_graph_free(cpu_graph);
3210
0
}
3211
3212
TEST_CASE("compare add with mps") // Weighted add: run through a graph whose add node works on GPU_TENSOR_NHWC tensors, and compare against the same add command executed directly on the CPU tensors.
3213
1
{
3214
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS)); // presumably returns early when the MPS backend lacks the command; in this report only the guard line has a non-zero count
3215
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3216
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
3217
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y"); // third dimension is 1 where x has 5 -- presumably broadcast by the add, confirm
3218
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a"); // GPU-side copy of x
3219
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b"); // GPU-side copy of y
3220
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c"); // GPU-side add output
3221
0
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z"); // CPU-side copy of c, read back for the comparison
3222
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
3223
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add"); // 0.5 and 0.2 are presumably the weights applied to a and b -- confirm against the macro definition
3224
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z), "transfer");
3225
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3226
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3227
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0); // allocated by the test and freed with ccv_nnc_tensor_free below
3228
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0); // allocated by the test and freed with ccv_nnc_tensor_free below
3229
0
  ccv_nnc_graph_t* graph = 0;
3230
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3231
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3232
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena); // x / y are mapped to the tensors above; z is passed in a separate symbol list -- presumably marking it as an output to keep, confirm
3233
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3234
0
  dsfmt_t dsfmt;
3235
0
  dsfmt_init_gen_rand(&dsfmt, 0); // fixed seed argument (0)
3236
0
  int i;
3237
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
3238
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3239
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
3240
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3241
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0); // holds the expected result
3242
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0); // expected result: same add command executed directly on the CPU tensors
3243
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3244
0
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
3245
0
  REQUIRE_TENSOR_EQ(zt, z_tensor, "add should match");
3246
0
  ccv_nnc_tensor_free(x_tensor);
3247
0
  ccv_nnc_tensor_free(y_tensor);
3248
0
  ccv_nnc_tensor_free(zt);
3249
0
  ccv_nnc_graph_free(graph);
3250
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3251
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3252
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3253
0
}
3254
3255
// Half-precision variant of the MPS add comparison: the symbolic graph routes the inputs
// through 16F tensors and the result is checked against a 32F reference within 1e-3.
TEST_CASE("compare add with mps in half precision")
3256
1
{
3257
1
  // Guarded on ADD_FORWARD being reported ok for the MPS backend; presumably the test returns early otherwise.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
3258
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3259
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
3260
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
3261
0
  // 16F CPU-side copies of x and y; these are what gets transferred to the GPU symbols a and b.
  ccv_nnc_tensor_symbol_t x16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "x 16");
3262
0
  ccv_nnc_tensor_symbol_t y16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 1, 3), "y 16");
3263
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "a");
3264
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 1, 3), "b");
3265
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 5, 5, 3), "c");
3266
0
  ccv_nnc_tensor_symbol_t z = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "z");
3267
0
  ccv_nnc_tensor_symbol_t z16 = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(16F, 10, 5, 5, 3), "z 16");
3268
0
  // Pipeline: convert (x, y) -> (x16, y16), transfer to (a, b), add into c, transfer c -> z16, convert z16 -> z.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(x16, y16), "convert");
3269
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x16, y16), TENSOR_SYMBOL_LIST(a, b), "transfer");
3270
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
3271
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(z16), "transfer");
3272
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATATYPE_CONVERSION_FORWARD(), TENSOR_SYMBOL_LIST(z16), TENSOR_SYMBOL_LIST(z), "convert");
3273
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3274
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3275
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3276
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
3277
0
  ccv_nnc_graph_t* graph = 0;
3278
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3279
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3280
0
  // x and y are bound to the tensors allocated above; z is listed as the output symbol.
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(z), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3281
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3282
0
  // Fixed seed (0) so the random inputs are the same on every run.
  dsfmt_t dsfmt;
3283
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3284
0
  int i;
3285
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
3286
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3287
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
3288
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3289
0
  // Reference result: the same add command executed directly on the 32F CPU tensors.
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3290
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
3291
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3292
0
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, z);
3293
0
  // Tolerance-based comparison (1e-3) instead of REQUIRE_TENSOR_EQ, since the graph path goes through 16F.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, zt->data.f32, z_tensor->data.f32, 10 * 5 * 5 * 3, 1e-3, "add should match");
3294
0
  ccv_nnc_tensor_free(x_tensor);
3295
0
  ccv_nnc_tensor_free(y_tensor);
3296
0
  ccv_nnc_tensor_free(zt);
3297
0
  ccv_nnc_graph_free(graph);
3298
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3299
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3300
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3301
0
}
3302
3303
// Compares the gradients dx and dy produced by a backward graph over GPU tensors
// with the output of CMD_ADD_BACKWARD executed directly on CPU tensors.
TEST_CASE("compare add gradient with mps")
3304
1
{
3305
1
  // Guarded on both ADD_FORWARD and ADD_BACKWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
3306
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
3307
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3308
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
3309
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
3310
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
3311
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
3312
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
3313
0
  // Forward part of the graph: transfer (x, y) to (a, b), then add into c.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
3314
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
3315
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3316
0
  // Backward pass of c with respect to x and y; autogen is invoked again after the graph has been extended.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3317
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3318
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3319
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3320
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
3321
0
  ccv_nnc_graph_t* graph = 0;
3322
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3323
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3324
0
  // Gradient symbols looked up from the backward pass for c, x and y.
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
3325
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3326
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3327
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3328
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3329
0
  dsfmt_t dsfmt;
3330
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3331
0
  int i;
3332
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
3333
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3334
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
3335
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3336
0
  // Random incoming gradient for c, copied into the arena tensor for dc before the graph runs.
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3337
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
3338
0
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3339
0
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
3340
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
3341
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3342
0
  // Reference: forward add into zt, then backward on the CPU tensors with the same dct.
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3343
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
3344
0
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3345
0
  ccv_nnc_tensor_t* dyt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
3346
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, dyt), 0);
3347
0
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3348
0
  ccv_nnc_tensor_t* dy_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dy);
3349
0
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
3350
0
  REQUIRE_TENSOR_EQ(dyt, dy_tensor, "backward pass should match");
3351
0
  // Only tensors created with ccv_nnc_tensor_new are freed individually; arena tensors go with the arena.
  ccv_nnc_tensor_free(x_tensor);
3352
0
  ccv_nnc_tensor_free(y_tensor);
3353
0
  ccv_nnc_tensor_free(dct);
3354
0
  ccv_nnc_tensor_free(zt);
3355
0
  ccv_nnc_tensor_free(dxt);
3356
0
  ccv_nnc_tensor_free(dyt);
3357
0
  ccv_nnc_graph_free(graph);
3358
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3359
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3360
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3361
0
}
3362
3363
// Same graph as the add-gradient comparison, but the CPU reference passes 0 in place of
// the dy output and only dx is compared.
TEST_CASE("compare add gradient with mps no dyt ")
3364
1
{
3365
1
  // Guarded on both ADD_FORWARD and ADD_BACKWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
3366
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
3367
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3368
0
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), "x");
3369
0
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), "y");
3370
0
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "a");
3371
0
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 1, 3), "b");
3372
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 5, 5, 3), "c");
3373
0
  // Forward part of the graph: transfer (x, y) to (a, b), then add into c.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_DATA_TRANSFER_FORWARD(), TENSOR_SYMBOL_LIST(x, y), TENSOR_SYMBOL_LIST(a, b), "transfer");
3374
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_ADD_FORWARD(0.5, 0.2), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "add");
3375
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3376
0
  // Backward pass of c with respect to x and y; autogen is invoked again after the graph has been extended.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(c), TENSOR_SYMBOL_LIST(x, y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
3377
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3378
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3379
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3380
0
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 1, 3), 0);
3381
0
  ccv_nnc_graph_t* graph = 0;
3382
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3383
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3384
0
  ccv_nnc_tensor_symbol_t dc = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, c);
3385
0
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);
3386
0
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
3387
0
  // The compiled graph still lists both dx and dy as outputs; only the CPU reference below omits dy.
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(x, x_tensor), KV(y, y_tensor)), TENSOR_SYMBOL_LIST(dx, dy), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3388
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3389
0
  dsfmt_t dsfmt;
3390
0
  dsfmt_init_gen_rand(&dsfmt, 0);
3391
0
  int i;
3392
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
3393
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3394
0
  for (i = 0; i < 10 * 5 * 1 * 3; i++)
3395
0
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3396
0
  // Random incoming gradient for c, copied into the arena tensor for dc before the graph runs.
  ccv_nnc_tensor_t* dct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3397
0
  for (i = 0; i < 10 * 5 * 5 * 3; i++)
3398
0
    dct->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3399
0
  ccv_nnc_tensor_t* const dc_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dc);
3400
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dct), TENSOR_LIST(dc_tensor), 0);
3401
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3402
0
  ccv_nnc_tensor_t* zt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3403
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(zt), 0);
3404
0
  ccv_nnc_tensor_t* dxt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 5, 5, 3), 0);
3405
0
  // Reference backward with 0 in the second output slot, so no dy reference tensor is produced.
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dct, x_tensor, y_tensor, zt), TENSOR_LIST(dxt, 0), 0);
3406
0
  ccv_nnc_tensor_t* dx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
3407
0
  REQUIRE_TENSOR_EQ(dxt, dx_tensor, "backward pass should match");
3408
0
  ccv_nnc_tensor_free(x_tensor);
3409
0
  ccv_nnc_tensor_free(y_tensor);
3410
0
  ccv_nnc_tensor_free(dct);
3411
0
  ccv_nnc_tensor_free(zt);
3412
0
  ccv_nnc_tensor_free(dxt);
3413
0
  ccv_nnc_graph_free(graph);
3414
0
  ccv_nnc_tensor_arena_free(tensor_arena);
3415
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3416
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3417
0
}
3418
3419
// Runs CMD_ADD_BACKWARD on GPU tensors with differently shaped operands (a: 4x1, b: 2, c: 4x2)
// and checks both gradients against the same command executed on the CPU tensors.
TEST_CASE("broadcasting semantics for add backward mps (a,b)")
3420
1
{
3421
1
  // Guarded on both ADD_FORWARD and ADD_BACKWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
3422
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
3423
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3424
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3425
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3426
0
  // da/db receive the results copied back from the GPU; dat/dbt hold the CPU reference.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3427
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3428
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3429
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3430
0
  a->data.f32[0] = 1;
3431
0
  a->data.f32[1] = 2;
3432
0
  a->data.f32[2] = 3;
3433
0
  a->data.f32[3] = 4;
3434
0
  b->data.f32[0] = 5;
3435
0
  b->data.f32[1] = 6;
3436
0
  // Contents of c (4 rows of 2); c is passed in the first input slot of CMD_ADD_BACKWARD below.
  float ctp[] = {
3437
0
    6, 7,
3438
0
    7, 8,
3439
0
    8, 9,
3440
0
    9, 10
3441
0
  };
3442
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3443
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3444
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3445
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3446
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3447
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3448
0
  // Copy inputs to the GPU tensors, run backward there, then copy the gradients back.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3449
0
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
3450
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
3451
0
  // Reference: the same backward command on the CPU tensors.
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);
3452
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3453
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3454
0
  ccv_nnc_tensor_free(a);
3455
0
  ccv_nnc_tensor_free(b);
3456
0
  ccv_nnc_tensor_free(c);
3457
0
  ccv_nnc_tensor_free(da);
3458
0
  ccv_nnc_tensor_free(db);
3459
0
  ccv_nnc_tensor_free(dat);
3460
0
  ccv_nnc_tensor_free(dbt);
3461
0
  ccv_nnc_tensor_free(ga);
3462
0
  ccv_nnc_tensor_free(gb);
3463
0
  ccv_nnc_tensor_free(gc);
3464
0
  ccv_nnc_tensor_free(gda);
3465
0
  ccv_nnc_tensor_free(gdb);
3466
0
}
3467
3468
// Variant of the add-backward broadcasting test that only requests the gradient of a:
// the b input and the db output are left out of the tensor lists.
TEST_CASE("broadcasting semantics for add backward mps (a, nil)")
3469
1
{
3470
1
  // Guarded on both ADD_FORWARD and ADD_BACKWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
3471
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
3472
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3473
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3474
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3475
0
  // da receives the result copied back from the GPU; dat holds the CPU reference.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3476
0
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3477
0
  a->data.f32[0] = 1;
3478
0
  a->data.f32[1] = 2;
3479
0
  a->data.f32[2] = 3;
3480
0
  a->data.f32[3] = 4;
3481
0
  b->data.f32[0] = 5;
3482
0
  b->data.f32[1] = 6;
3483
0
  // Contents of c (4 rows of 2); c is passed in the first input slot of CMD_ADD_BACKWARD below.
  float ctp[] = {
3484
0
    6, 7,
3485
0
    7, 8,
3486
0
    8, 9,
3487
0
    9, 10
3488
0
  };
3489
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3490
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3491
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3492
0
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3493
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3494
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3495
0
  // NOTE(review): the lists below end with a trailing comma and an empty last argument;
  // this relies on TENSOR_LIST accepting that form - confirm against the macro definition.
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, ), TENSOR_LIST(gda, ), 0);
3496
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, ), TENSOR_LIST(da, ), 0);
3497
0
  // Reference: the same backward command on the CPU tensors, with the same omitted slots.
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, ), TENSOR_LIST(dat, ), 0);
3498
0
  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
3499
0
  ccv_nnc_tensor_free(a);
3500
0
  ccv_nnc_tensor_free(b);
3501
0
  ccv_nnc_tensor_free(c);
3502
0
  ccv_nnc_tensor_free(da);
3503
0
  ccv_nnc_tensor_free(dat);
3504
0
  ccv_nnc_tensor_free(ga);
3505
0
  ccv_nnc_tensor_free(gb);
3506
0
  ccv_nnc_tensor_free(gc);
3507
0
  ccv_nnc_tensor_free(gda);
3508
0
}
3509
3510
// Variant of the add-backward broadcasting test that only requests the gradient of b:
// the first output slot is passed as 0 and only db is compared.
TEST_CASE("broadcasting semantics for add backward mps (nil,b)")
3511
1
{
3512
1
  // Guarded on both ADD_FORWARD and ADD_BACKWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS) &&
3513
1
    ccv_nnc_cmd_ok(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_MPS));
3514
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
3515
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3516
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
3517
0
  // db receives the result copied back from the GPU; dbt holds the CPU reference.
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3518
0
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
3519
0
  a->data.f32[0] = 1;
3520
0
  a->data.f32[1] = 2;
3521
0
  a->data.f32[2] = 3;
3522
0
  a->data.f32[3] = 4;
3523
0
  b->data.f32[0] = 5;
3524
0
  b->data.f32[1] = 6;
3525
0
  // Contents of c (4 rows of 2); c is passed in the first input slot of CMD_ADD_BACKWARD below.
  float ctp[] = {
3526
0
    6, 7,
3527
0
    7, 8,
3528
0
    8, 9,
3529
0
    9, 10
3530
0
  };
3531
0
  memcpy(c->data.f32, ctp, sizeof(ctp));
3532
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
3533
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3534
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
3535
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
3536
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
3537
0
  // GPU run: all three inputs are supplied, the first output (gradient of a) is 0.
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(0, gdb), 0);
3538
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdb), TENSOR_LIST(db), 0);
3539
0
  // CPU reference: unlike the GPU run above, the a input is also passed as 0 here.
  ccv_nnc_cmd_exec(CMD_ADD_BACKWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(c, 0, b), TENSOR_LIST(0, dbt), 0);
3540
0
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");
3541
0
  ccv_nnc_tensor_free(a);
3542
0
  ccv_nnc_tensor_free(b);
3543
0
  ccv_nnc_tensor_free(c);
3544
0
  ccv_nnc_tensor_free(db);
3545
0
  ccv_nnc_tensor_free(dbt);
3546
0
  ccv_nnc_tensor_free(ga);
3547
0
  ccv_nnc_tensor_free(gb);
3548
0
  ccv_nnc_tensor_free(gc);
3549
0
  ccv_nnc_tensor_free(gdb);
3550
0
}
3551
3552
// Runs CMD_EWSUM_FORWARD over three 100-element GPU tensors filled with 1, 0.5 and 0.25
// and expects every output element to equal 1.75.
TEST_CASE("compare ewsum with mps")
3553
1
{
3554
1
  // Guarded on EWSUM_FORWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_MPS));
3555
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
3556
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
3557
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
3558
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 100), 0);
3559
0
  // ha/hb/hc are the CPU-side inputs, hd receives the GPU result, gd is the expected output.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3560
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3561
0
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3562
0
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3563
0
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3564
0
  int i;
3565
0
  for (i = 0; i < 100; i++)
3566
0
  {
3567
0
    ha->data.f32[i] = 1;
3568
0
    hb->data.f32[i] = 0.5;
3569
0
    hc->data.f32[i] = 0.25;
3570
0
    // 1 + 0.5 + 0.25
    gd->data.f32[i] = 1.75;
3571
0
  }
3572
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(a, b, c), 0);
3573
0
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
3574
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd), 0);
3575
0
  REQUIRE_TENSOR_EQ(hd, gd, "ewsum result should be the same");
3576
0
  ccv_nnc_tensor_free(a);
3577
0
  ccv_nnc_tensor_free(b);
3578
0
  ccv_nnc_tensor_free(c);
3579
0
  ccv_nnc_tensor_free(d);
3580
0
  ccv_nnc_tensor_free(ha);
3581
0
  ccv_nnc_tensor_free(hb);
3582
0
  ccv_nnc_tensor_free(hc);
3583
0
  ccv_nnc_tensor_free(hd);
3584
0
  ccv_nnc_tensor_free(gd);
3585
0
}
3586
3587
// Half-precision variant of the ewsum comparison: inputs are converted to 16F before being
// transferred to the GPU tensors, and the result is compared to 1.75 within a 1e-3 tolerance.
TEST_CASE("compare ewsum with mps in half precision")
3588
1
{
3589
1
  // Guarded on EWSUM_FORWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_MPS));
3590
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
3591
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
3592
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
3593
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 100), 0);
3594
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3595
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3596
0
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3597
0
  ccv_nnc_tensor_t* const hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3598
0
  // 16F CPU-side tensors used as the intermediate step between the 32F CPU tensors and the 16F GPU tensors.
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
3599
0
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
3600
0
  ccv_nnc_tensor_t* const hc16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
3601
0
  ccv_nnc_tensor_t* const hd16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100), 0);
3602
0
  ccv_nnc_tensor_t* const gd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100), 0);
3603
0
  int i;
3604
0
  for (i = 0; i < 100; i++)
3605
0
  {
3606
0
    ha->data.f32[i] = 1;
3607
0
    hb->data.f32[i] = 0.5;
3608
0
    hc->data.f32[i] = 0.25;
3609
0
    // 1 + 0.5 + 0.25
    gd->data.f32[i] = 1.75;
3610
0
  }
3611
0
  // 32F -> 16F on the CPU side, transfer to GPU, sum, transfer back, 16F -> 32F.
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hc), TENSOR_LIST(ha16, hb16, hc16), 0);
3612
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hb16, hc16), TENSOR_LIST(a, b, c), 0);
3613
0
  ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(d), 0);
3614
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(hd16), 0);
3615
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hd16), TENSOR_LIST(hd), 0);
3616
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hd->data.f32, gd->data.f32, 100, 1e-3, "ewsum result should be the same");
3617
0
  ccv_nnc_tensor_free(a);
3618
0
  ccv_nnc_tensor_free(b);
3619
0
  ccv_nnc_tensor_free(c);
3620
0
  ccv_nnc_tensor_free(d);
3621
0
  ccv_nnc_tensor_free(ha);
3622
0
  ccv_nnc_tensor_free(hb);
3623
0
  ccv_nnc_tensor_free(hc);
3624
0
  ccv_nnc_tensor_free(hd);
3625
0
  ccv_nnc_tensor_free(ha16);
3626
0
  ccv_nnc_tensor_free(hb16);
3627
0
  ccv_nnc_tensor_free(hc16);
3628
0
  ccv_nnc_tensor_free(hd16);
3629
0
  ccv_nnc_tensor_free(gd);
3630
0
}
3631
3632
// Transposes between tensor views twice (CMD_TRANSPOSE_FORWARD(1, 3) there and back), first on
// CPU tensors and then on GPU tensors, and checks that both paths yield the same full tensors.
TEST_CASE("compare transpose two tensor views")
3633
1
{
3634
1
  // Guarded on TRANSPOSE_FORWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
3635
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
3636
0
  memset(ha->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
3637
0
  // 4x3x2x2 view into the 7x6x5x4 tensor; the two DIM_ALLOC lists appear to be offset and stride (see the fill loop below).
  ccv_nnc_tensor_view_t ha_view = ccv_nnc_tensor_view(ha, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
3638
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
3639
0
  memset(hb->data.f32, 0, sizeof(float) * 8 * 7 * 6 * 5);
3640
0
  // Destination view is 4x2x2x3, i.e. the source view's dims with positions 1 and 3 swapped.
  ccv_nnc_tensor_view_t hb_view = ccv_nnc_tensor_view(hb, CPU_TENSOR_NHWC(32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
3641
0
  int i, j, k, l;
3642
0
  // Writes a running index into the region of ha at offsets (3, 2, 1, 0); everything else stays 0.
  for (i = 0; i < 4; i++)
3643
0
    for (j = 0; j < 3; j++)
3644
0
      for (k = 0; k < 2; k++)
3645
0
        for (l = 0; l < 2; l++)
3646
0
          ha->data.f32[(i + 3) * 6 * 5 * 4 + (j + 2) * 5 * 4 + (k + 1) * 4 + l] = i * 3 * 2 * 2 + j * 2 * 2 + k * 2 + l;
3647
0
  // CPU path: ha_view -> hb_view -> hd_view; the round trip is expected to reproduce ha.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&ha_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), 0);
3648
0
  ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
3649
0
  memset(hd->data.f32, 0, sizeof(float) * 7 * 6 * 5 * 4);
3650
0
  ccv_nnc_tensor_view_t hd_view = ccv_nnc_tensor_view(hd, CPU_TENSOR_NHWC(32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
3651
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&hb_view), TENSOR_LIST((ccv_nnc_tensor_t*)&hd_view), 0);
3652
0
  REQUIRE_TENSOR_EQ(hd, ha, "4x3x2x2 tensor should be exactly the same.");
3653
0
  // GPU path: same shapes and views; b and d are zeroed with CMD_SET_FORWARD(0) to match the memset on the CPU side.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
3654
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
3655
0
  ccv_nnc_tensor_view_t a_view = ccv_nnc_tensor_view(a, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
3656
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 8, 7, 6, 5), 0);
3657
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(b), 0);
3658
0
  ccv_nnc_tensor_view_t b_view = ccv_nnc_tensor_view(b, GPU_TENSOR_NHWC(000, 32F, 4, 2, 2, 3), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(7 * 6 * 5, 6 * 5, 5, 1));
3659
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&a_view), TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), 0);
3660
0
  ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 7, 6, 5, 4), 0);
3661
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(d), 0);
3662
0
  ccv_nnc_tensor_view_t d_view = ccv_nnc_tensor_view(d, GPU_TENSOR_NHWC(000, 32F, 4, 3, 2, 2), DIM_ALLOC(3, 2, 1, 0), DIM_ALLOC(6 * 5 * 4, 5 * 4, 4, 1));
3663
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 3), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)&b_view), TENSOR_LIST((ccv_nnc_tensor_t*)&d_view), 0);
3664
0
  // Copy the full GPU tensors (not just the views) back and compare against the CPU results.
  ccv_nnc_tensor_t* const hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 7, 6, 5), 0);
3665
0
  ccv_nnc_tensor_t* const hdt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 6, 5, 4), 0);
3666
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, d), TENSOR_LIST(hbt, hdt), 0);
3667
0
  REQUIRE_TENSOR_EQ(hbt, hb, "4x2x2x3 tensor should be exactly the same.");
3668
0
  REQUIRE_TENSOR_EQ(hdt, hd, "4x3x2x2 tensor should be exactly the same.");
3669
0
  // The views are stack values over these tensors; only the backing tensors are freed.
  ccv_nnc_tensor_free(ha);
3670
0
  ccv_nnc_tensor_free(hb);
3671
0
  ccv_nnc_tensor_free(hd);
3672
0
  ccv_nnc_tensor_free(hbt);
3673
0
  ccv_nnc_tensor_free(hdt);
3674
0
  ccv_nnc_tensor_free(a);
3675
0
  ccv_nnc_tensor_free(b);
3676
0
  ccv_nnc_tensor_free(d);
3677
0
}
3678
3679
// Adds a 2x3 tensor and a 3-element tensor on GPU tensors with CMD_ADD_FORWARD(1, 1)
// and compares the result with a hard-coded 2x3 expectation.
TEST_CASE("broadcasting semantics for add [[1, 2, 3], [4, 5, 6]] + [7, 8, 9]")
3680
1
{
3681
1
  // Guarded on ADD_FORWARD being reported ok for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
3682
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3683
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
3684
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3685
0
  a->data.f32[0] = 1;
3686
0
  a->data.f32[1] = 2;
3687
0
  a->data.f32[2] = 3;
3688
0
  a->data.f32[3] = 4;
3689
0
  a->data.f32[4] = 5;
3690
0
  a->data.f32[5] = 6;
3691
0
  b->data.f32[0] = 7;
3692
0
  b->data.f32[1] = 8;
3693
0
  b->data.f32[2] = 9;
3694
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3695
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
3696
0
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
3697
0
  // Copy the inputs to the GPU tensors, add there, then copy the result back into c.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
3698
0
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
3699
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
3700
0
  // Expected values: each row of a plus b, element by element.
  float ctp[] = {
3701
0
    8, 10, 12,
3702
0
    11, 13, 15
3703
0
  };
3704
0
  // ct wraps the stack array ctp; it is not passed to ccv_nnc_tensor_free below.
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
3705
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
3706
0
  ccv_nnc_tensor_free(a);
3707
0
  ccv_nnc_tensor_free(b);
3708
0
  ccv_nnc_tensor_free(c);
3709
0
  ccv_nnc_tensor_free(ga);
3710
0
  ccv_nnc_tensor_free(gb);
3711
0
  ccv_nnc_tensor_free(gc);
3712
0
}
3713
3714
TEST_CASE("broadcasting semantics for add [[1], [2], [3], [4]] + [5, 6]")
{
  // Only run when the MPS backend reports support for ADD_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: a 4x1 column, a 2-element vector, and the 4x2 result buffer.
  ccv_nnc_tensor_t* const column = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const row = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  int k;
  for (k = 0; k < 4; k++)
    column->data.f32[k] = k + 1; // 1, 2, 3, 4
  for (k = 0; k < 2; k++)
    row->data.f32[k] = k + 5; // 5, 6
  // GPU tensors with the same shapes as their host counterparts.
  ccv_nnc_tensor_t* const dev_column = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const dev_row = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  // Copy the operands to the GPU tensors, add there, and copy the result back to the host.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(column, row), TENSOR_LIST(dev_column, dev_row), 0);
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(1, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_column, dev_row), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(c), 0);
  // Expected result: every column entry is paired with both vector entries.
  float expected_values[] = {
    6, 7,
    7, 8,
    8, 9,
    9, 10
  };
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  // ct wraps the stack array above, so only the heap-allocated tensors are freed.
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(dev_row);
  ccv_nnc_tensor_free(dev_column);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(row);
  ccv_nnc_tensor_free(column);
}
3747
3748
TEST_CASE("broadcasting semantics for mul [[1, 2, 3], [4, 5, 6]] * [7, 8, 9]")
{
  // Only run when the MPS backend reports support for MUL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: a 2x3 left operand, a 3-element right operand, and a 2x3 result buffer.
  ccv_nnc_tensor_t* const lhs = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const rhs = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  int k;
  for (k = 0; k < 6; k++)
    lhs->data.f32[k] = k + 1; // 1, 2, 3, 4, 5, 6
  for (k = 0; k < 3; k++)
    rhs->data.f32[k] = k + 7; // 7, 8, 9
  // GPU tensors with the same shapes as their host counterparts.
  ccv_nnc_tensor_t* const dev_lhs = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_rhs = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  // Copy the operands to the GPU tensors, multiply there, and copy the result back to the host.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(dev_lhs, dev_rhs), 0);
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_lhs, dev_rhs), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(c), 0);
  // Expected result: each row of the matrix is multiplied element-wise by the 3-vector.
  float expected_values[] = {
    7, 16, 27,
    28, 40, 54
  };
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  // ct wraps the stack array above, so only the heap-allocated tensors are freed.
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(dev_rhs);
  ccv_nnc_tensor_free(dev_lhs);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(rhs);
  ccv_nnc_tensor_free(lhs);
}
3782
3783
TEST_CASE("broadcasting semantics for mul [[1], [2], [3], [4]] * [5, 6]")
{
  // Only run when the MPS backend reports support for MUL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: a 4x1 column, a 2-element vector, and the 4x2 result buffer.
  ccv_nnc_tensor_t* const column = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const row = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  int k;
  for (k = 0; k < 4; k++)
    column->data.f32[k] = k + 1; // 1, 2, 3, 4
  for (k = 0; k < 2; k++)
    row->data.f32[k] = k + 5; // 5, 6
  // GPU tensors with the same shapes as their host counterparts.
  ccv_nnc_tensor_t* const dev_column = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const dev_row = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  // Copy the operands to the GPU tensors, multiply there, and copy the result back to the host.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(column, row), TENSOR_LIST(dev_column, dev_row), 0);
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_column, dev_row), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(c), 0);
  // Expected result: every column entry is multiplied by both vector entries.
  float expected_values[] = {
    5, 6,
    10, 12,
    15, 18,
    20, 24
  };
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  // ct wraps the stack array above, so only the heap-allocated tensors are freed.
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(dev_row);
  ccv_nnc_tensor_free(dev_column);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(row);
  ccv_nnc_tensor_free(column);
}
3816
3817
TEST_CASE("scalar mul [[1, 2, 3], [4, 5, 6]] * 0.3")
{
  // NOTE(review): the guard checks MUL_FORWARD while the command executed below is
  // SCALAR_MUL_FORWARD - confirm this is the intended availability check.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host input (2x3, values 1..6) and the host buffer that receives the GPU result.
  ccv_nnc_tensor_t* const src = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  int k;
  for (k = 0; k < 6; k++)
    src->data.f32[k] = k + 1;
  ccv_nnc_tensor_t* const dev_src = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  // Copy the input to the GPU tensor, scale it by 0.3, and copy the result back to the host.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dev_src), 0);
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.3), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_src), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(c), 0);
  // Hand-written expected values: each input element times 0.3.
  float expected_values[] = {
    0.3, 0.6, 0.9,
    1.2, 1.5, 1.8,
  };
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  // ct wraps the stack array above, so only the heap-allocated tensors are freed.
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(dev_src);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(src);
}
3844
3845
TEST_CASE("scalar mul [[1, 2, 3], [4, 5, 6]] * 0.5, int")
{
  // NOTE(review): the guard checks MUL_FORWARD while the command executed below is
  // SCALAR_MUL_FORWARD - confirm this is the intended availability check.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // 32-bit integer host tensors: the input, the GPU result (c) and the CPU reference (ct).
  ccv_nnc_tensor_t* const src = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 3), 0);
  ccv_nnc_tensor_t* const ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 3), 0);
  int k;
  for (k = 0; k < 6; k++)
    src->data.i32[k] = k + 1; // 1..6
  ccv_nnc_tensor_t* const dev_src = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2, 3), 0);
  // GPU path: copy in, scale by 0.5, copy out.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dev_src), 0);
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_src), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(c), 0);
  // Reference path: the same command executed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(ct), 0);
  REQUIRE_TENSOR_EQ(c, ct, "result should be equal");
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(dev_src);
  ccv_nnc_tensor_free(ct);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(src);
}
3870
3871
TEST_CASE("compare average pooling with mps")
{
  // Only run when the MPS backend reports support for AVERAGE_POOL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with one 5x5 average-pool node from GPU tensor "x" (7x7x10) to "y" (3x3x10).
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
  ccv_nnc_tensor_symbol_t y_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
  ccv_nnc_graph_exec_symbol_t pool_node = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x_symbol), TENSOR_SYMBOL_LIST(y_symbol), "avg_pool");
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, pool_node, HINT((2, 2), (1, 1)));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph together with its tensor arena and exec arena.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* arena = 0;
  ccv_nnc_graph_exec_arena_t* exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &arena, &exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random host input drawn from a generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  ccv_nnc_tensor_t* const host_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
  int idx;
  for (idx = 0; idx < 7 * 7 * 10; idx++)
    host_x->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  // Copy the input into the arena tensor bound to "x" and run the whole graph.
  ccv_nnc_tensor_t* const arena_x = ccv_nnc_tensor_from_symbol(arena, x_symbol);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_x), TENSOR_LIST(arena_x), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference: the same command and hint executed directly on host tensors.
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(host_x), TENSOR_LIST(y_tensor), 0);
  // Copy the graph output bound to "y" back to the host and compare with the reference.
  ccv_nnc_tensor_t* const arena_y = ccv_nnc_tensor_from_symbol(arena, y_symbol);
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(arena_y), TENSOR_LIST(cpu_y), 0);
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should equal to cpu result");
  // Graph objects first, then the standalone host tensors; arena_x / arena_y are not freed individually.
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(arena);
  ccv_nnc_graph_exec_arena_free(exec_arena);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(cpu_y);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(host_x);
}
3909
3910
TEST_CASE("compare average pooling with mps in half precision")
{
  // Only run when the MPS backend reports support for AVERAGE_POOL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_AVERAGE_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with one 5x5 average-pool node over 16F GPU tensors "x" (7x7x10) and "y" (3x3x10).
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
  ccv_nnc_tensor_symbol_t y_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
  ccv_nnc_graph_exec_symbol_t pool_node = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_AVERAGE_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x_symbol), TENSOR_SYMBOL_LIST(y_symbol), "avg_pool");
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, pool_node, HINT((2, 2), (1, 1)));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph together with its tensor arena and exec arena.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* arena = 0;
  ccv_nnc_graph_exec_arena_t* exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &arena, &exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random 32F host input drawn from a generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  ccv_nnc_tensor_t* const host_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
  int idx;
  for (idx = 0; idx < 7 * 7 * 10; idx++)
    host_x->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  // Convert the input to 16F on the host, copy it into the arena tensor bound to "x", run the graph.
  ccv_nnc_tensor_t* const host_x16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
  ccv_nnc_tensor_t* const arena_x = ccv_nnc_tensor_from_symbol(arena, x_symbol);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_x), TENSOR_LIST(host_x16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_x16), TENSOR_LIST(arena_x), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference: the same command and hint executed on the 32F host input.
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_AVERAGE_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(host_x), TENSOR_LIST(y_tensor), 0);
  // Copy the 16F graph output back to the host and widen it to 32F for comparison.
  ccv_nnc_tensor_t* const arena_y = ccv_nnc_tensor_from_symbol(arena, y_symbol);
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_tensor_t* const host_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(arena_y), TENSOR_LIST(host_y16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_y16), TENSOR_LIST(cpu_y), 0);
  // The graph ran in 16F while the reference ran in 32F, so compare with a 1e-3 tolerance.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "mps result should equal to cpu result");
  // Graph objects first, then the standalone host tensors; arena_x / arena_y are not freed individually.
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(arena);
  ccv_nnc_graph_exec_arena_free(exec_arena);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(host_y16);
  ccv_nnc_tensor_free(cpu_y);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(host_x16);
  ccv_nnc_tensor_free(host_x);
}
3954
3955
TEST_CASE("compare max pooling with mps")
{
  // Only run when the MPS backend reports support for MAX_POOL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with one 5x5 max-pool node from GPU tensor "x" (7x7x10) to "y" (3x3x10).
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 7, 7, 10), "x");
  ccv_nnc_tensor_symbol_t y_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 3, 3, 10), "y");
  ccv_nnc_graph_exec_symbol_t pool_node = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x_symbol), TENSOR_SYMBOL_LIST(y_symbol), "max_pool");
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, pool_node, HINT((2, 2), (1, 1)));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph together with its tensor arena and exec arena.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* arena = 0;
  ccv_nnc_graph_exec_arena_t* exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &arena, &exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random host input drawn from a generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  ccv_nnc_tensor_t* const host_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
  int idx;
  for (idx = 0; idx < 7 * 7 * 10; idx++)
    host_x->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  // Copy the input into the arena tensor bound to "x" and run the whole graph.
  ccv_nnc_tensor_t* const arena_x = ccv_nnc_tensor_from_symbol(arena, x_symbol);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_x), TENSOR_LIST(arena_x), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference: the same command and hint executed directly on host tensors.
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(host_x), TENSOR_LIST(y_tensor), 0);
  // Copy the graph output bound to "y" back to the host and compare with the reference.
  ccv_nnc_tensor_t* const arena_y = ccv_nnc_tensor_from_symbol(arena, y_symbol);
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(arena_y), TENSOR_LIST(cpu_y), 0);
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should equal to cpu result");
  // Graph objects first, then the standalone host tensors; arena_x / arena_y are not freed individually.
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(arena);
  ccv_nnc_graph_exec_arena_free(exec_arena);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(cpu_y);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(host_x);
}
3993
3994
TEST_CASE("compare max pooling with mps in half precision")
{
  // Only run when the MPS backend reports support for MAX_POOL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with one 5x5 max-pool node over 16F GPU tensors "x" (7x7x10) and "y" (3x3x10).
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 7, 7, 10), "x");
  ccv_nnc_tensor_symbol_t y_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 3, 3, 10), "y");
  ccv_nnc_graph_exec_symbol_t pool_node = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(5, 5), TENSOR_SYMBOL_LIST(x_symbol), TENSOR_SYMBOL_LIST(y_symbol), "max_pool");
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, pool_node, HINT((2, 2), (1, 1)));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph together with its tensor arena and exec arena.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* arena = 0;
  ccv_nnc_graph_exec_arena_t* exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &arena, &exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random 32F host input drawn from a generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  ccv_nnc_tensor_t* const host_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 7, 7, 10), 0);
  int idx;
  for (idx = 0; idx < 7 * 7 * 10; idx++)
    host_x->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  // Convert the input to 16F on the host, copy it into the arena tensor bound to "x", run the graph.
  ccv_nnc_tensor_t* const arena_x = ccv_nnc_tensor_from_symbol(arena, x_symbol);
  ccv_nnc_tensor_t* const host_x16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 7, 7, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_x), TENSOR_LIST(host_x16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_x16), TENSOR_LIST(arena_x), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference: the same command and hint executed on the 32F host input.
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(5, 5), HINT((2, 2), (1, 1)), 0, TENSOR_LIST(host_x), TENSOR_LIST(y_tensor), 0);
  // Copy the 16F graph output back to the host and widen it to 32F for comparison.
  ccv_nnc_tensor_t* const arena_y = ccv_nnc_tensor_from_symbol(arena, y_symbol);
  ccv_nnc_tensor_t* const host_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 3, 10), 0);
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(arena_y), TENSOR_LIST(host_y16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_y16), TENSOR_LIST(cpu_y), 0);
  // The graph ran in 16F while the reference ran in 32F, so compare with a 1e-3 tolerance.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 3 * 3 * 10, 1e-3, "mps result should equal to cpu result");
  // Graph objects first, then the standalone host tensors; arena_x / arena_y are not freed individually.
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(arena);
  ccv_nnc_graph_exec_arena_free(exec_arena);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(host_y16);
  ccv_nnc_tensor_free(cpu_y);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(host_x16);
  ccv_nnc_tensor_free(host_x);
}
4038
4039
TEST_CASE("compare max pooling 2x2 with mps")
{
  // Only run when the MPS backend reports support for MAX_POOL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with one 2x2 max-pool node over NCHW GPU tensors "x" (10x6x6) and "y" (10x3x3).
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 6, 6), "x");
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 10, 3, 3), "y");
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph together with its tensor arena and exec arena.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random NCHW host input drawn from a generator seeded with 0.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
  int i, j;
  for (i = 0; i < 6 * 6 * 10; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Ground-truth input: the same data rearranged channel-last (NHWC) for the host reference run.
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
  for (i = 0; i < 10; i++)
    for (j = 0; j < 6 * 6; j++)
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
  // Copy the NCHW input into the arena tensor bound to "x" and run the whole graph.
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference: the same command and hint executed on the NHWC host tensors.
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
  // Copy the graph output bound to "y" back to the host (still NCHW).
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y), 0);
  // Rearrange the NHWC reference back to NCHW so both results share one layout.
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
  for (i = 0; i < 10; i++)
    for (j = 0; j < 3 * 3; j++)
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
  REQUIRE_TENSOR_EQ(y_tensor, cpu_y, "mps result should equal to cpu result");
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(x_tensor);
  // gt_x and gt_y were previously never released; free them with the other host tensors.
  ccv_nnc_tensor_free(gt_x);
  ccv_nnc_tensor_free(gt_y);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(cpu_y);
}
4085
4086
TEST_CASE("compare max pooling 2x2 with mps in half precision")
{
  // Only run when the MPS backend reports support for MAX_POOL_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MAX_POOL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Symbolic graph with one 2x2 max-pool node over 16F NCHW GPU tensors "x" (10x6x6) and "y" (10x3x3).
  ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 6, 6), "x");
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 10, 3, 3), "y");
  ccv_nnc_graph_exec_symbol_t max_pool = ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_MAX_POOL_FORWARD(2, 2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "max_pool");
  ccv_nnc_graph_exec_symbol_set_hint(symbolic_graph, max_pool, HINT((2, 2), (0, 0)));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile into a concrete graph together with its tensor arena and exec arena.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random 32F NCHW host input drawn from a generator seeded with 0.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 6, 6), 0);
  int i, j;
  for (i = 0; i < 6 * 6 * 10; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Ground-truth input: the same data rearranged channel-last (NHWC) for the host reference run.
  ccv_nnc_tensor_t* const gt_x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 6, 6, 10), 0);
  for (i = 0; i < 10; i++)
    for (j = 0; j < 6 * 6; j++)
      gt_x->data.f32[j * 10 + i] = x_tensor->data.f32[i * 6 * 6 + j];
  // Convert the NCHW input to 16F on the host, copy it into the arena tensor bound to "x", run the graph.
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 6, 6), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(xt), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Reference: the same command and hint executed on the 32F NHWC host tensors.
  ccv_nnc_tensor_t* const gt_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 3, 10), 0);
  ccv_nnc_cmd_exec(CMD_MAX_POOL_FORWARD(2, 2), HINT((2, 2), (0, 0)), 0, TENSOR_LIST(gt_x), TENSOR_LIST(gt_y), 0);
  // Copy the 16F graph output back to the host and widen it to 32F (still NCHW).
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
  ccv_nnc_tensor_t* const cpu_y16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 10, 3, 3), 0);
  ccv_nnc_tensor_t* const cpu_y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(cpu_y16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_y16), TENSOR_LIST(cpu_y), 0);
  // Rearrange the NHWC reference back to NCHW so both results share one layout.
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 3, 3), 0);
  for (i = 0; i < 10; i++)
    for (j = 0; j < 3 * 3; j++)
      y_tensor->data.f32[i * 3 * 3 + j] = gt_y->data.f32[j * 10 + i];
  // The graph ran in 16F while the reference ran in 32F, so compare with a 1e-3 tolerance.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, y_tensor->data.f32, cpu_y->data.f32, 10 * 3 * 3, 1e-3, "mps result should equal to cpu result");
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(x16_tensor);
  // gt_x and gt_y were previously never released; free them with the other host tensors.
  ccv_nnc_tensor_free(gt_x);
  ccv_nnc_tensor_free(gt_y);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(cpu_y);
  ccv_nnc_tensor_free(cpu_y16);
}
4138
4139
4140
TEST_CASE("mps mse mean loss forward")
{
  // Only run when the MPS backend reports support for MSE_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS));
  // GPU tensors: two 10x100 inputs and a 10x1 loss output.
  ccv_nnc_tensor_t* dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 1), 0);
  // Host tensors with the same shapes; hc holds the host-computed reference.
  ccv_nnc_tensor_t* host_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* host_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
  // Fill host_a completely, then host_b, from a generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int idx;
  for (idx = 0; idx < 1000; idx++)
    host_a->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (idx = 0; idx < 1000; idx++)
    host_b->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  // Copy the inputs to the GPU tensors, then run the mean-reduced MSE on host and on GPU tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  // Copy the GPU result back into tc and compare with the host reference.
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(tc), 0);
  REQUIRE_TENSOR_EQ(tc, hc, "MPS computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(tc);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(dev_c);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_a);
}
4170
4171
TEST_CASE("mps mse sum loss forward")
{
  // Only run when the MPS backend reports support for MSE_FORWARD.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS));
  // GPU tensors: two 10x100 inputs and a 10x1 loss output.
  ccv_nnc_tensor_t* dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 1), 0);
  // Host tensors with the same shapes; hc holds the host-computed reference.
  ccv_nnc_tensor_t* host_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* host_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
  // Fill host_a completely, then host_b, from a generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int idx;
  for (idx = 0; idx < 1000; idx++)
    host_a->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (idx = 0; idx < 1000; idx++)
    host_b->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Copy the inputs to the GPU tensors, then run the sum-reduced MSE on host and on GPU tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  // Copy the GPU result back into tc and compare with the host reference.
  ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(tc), 0);

  REQUIRE_TENSOR_EQ(tc, hc, "MPS computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(tc);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(dev_c);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_a);
}
4203
4204
TEST_CASE("mps mse mean loss backward")
{
  // Skip unless the MPS backend provides both MSE forward and MSE backward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host-side inputs, loss, input gradients and incoming gradient.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  // Device-side counterparts with the same shapes.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
  // Host tensors that receive the device gradients for comparison.
  ccv_nnc_tensor_t* const tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);

  // One generator seeded with 0 feeds both inputs; ha is drawn completely before hb.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int input_count = 1000;
  const int grad_count = 10;
  float* const ha_values = ha->data.f32;
  float* const hb_values = hb->data.f32;
  float* const hg_values = hg->data.f32;
  int k;
  for (k = 0; k < input_count; k++)
    ha_values[k] = dsfmt_genrand_open_close(&rng);
  for (k = 0; k < input_count; k++)
    hb_values[k] = dsfmt_genrand_open_close(&rng);
  // The incoming gradient is all ones.
  for (k = 0; k < grad_count; k++)
    hg_values[k] = 1;

  // Reference forward and backward on the host tensors.
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, hdb), 0);
  // Upload the inputs and run the same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_MEAN), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, db), 0);
  // Download both gradients.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, db), TENSOR_LIST(tda, tdb), 0);

  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
  REQUIRE_TENSOR_EQ(tdb, hdb, "MPS computed output should be the same as CPU computed ones");

  // Release every tensor created by this test.
  ccv_nnc_tensor_free(tdb);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hdb);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
}
4257
4258
TEST_CASE("mps mse sum loss backward")
{
  // Skip unless the MPS backend provides both MSE forward and MSE backward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Device-side inputs, loss, input gradients and incoming gradient.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
  // Host-side counterparts used to compute the reference values.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  // Seed the generator with 0, then draw all of ha followed by all of hb.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int idx = 0;
  while (idx < 1000)
  {
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);
    idx++;
  }
  idx = 0;
  while (idx < 1000)
  {
    hb->data.f32[idx] = dsfmt_genrand_open_close(&rng);
    idx++;
  }
  // The incoming gradient is all ones.
  for (idx = 0; idx < 10; idx++)
    hg->data.f32[idx] = 1;
  // Upload inputs and the incoming gradient to the device tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
  // Reference forward and backward on the host tensors.
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, hdb), 0);
  // The same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, db), 0);
  // Download both device gradients into fresh host tensors and compare.
  ccv_nnc_tensor_t* const tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const tdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, db), TENSOR_LIST(tda, tdb), 0);
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
  REQUIRE_TENSOR_EQ(tdb, hdb, "MPS computed output should be the same as CPU computed ones");
  // Release device tensors, then host tensors, then the downloaded copies.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdb);
}
4308
4309
4310
TEST_CASE("mps mse sum loss backward (no output db)")
{
  // Skip unless the MPS backend provides both MSE forward and MSE backward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Device-side inputs, loss, gradient w.r.t. a, and incoming gradient.
  // No db tensor exists: the backward command is run with its second output set to 0.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
  // Host-side counterparts used to compute the reference values.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  // Seed the generator with 0, then draw all of ha followed by all of hb.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 1000; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 1000; i++)
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // The incoming gradient is all ones.
  for (i = 0; i < 10; i++)
    hg->data.f32[i] = 1;
  // Upload inputs and the incoming gradient to the device tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
  // Reference forward and backward on the host tensors (second backward output omitted).
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hda, 0), 0);
  // The same two commands on the device tensors (second backward output omitted).
  ccv_nnc_cmd_exec(CMD_MSE_FORWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_MSE_BACKWARD(CCV_NNC_MSE_REDUCE_SUM), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(da, 0), 0);
  // Download the only gradient this test produces. A host-side tdb used to be
  // allocated here as well, but nothing ever wrote to or compared it, so it is gone.
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, 0), TENSOR_LIST(tda, 0), 0);
  REQUIRE_TENSOR_EQ(tda, hda, "MPS computed output should be the same as CPU computed ones");
  // Release every tensor created by this test.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(tda);
}
4355
4356
TEST_CASE("mps leaky relu gradient in float")
{
  // Skip unless the MPS backend provides both leaky relu forward and backward.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LEAKY_RELU_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_LEAKY_RELU_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Symbolic graph: y = leaky_relu(x) with slope argument 0.2, then its backward pass w.r.t. x.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LEAKY_RELU_FORWARD(0.2), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "leaky relu");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);

  // Host input: 10 x 100 values drawn from a generator seeded with 0.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int k;
  for (k = 0; k < 10 * 100; k++)
    x_tensor->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Host incoming gradient: zero everywhere except element (r, r) of each of the 10 rows.
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  int row, col;
  for (row = 0; row < 10; row++)
    for (col = 0; col < 100; col++)
      dy_tensor->data.f32[row * 100 + col] = (row == col) ? 1 : 0;

  // Device copy of the incoming gradient, bound to the dy symbol at compile time.
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dyt), 0);

  // Compile to a concrete graph, upload x into the arena tensor, and run everything.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);

  // Download dx and y from the arena into host tensors.
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);

  // Host forward pass on the same x must reproduce the graph's y.
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_LEAKY_RELU_FORWARD(0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(ty_tensor), 0);
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");

  // Host backward pass, fed dy and the downloaded y, must reproduce the graph's dx.
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_LEAKY_RELU_BACKWARD(0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor, 0, y_tensor), TENSOR_LIST(tdx_tensor), 0);
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");

  // Release standalone tensors first, then the compiled graph, arenas and symbolic graph.
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(ty_tensor);
  ccv_nnc_tensor_free(tdx_tensor);
  ccv_nnc_tensor_free(dyt);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
4415
4416
TEST_CASE("compare layer norm gradient with mps")
{
  // Skip unless the MPS backend provides layer norm forward/backward and SET forward.
  // (The SET check used to be OR-ed with an identical copy of itself.)
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Device-side symbolic graph: layer norm forward, then backward w.r.t. x, scale and bias.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host copy of x: random values scaled by 100, drawn from a generator seeded with 1.
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  // Scale and bias live in stack arrays so the CPU graph below can reuse the exact same values.
  float scaledata[1 * 2 * 2 * LN_DIM];
  float biasdata[1 * 2 * 2 * LN_DIM];
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);

  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
  // Incoming gradient: each value is a generator draw mapped through v * 2 - 1.
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the three gradients to host tensors before the device graph and arena are freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // Reference: the same graph declared on CPU tensors and fed the same x, dy, scale and bias.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Copy the host-side inputs straight into the CPU arena tensors.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from mps should match the one from reference implementation");
  REQUIRE_TENSOR_EQ(dbias_tensor, dcbias_tensor, "layer norm bias gradient result from mps should match the one from reference implementation");
  // Release the CPU graph objects, then the standalone host tensors.
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dscale_tensor);
  ccv_nnc_tensor_free(dbias_tensor);
}
4522
4523
TEST_CASE("compare layer norm gradient with mps (no bias)")
{
  // Skip unless the MPS backend provides layer norm forward/backward and SET forward.
  // (The SET check used to be OR-ed with an identical copy of itself.)
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Device-side symbolic graph: layer norm forward still takes bias as an input,
  // but the backward pass is only requested w.r.t. x and scale.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host copy of x: random values scaled by 100, drawn from a generator seeded with 1.
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  // Scale and bias live in stack arrays so the CPU graph below can reuse the exact same values.
  float scaledata[1 * 2 * 2 * LN_DIM];
  float biasdata[1 * 2 * 2 * LN_DIM];
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);

  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
  // Incoming gradient: each value is a generator draw mapped through v * 2 - 1.
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the x and scale gradients to host tensors before the device graph and arena are freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // Reference: the same graph declared on CPU tensors.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Copy the host-side inputs straight into the CPU arena tensors.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
  // The forward node still lists cbias as an input even though no bias gradient is
  // requested. Feed it the same values the device graph received, so the CPU
  // forward pass does not run on whatever the arena happens to hold for bias.
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "layer norm scale gradient result from mps should match the one from reference implementation");
  // Release the CPU graph objects, then the standalone host tensors.
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dscale_tensor);
}
4621
4622
// Runs layer norm forward + backward (built without scale / bias inputs) once on
// device tensors and once on CPU tensors, feeding both the same x and dy, and
// requires the resulting input gradients to be equal.
TEST_CASE("compare layer norm gradient with mps without scale / bias")
{
  // Skip unless the MPS backend provides every command this test checks for.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Device-side graph: layer norm taking only x as input (no scale / bias symbols).
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Derive the backward pass of y with respect to x, then re-run autogen on the extended graph.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  // Fixed seed so every run generates the same x and dy.
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;

  // Copy x from the CPU tensor into the device tensor bound to bx.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  // Copy the incoming gradient dy into the device tensor bound to dby.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the device result dx back into a CPU tensor before the arena is freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // CPU-side graph with the same shapes and command parameters, used as the comparison target.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Feed the CPU graph the exact x and dy values that were sent to the device graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
  // dcx_tensor belongs to cpu_tensor_arena, so the arena is released only after the comparison.
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
}
4697
4698
// Runs layer norm forward + backward (built without scale / bias inputs) once on
// device tensors and once on CPU tensors, feeding both the same x and dy, and
// requires the resulting input gradients to be equal.
// NOTE(review): apart from its name, this body matches the test
// "compare layer norm gradient with mps without scale / bias" statement for
// statement; confirm whether a distinct "no bias" configuration was intended.
TEST_CASE("compare layer norm gradient with mps (no bias) without scale / bias")
{
  // Skip unless the MPS backend provides every command this test checks for.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Device-side graph: layer norm taking only x as input (no scale / bias symbols).
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Derive the backward pass of y with respect to x, then re-run autogen on the extended graph.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  // Fixed seed so every run generates the same x and dy.
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;

  // Copy x from the CPU tensor into the device tensor bound to bx.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  // Copy the incoming gradient dy into the device tensor bound to dby.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the device result dx back into a CPU tensor before the arena is freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // CPU-side graph with the same shapes and command parameters, used as the comparison target.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_LAYER_NORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "layer_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Feed the CPU graph the exact x and dy values that were sent to the device graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "layer norm gradient result from mps should match the one from reference implementation");
  // dcx_tensor belongs to cpu_tensor_arena, so the arena is released only after the comparison.
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
}
4773
4774
// Runs RMSNorm forward + backward with a scale input once on device tensors and
// once on CPU tensors, feeding both the same x, scale and dy, and requires the
// gradients with respect to x and scale to be equal.
TEST_CASE("compare rmsnorm gradient with mps")
{
  // Skip unless the MPS backend provides every command this test checks for.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Device-side graph: rmsnorm taking x and scale as inputs.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, 2, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(bx, scale), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Derive the backward pass of y with respect to x and scale, then re-run autogen.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  // Fixed seed so every run generates the same x, scale and dy.
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;

  // Copy x from the CPU tensor into the device tensor bound to bx.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  // scaledata stays on the stack so it can be reused verbatim for the CPU graph below.
  float scaledata[1 * 2 * 2 * LN_DIM];
  for (i = 0; i < 1 * 2 * 2 * LN_DIM; i++)
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);

  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale)), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  // Copy the incoming gradient dy into the device tensor bound to dby.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy both device gradients (dx and dscale) back into CPU tensors before the arena is freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // CPU-side graph with the same shapes and command parameters, used as the comparison target.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, 2, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
  // Labelled "rmsnorm" to match the device graph (this node was previously named "layer_norm").
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 1, 1, 2, 3), TENSOR_SYMBOL_LIST(cx, cscale), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Feed the CPU graph the exact x, dy and scale values that were sent to the device graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * 2 * 2 * LN_DIM);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  // Failure messages name rmsnorm so a failing run is not mistaken for a layer norm regression.
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "rmsnorm gradient result from mps should match the one from reference implementation");
  REQUIRE_TENSOR_EQ(dscale_tensor, dcscale_tensor, "rmsnorm scale gradient result from mps should match the one from reference implementation");
  // dcx_tensor / dcscale_tensor belong to cpu_tensor_arena, so it is released only after the comparisons.
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dscale_tensor);
}
4864
4865
// Runs RMSNorm forward + backward (built without a scale input) once on device
// tensors and once on CPU tensors, feeding both the same x and dy, and requires
// the resulting input gradients to be equal.
TEST_CASE("compare rmsnorm gradient with mps without scale")
{
  // Skip unless the MPS backend provides every command this test checks for.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Device-side graph: rmsnorm taking only x as input (no scale symbol).
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, 1, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_inv_std), "rmsnorm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Derive the backward pass of y with respect to x, then re-run autogen on the extended graph.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  // Fixed seed so every run generates the same x and dy.
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;

  // Copy x from the CPU tensor into the device tensor bound to bx.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * 2 * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  // Copy the incoming gradient dy into the device tensor bound to dby.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the device result dx back into a CPU tensor before the arena is freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // CPU-side graph with the same shapes and command parameters, used as the comparison target.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 2, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), "saved_inv_std");
  // Labelled "rmsnorm" to match the device graph (this node was previously named "layer_norm").
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_RMSNORM_FORWARD(1e-4, 0, 1, 2, 3), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_inv_std), "rmsnorm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Feed the CPU graph the exact x and dy values that were sent to the device graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * 2 * 2 * LN_DIM);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  // Failure message names rmsnorm so a failing run is not mistaken for a layer norm regression.
  REQUIRE_TENSOR_EQ(dx_tensor, dcx_tensor, "rmsnorm gradient result from mps should match the one from reference implementation");
  // dcx_tensor belongs to cpu_tensor_arena, so the arena is released only after the comparison.
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
}
4938
4939
// Runs CMD_CONVOLUTION_BACKWARD once on the CPU reference backend (NHWC tensors)
// and once on the MPS backend with every GPU operand converted to NCHW, then
// requires the three outputs (h, dw, dbias) to agree within tolerance.
// The test is skipped when the MPS backend does not provide the command.
TEST_CASE("mps backward convolution in nchw format")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
  // configure the inlets (fixed seed, so every run sees the same values).
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
  // NCHW counterparts that the MPS command actually operates on.
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
  // Reference result on the CPU.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
  cmd.backend = CCV_NNC_BACKEND_MPS;

  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);

  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
  // Execute outside of assert() so the command still runs when NDEBUG compiles assertions out.
  const int mps_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
  assert(mps_status == CCV_NNC_EXEC_SUCCESS);
  (void)mps_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  // Same dimensions as gdbias, which is copied into it below.
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);

  // Bring the MPS results back to NHWC, then over to the CPU for comparison.
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);

  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(gao);
  ccv_nnc_tensor_free(ggo);
  ccv_nnc_tensor_free(gho);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(gdwo);
  ccv_nnc_tensor_free(gdbiaso);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(gdw);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(ch);
  ccv_nnc_tensor_free(cdw);
  ccv_nnc_tensor_free(cdbias);
}
5023
5024
// Runs CMD_CONVOLUTION_BACKWARD once on the CPU reference backend and once on
// the MPS backend, then requires the three outputs (h, dw, dbias) to agree
// within tolerance. On the GPU side the activations and gradients stay in
// NHWC; only the weight tensor and its gradient are passed in NCHW.
// The test is skipped when the MPS backend does not provide the command.
TEST_CASE("mps backward convolution in nhwc format")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, g->info);
  assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, g->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
  // configure the inlets (fixed seed, so every run sees the same values).
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
  // NCHW copies of the weight and weight-gradient tensors used by the MPS command.
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
  // Reference result on the CPU.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), 0);
  cmd.backend = CCV_NNC_BACKEND_MPS;

  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);

  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(gg, ga, gwo), TENSOR_LIST(gh, gdwo, gdbias), stream_context);
  // Execute outside of assert() so the command still runs when NDEBUG compiles assertions out.
  const int mps_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(gg, ga, gwo), TENSOR_LIST(gh, gdwo, gdbias), stream_context);
  assert(mps_status == CCV_NNC_EXEC_SUCCESS);
  (void)mps_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);

  // Only the weight gradient needs converting back to NHWC before the copy to the CPU.
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdwo), TENSOR_LIST(gdw), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);

  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");

  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(gdwo);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(gdw);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(ch);
  ccv_nnc_tensor_free(cdw);
  ccv_nnc_tensor_free(cdbias);
}
5100
5101
// Same comparison as the plain NCHW backward convolution test, but with the
// convolution dilation set to 2 and 3 on the two kernel axes. The hint is
// derived from a copy of the command parameters whose kernel size is enlarged
// to (size - 1) * dilation + 1 on each axis.
// The test is skipped when the MPS backend does not provide the command.
TEST_CASE("mps backward convolution in nchw format with dilation 2, 3")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_BACKWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
  cmd.info.convolution.dilation[0] = 2;
  cmd.info.convolution.dilation[1] = 3;
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  // Hint computation works on the dilated kernel extent rather than the raw kernel size.
  ccv_nnc_cmd_param_t modified_cmd = cmd.info;
  modified_cmd.size.dim[0] = (cmd.info.size.dim[0] - 1) * ccv_max(cmd.info.convolution.dilation[0], 1) + 1;
  modified_cmd.size.dim[1] = (cmd.info.size.dim[1] - 1) * ccv_max(cmd.info.convolution.dilation[1], 1) + 1;
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(modified_cmd, a->info, g->info);
  assert(ccv_nnc_hint_verify(hint, modified_cmd, a->info, g->info) == 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);
  // configure the inlets (fixed seed, so every run sees the same values).
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    g->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / OUTPUT_DIM; // (OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM);
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gdw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, 1, OUTPUT_DIM), 0);
  // NCHW counterparts that the MPS command actually operates on.
  ccv_nnc_tensor_t* gao = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_tensor_t* ggo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* gho = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gdwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gdbiaso = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 1, OUTPUT_DIM, 1, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, g), TENSOR_LIST(ga, gw, gg), 0);
  // Reference result on the CPU.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gw, gg), TENSOR_LIST(gao, gwo, ggo), 0);
  cmd.backend = CCV_NNC_BACKEND_MPS;

  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);

  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
  // Execute outside of assert() so the command still runs when NDEBUG compiles assertions out.
  const int mps_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ggo, gao, gwo), TENSOR_LIST(gho, gdwo, gdbiaso), stream_context);
  assert(mps_status == CCV_NNC_EXEC_SUCCESS);
  (void)mps_status;
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* cdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  // Same dimensions as gdbias, which is copied into it below.
  ccv_nnc_tensor_t* cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, OUTPUT_DIM), 0);

  // Bring the MPS results back to NHWC, then over to the CPU for comparison.
  ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gho, gdwo, gdbiaso), TENSOR_LIST(gh, gdw, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdw, gdbias), TENSOR_LIST(ch, cdw, cdbias), 0);

  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw->data.f32, cdw->data.f32, INPUT_DIM * OUTPUT_DIM * KERNEL_SIZE * KERNEL_SIZE, 5e-1, "output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias->data.f32, cdbias->data.f32, OUTPUT_DIM, 5e-1, "output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, h->data.f32, ch->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(gao);
  ccv_nnc_tensor_free(ggo);
  ccv_nnc_tensor_free(gho);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(gdwo);
  ccv_nnc_tensor_free(gdbiaso);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(gdw);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(ch);
  ccv_nnc_tensor_free(cdw);
  ccv_nnc_tensor_free(cdbias);
}
5190
5191
// Builds the same group norm forward + backward symbolic graph twice, once with
// GPU tensors and once with CPU tensors, feeds both the same x, scale, bias and
// dy values, runs them, and requires the gradients w.r.t. x, scale and bias to
// agree within 1e-5.
// The test is skipped unless the MPS backend provides group norm forward,
// group norm backward and set forward.
TEST_CASE("compare group norm gradient with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // GPU graph.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  // Same command parameters as the CPU graph below, so both sides are guaranteed to match.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  // Generate x on the CPU (fixed seed) and copy it into the GPU graph.
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  // scale / bias live in stack arrays wrapped by tensor headers; they are reused for the CPU graph.
  float scaledata[1 * GN_C_DIM * 2 * LN_DIM];
  float biasdata[1 * GN_C_DIM * 2 * LN_DIM];
  for (i = 0; i < 1 * GN_C_DIM * 2 * LN_DIM; i++)
  {
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
  // Incoming gradient dy, values in (-1, 1].
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the GPU gradients into CPU tensors before the GPU arena is released.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // CPU graph with the same structure and inputs.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Feed the CPU graph the exact values that were sent to the GPU graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * GN_C_DIM * 2 * LN_DIM);

  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);

  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);

  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias_tensor->data.f32, dcbias_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");

  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dscale_tensor);
  ccv_nnc_tensor_free(dbias_tensor);
}
5300
5301
TEST_CASE("compare group norm gradient with mps, variant 1")
5302
1
{
5303
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5304
1
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
5305
1
    (ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS) || ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS)));
5306
0
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
5307
0
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
5308
0
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
5309
0
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 1, 1), "scale");
5310
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 1, 1), "bias");
5311
0
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
5312
0
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
5313
0
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
5314
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
5315
0
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale, bias), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
5316
0
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
5317
0
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
5318
0
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
5319
0
  ccv_nnc_graph_t* graph = 0;
5320
0
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
5321
0
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
5322
0
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
5323
0
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
5324
0
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
5325
0
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
5326
0
  dsfmt_t dsfmt;
5327
0
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
5328
0
  int i;
5329
0
  dsfmt_init_gen_rand(&dsfmt, 1);
5330
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
5331
0
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
5332
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
5333
0
  float scaledata[1 * GN_C_DIM * 1 * 1];
5334
0
  float biasdata[1 * GN_C_DIM * 1 * 1];
5335
0
  for (i = 0; i < 1 * GN_C_DIM * 1 * 1; i++)
5336
0
  {
5337
0
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
5338
0
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
5339
0
  }
5340
0
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
5341
0
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
5342
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
5343
  // ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
5344
0
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
5345
0
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
5346
0
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
5347
0
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
5348
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
5349
0
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
5350
0
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
5351
0
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
5352
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
5353
0
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
5354
0
  ccv_nnc_tensor_t* const dbbias_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bias));
5355
0
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
5356
0
  ccv_nnc_tensor_t* const dbias_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), 0);
5357
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor, dbbias_tensor), TENSOR_LIST(dscale_tensor, dbias_tensor), 0);
5358
0
  ccv_nnc_symbolic_graph_free(symbolic_graph);
5359
0
  ccv_nnc_tensor_arena_free(tensor_arena);
5360
0
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
5361
0
  ccv_nnc_graph_free(graph);
5362
0
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
5363
0
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
5364
0
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
5365
0
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), "scale");
5366
0
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 1, 1), "bias");
5367
0
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
5368
0
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
5369
0
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
5370
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
5371
0
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale, cbias), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
5372
0
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
5373
0
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
5374
0
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
5375
0
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
5376
0
  ccv_nnc_tensor_symbol_t dcbias = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cbias);
5377
0
  ccv_nnc_graph_t* cpu_graph = 0;
5378
0
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
5379
0
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
5380
0
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
5381
0
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
5382
0
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
5383
0
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
5384
0
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
5385
0
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
5386
0
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * 1 * GN_C_DIM * 1 * 1);
5387
0
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
5388
0
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * 1 * GN_C_DIM * 1 * 1);
5389
5390
0
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
5391
5392
0
  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
5393
0
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);
5394
0
  ccv_nnc_tensor_t* const dcbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcbias);
5395
5396
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");
5397
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias_tensor->data.f32, dcbias_tensor->data.f32, 1 * GN_C_DIM * 1 * 1, 1e-5, "group norm output from mps should match from CPU");
5398
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, 1 * GN_C_DIM * 1 * 1, 1e-5, "group norm output from mps should match from CPU");
5399
5400
0
  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
5401
0
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
5402
0
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
5403
0
  ccv_nnc_graph_free(cpu_graph);
5404
0
  ccv_nnc_tensor_free(x_tensor);
5405
0
  ccv_nnc_tensor_free(dy_tensor);
5406
0
  ccv_nnc_tensor_free(dx_tensor);
5407
0
  ccv_nnc_tensor_free(dscale_tensor);
5408
0
  ccv_nnc_tensor_free(dbias_tensor);
5409
0
}
5410
5411
// Runs group norm forward + backward on GPU (MPS) tensors and on CPU tensors
// with identical inputs, requesting gradients only for x and scale (no bias
// gradient), and requires the resulting dx and dscale to agree within 1e-5.
TEST_CASE("compare group norm gradient with mps (no dbias)")
{
  // The original guard OR-ed two identical SET_FORWARD checks; one is enough.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Element counts reused by the fill loops, copies and comparisons below.
  enum {
    X_COUNT = 2 * GN_C_DIM * 2 * LN_DIM, // x, y and their gradients
    AFFINE_COUNT = 1 * GN_C_DIM * 2 * LN_DIM, // scale, bias and dscale
  };

  // Build and compile the forward + backward graph on GPU tensors.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t scale = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  // Use GN_RC_DIM for the group count so both graphs are configured from the same constant.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(bx, scale, bias), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Only x and scale are listed here, so no gradient is requested for bias.
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx, scale), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Generate the inputs on the host and transfer them into the GPU arena.
  // The order of the dsfmt draws (x, then interleaved scale / bias, then dy)
  // is kept as-is so the generated values stay the same.
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < X_COUNT; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  float scaledata[AFFINE_COUNT];
  float biasdata[AFFINE_COUNT];
  for (i = 0; i < AFFINE_COUNT; i++)
  {
    scaledata[i] = dsfmt_genrand_open_close(&dsfmt);
    biasdata[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  ccv_nnc_tensor_t scale_tensor = ccv_nnc_tensor(scaledata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t bias_tensor = ccv_nnc_tensor(biasdata, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(&scale_tensor, &bias_tensor), TENSOR_LIST(ccv_nnc_tensor_from_symbol(tensor_arena, scale), ccv_nnc_tensor_from_symbol(tensor_arena, bias)), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < X_COUNT; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);

  // Run the GPU graph and copy the gradients back to host tensors before the
  // arena that owns the GPU-side tensors is freed.
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_tensor_t* const dbscale_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_for_backward(symbolic_graph, scale));
  ccv_nnc_tensor_t* const dscale_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbscale_tensor), TENSOR_LIST(dscale_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);

  // Build and compile the same graph on CPU tensors.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t cscale = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "scale");
  ccv_nnc_tensor_symbol_t cbias = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 1, GN_C_DIM, 2, LN_DIM), "bias");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 1), TENSOR_SYMBOL_LIST(cx, cscale, cbias), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx, cscale), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_tensor_symbol_t dcscale = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cscale);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);

  // Feed the CPU graph the exact host data that was sent to the GPU graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * X_COUNT);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * X_COUNT);
  ccv_nnc_tensor_t* const cscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cscale);
  memcpy(cscale_tensor->data.f32, scaledata, sizeof(float) * AFFINE_COUNT);
  // bias is still an input of the forward exec even though no bias gradient
  // is requested, so give it the same values the GPU graph received instead
  // of leaving the arena tensor unfilled.
  ccv_nnc_tensor_t* const cbias_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cbias);
  memcpy(cbias_tensor->data.f32, biasdata, sizeof(float) * AFFINE_COUNT);

  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);

  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);
  ccv_nnc_tensor_t* const dcscale_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcscale);

  // These compare gradients, so the failure messages say so.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, X_COUNT, 1e-5, "group norm gradient (dx) from mps should match from CPU");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dscale_tensor->data.f32, dcscale_tensor->data.f32, AFFINE_COUNT, 1e-5, "group norm gradient (dscale) from mps should match from CPU");

  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dscale_tensor);
}
5512
5513
// Runs group norm forward + backward without scale / bias inputs on GPU (MPS)
// tensors and on CPU tensors with identical x and dy, and requires the
// resulting dx to agree within 1e-5. saved_mean / saved_inv_std use the
// (2, GN_RC_DIM, 2, LN_DIM) shape in both graphs.
TEST_CASE("compare group norm gradient with mps without scale / bias")
{
  // The original guard OR-ed two identical SET_FORWARD checks; one is enough.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Element count of x, y and their gradients.
  enum { X_COUNT = 2 * GN_C_DIM * 2 * LN_DIM };

  // Build and compile the forward + backward graph on GPU tensors.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  // Use GN_RC_DIM for the group count so both graphs are configured from the same constant.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Generate x and dy on the host (x first, then dy, from one seeded dsfmt
  // stream) and transfer them into the GPU arena.
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < X_COUNT; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < X_COUNT; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);

  // Run the GPU graph and copy dx back to a host tensor before the arena that
  // owns the GPU-side tensor is freed.
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);

  // Build and compile the same graph on CPU tensors.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);

  // Feed the CPU graph the exact host data that was sent to the GPU graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * X_COUNT);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * X_COUNT);

  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);

  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  // This compares gradients, so the failure message says so.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, X_COUNT, 1e-5, "group norm gradient (dx) from mps should match from CPU");

  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
}
5591
5592
// Runs group norm forward + backward without scale / bias inputs on GPU (MPS)
// tensors and on CPU tensors with identical x and dy, and requires the
// resulting dx to agree within 1e-5. In this variant saved_mean /
// saved_inv_std use the (2, GN_RC_DIM, 1, 1) shape in both graphs.
TEST_CASE("compare group norm gradient with mps, variant 1 without scale / bias")
{
  // The original guard OR-ed two identical SET_FORWARD checks; one is enough.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Element count of x, y and their gradients.
  enum { X_COUNT = 2 * GN_C_DIM * 2 * LN_DIM };

  // Build and compile the forward + backward graph on GPU tensors.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
  // Use GN_RC_DIM for the group count so both graphs are configured from the same constant.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Generate x and dy on the host (x first, then dy, from one seeded dsfmt
  // stream) and transfer them into the GPU arena.
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < X_COUNT; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < X_COUNT; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);

  // Run the GPU graph and copy dx back to a host tensor before the arena that
  // owns the GPU-side tensor is freed.
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);

  // Build and compile the same graph on CPU tensors.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 1, 1), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);

  // Feed the CPU graph the exact host data that was sent to the GPU graph.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * X_COUNT);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * X_COUNT);

  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);

  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  // This compares gradients, so the failure message says so.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, X_COUNT, 1e-5, "group norm gradient (dx) from mps should match from CPU");

  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
}
5670
5671
// Builds the same group norm forward + backward graph twice, once with MPS tensors and once
// with CPU tensors, feeds both the same random x and the same random gradient for y, and
// requires the resulting gradients for x to agree within 1e-5.
TEST_CASE("compare group norm gradient with mps (no dbias) without scale / bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GROUP_NORM_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // MPS graph: group norm forward, then its autogenerated backward pass.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t bx = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t by = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t saved_mean = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t saved_inv_std = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  // Use GN_RC_DIM here so the MPS command is spelled exactly like the CPU one further down.
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(bx), TENSOR_SYMBOL_LIST(by, saved_mean, saved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(by), TENSOR_SYMBOL_LIST(bx), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dby = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, by);
  ccv_nnc_tensor_symbol_t dbx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, bx);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Random input x, generated on the host and copied into the MPS graph.
  ccv_nnc_tensor_t* const bx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, bx);
  dsfmt_t dsfmt;
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  int i;
  dsfmt_init_gen_rand(&dsfmt, 1);
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 100;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(bx_tensor), 0);
  // Random gradient for y, generated on the host and copied into the MPS graph.
  ccv_nnc_tensor_t* const dy_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_tensor_t* const dby_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dby);
  for (i = 0; i < 2 * GN_C_DIM * 2 * LN_DIM; i++)
    dy_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_tensor), TENSOR_LIST(dby_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Copy the MPS gradient for x back to the host before the arena that owns it is freed.
  ccv_nnc_tensor_t* const dbx_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, dbx);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dbx_tensor), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_graph_free(graph);
  // CPU graph: same structure, CPU tensors.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t cx = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "x");
  ccv_nnc_tensor_symbol_t cy = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_C_DIM, 2, LN_DIM), "y");
  ccv_nnc_tensor_symbol_t csaved_mean = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_mean");
  ccv_nnc_tensor_symbol_t csaved_inv_std = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 2, GN_RC_DIM, 2, LN_DIM), "saved_inv_std");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_GROUP_NORM_FORWARD(1, GN_RC_DIM, 1e-5, 0), TENSOR_SYMBOL_LIST(cx), TENSOR_SYMBOL_LIST(cy, csaved_mean, csaved_inv_std), "group_norm");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(cpu_symbolic_graph, TENSOR_SYMBOL_LIST(cy), TENSOR_SYMBOL_LIST(cx), SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_tensor_symbol_t dcy = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cy);
  ccv_nnc_tensor_symbol_t dcx = ccv_nnc_tensor_symbol_for_backward(cpu_symbolic_graph, cx);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_graph_exec_arena);
  // Feed the CPU graph the very same x and gradient for y.
  ccv_nnc_tensor_t* const cx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, cx);
  memcpy(cx_tensor->data.f32, x_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);
  ccv_nnc_tensor_t* const dcy_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcy);
  memcpy(dcy_tensor->data.f32, dy_tensor->data.f32, sizeof(float) * 2 * GN_C_DIM * 2 * LN_DIM);

  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);

  ccv_nnc_tensor_t* const dcx_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, dcx);

  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dx_tensor->data.f32, dcx_tensor->data.f32, 2 * GN_C_DIM * 2 * LN_DIM, 1e-5, "group norm output from mps should match from CPU");

  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_graph_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(dy_tensor);
  ccv_nnc_tensor_free(dx_tensor);
}
5749
5750
// Runs MUL_BACKWARD(0.5) with a (4, 1), b (2) and c (4, 2) as the first input, once on MPS
// tensors and once on the host tensors, and requires both output gradients to be equal.
TEST_CASE("broadcasting semantics for mul backward (a,b)")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  const float a_values[] = { 1, 2, 3, 4 };
  const float b_values[] = { 5, 6 };
  const float c_values[] = {
    6, 7,
    7, 8,
    8, 9,
    9, 10
  };
  memcpy(a->data.f32, a_values, sizeof(a_values));
  memcpy(b->data.f32, b_values, sizeof(b_values));
  memcpy(c->data.f32, c_values, sizeof(c_values));
  // Host outputs: da / db receive the MPS results, dat / dbt are computed from the host tensors.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, gdb), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, dbt), 0);

  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(dbt);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(gda);
  ccv_nnc_tensor_free(gdb);
}
5799
5800
// Same setup as the (a,b) variant, but only the first output (gradient of a) is requested:
// the second output slot of MUL_BACKWARD is passed as 0 on both the MPS and the host run.
TEST_CASE("broadcasting semantics for mul backward (a, nil)")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  const float a_values[] = { 1, 2, 3, 4 };
  const float b_values[] = { 5, 6 };
  const float c_values[] = {
    6, 7,
    7, 8,
    8, 9,
    9, 10
  };
  memcpy(a->data.f32, a_values, sizeof(a_values));
  memcpy(b->data.f32, b_values, sizeof(b_values));
  memcpy(c->data.f32, c_values, sizeof(c_values));
  // Host outputs: da receives the MPS result, dat is computed from the host tensors.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, 0), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, 0), TENSOR_LIST(da, 0), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, 0), 0);

  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(gda);
}
5842
5843
// Same setup as the (a,b) variant, but only the second output (gradient of b) is requested:
// the first output slot of MUL_BACKWARD is passed as 0 on both the MPS and the host run.
TEST_CASE("broadcasting semantics for mul backward (nil,b)")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  const float a_values[] = { 1, 2, 3, 4 };
  const float b_values[] = { 5, 6 };
  const float c_values[] = {
    6, 7,
    7, 8,
    8, 9,
    9, 10
  };
  memcpy(a->data.f32, a_values, sizeof(a_values));
  memcpy(b->data.f32, b_values, sizeof(b_values));
  memcpy(c->data.f32, c_values, sizeof(c_values));
  // Host outputs: db receives the MPS result, dbt is computed from the host tensors.
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(0, gdb), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, gdb), TENSOR_LIST(0, db), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(0, dbt), 0);

  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbt);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(gdb);
}
5885
5886
// MUL_BACKWARD(0.5) with the second output slot passed as 0: only the gradient of a is
// produced, on MPS and from the host tensors, and the two results are required to be equal.
TEST_CASE("broadcasting semantics for mul backward (no output db)")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs: a is (4, 1), b is (2), c is (4, 2).
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  const float a_values[] = { 1, 2, 3, 4 };
  const float b_values[] = { 5, 6 };
  const float c_values[] = {
    6, 7,
    7, 8,
    8, 9,
    9, 10
  };
  memcpy(a->data.f32, a_values, sizeof(a_values));
  memcpy(b->data.f32, b_values, sizeof(b_values));
  memcpy(c->data.f32, c_values, sizeof(c_values));
  // Host outputs: da receives the MPS result, dat is computed from the host tensors.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, c), TENSOR_LIST(ga, gb, gc), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(gc, ga, gb), TENSOR_LIST(gda, 0), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, 0), TENSOR_LIST(da, 0), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(c, a, b), TENSOR_LIST(dat, 0), 0);

  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(gda);
}
5928
5929
// MUL_BACKWARD(0.5) with the first input slot passed as 0, a of shape (4, 1) and b of
// shape (2): both output gradients from MPS must equal the ones computed from host tensors.
TEST_CASE("broadcasting semantics for mul backward (no input grad)")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs: a = {1, 2, 3, 4}, b = {5, 6}.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  int i;
  for (i = 0; i < 4; i++)
    a->data.f32[i] = i + 1;
  for (i = 0; i < 2; i++)
    b->data.f32[i] = i + 5;
  // Host outputs: da / db receive the MPS results, dat / dbt are computed from the host tensors.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 1), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);

  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(dbt);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gda);
  ccv_nnc_tensor_free(gdb);
}
5968
5969
5970
// MUL_BACKWARD(0.5) with the first input slot passed as 0, where a is (2, 3) and b is the
// smaller (3) operand: both gradients from MPS must equal the ones from the host tensors.
TEST_CASE("broadcasting semantics for mul backward (no input grad) for b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs: a = {1, ..., 6}, b = {7, 8, 9}.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  int i;
  for (i = 0; i < 6; i++)
    a->data.f32[i] = i + 1;
  for (i = 0; i < 3; i++)
    b->data.f32[i] = i + 7;
  // Host outputs: da / db receive the MPS results, dat / dbt are computed from the host tensors.
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);

  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(dbt);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gda);
  ccv_nnc_tensor_free(gdb);
}
6011
6012
// MUL_BACKWARD(0.5) with the first input slot passed as 0, where a is the smaller (3)
// operand and b is (2, 3): both gradients from MPS must equal the ones from the host tensors.
TEST_CASE("broadcasting semantics for mul backward (no input grad) for a")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Host inputs: b = {1, ..., 6}, a = {7, 8, 9}.
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  int i;
  for (i = 0; i < 6; i++)
    b->data.f32[i] = i + 1;
  for (i = 0; i < 3; i++)
    a->data.f32[i] = i + 7;
  // Host outputs: da / db receive the MPS results, dat / dbt are computed from the host tensors.
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  // MPS-side tensors.
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gda = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ga, gb), TENSOR_LIST(gda, gdb), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gda, gdb), TENSOR_LIST(da, db), 0);
  // Same command on the host tensors for comparison.
  ccv_nnc_cmd_exec(CMD_MUL_BACKWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(dat, dbt), 0);

  REQUIRE_TENSOR_EQ(dat, da, "gradient of a should be equal");
  REQUIRE_TENSOR_EQ(dbt, db, "gradient of b should be equal");

  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(dbt);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gda);
  ccv_nnc_tensor_free(gdb);
}
6053
6054
// Runs SCALAR_MUL_FORWARD(1.1) on a 4-element random vector on MPS and checks every output
// element against x * 1.1 computed on the host.
TEST_CASE("mps scalar mul forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_MPS));

  // Host input / output and their MPS counterparts.
  ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
  ccv_nnc_tensor_t* const gx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
  ccv_nnc_tensor_t* const gy = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
  ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);

  // Fill x from a generator seeded with 0.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 4; i++)
  {
    x->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }

  // Upload, scale on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x), TENSOR_LIST(gx), 0);
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_FORWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(gx), TENSOR_LIST(gy), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gy), TENSOR_LIST(y), 0);

  for (i = 0; i < 4; i++)
  {
    REQUIRE_EQ_WITH_TOLERANCE(x->data.f32[i] * 1.1, y->data.f32[i], 1e-5, "scalarmul forward cy has to be 1.1 * x");
  }

  ccv_nnc_tensor_free(x);
  ccv_nnc_tensor_free(gx);
  ccv_nnc_tensor_free(gy);
  ccv_nnc_tensor_free(y);
}
6084
6085
// Runs SCALAR_MUL_BACKWARD(1.1) on a 4-element random vector on MPS and checks every output
// element against the input element * 1.1 computed on the host.
TEST_CASE("mps scalar mul backward")
{
  // Guard on the command this test actually executes on MPS (the backward one), so the test
  // is skipped rather than failed when only the forward command is available.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));

  ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);

  // Fill y from a generator seeded with 0.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 4; i++)
    y->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  ccv_nnc_tensor_t* const gy = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
  ccv_nnc_tensor_t* const gdx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
  // Upload, run the backward command on MPS, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y), TENSOR_LIST(gy), 0);
  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_BACKWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(gy), TENSOR_LIST(gdx), 0);

  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdx), TENSOR_LIST(dx), 0);

  for (i = 0; i < 4; i++) {
    REQUIRE_EQ_WITH_TOLERANCE(dx->data.f32[i], y->data.f32[i] * 1.1, 1e-5, "scalarmul backward dx has to be 1.1 * dy");
  }

  ccv_nnc_tensor_free(y);
  ccv_nnc_tensor_free(gy);
  ccv_nnc_tensor_free(gdx);
  ccv_nnc_tensor_free(dx);
}
6114
6115
// Runs SCALAR_MUL_BACKWARD(1.1) on MPS with the input slot passed as 0 and checks that every
// element of the 4-element output equals 1.1.
TEST_CASE("mps scalar mul backward, no input")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_MPS));

  // MPS output and the host tensor it is downloaded into.
  ccv_nnc_tensor_t* const gdx = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4), 0);
  ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);

  ccv_nnc_cmd_exec(CMD_SCALAR_MUL_BACKWARD(1.1), ccv_nnc_no_hint, 0, TENSOR_LIST(0), TENSOR_LIST(gdx), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gdx), TENSOR_LIST(dx), 0);

  int i;
  for (i = 0; i < 4; i++)
  {
    REQUIRE_EQ_WITH_TOLERANCE(dx->data.f32[i], 1.1, 1e-5, "scalar mul backward without input should be 1.1 ");
  }

  ccv_nnc_tensor_free(gdx);
  ccv_nnc_tensor_free(dx);
}
6129
6130
// Runs CONVOLUTION_TRANSPOSE_FORWARD with the CPU_REF backend on NHWC host tensors and with
// the MPS backend on device tensors (weights format-transformed to an NCHW tensor first),
// then requires the two outputs to match within 1e-4.
TEST_CASE("mps forward convolution transpose")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  // The hint is derived with b (the larger tensor) first and a second.
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < INPUT_DIM; i++)
    bias->data.f32[i] = (float)i / INPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, INPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // Reference result computed with the CPU_REF backend.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);

  // Transform the device weights from the NHWC tensor into the NCHW one.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);

  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert() so the command still runs when NDEBUG compiles asserts out.
  const int status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(CCV_NNC_EXEC_SUCCESS == status);
  (void)status; // silences the unused-variable warning when asserts are disabled
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-4, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
6191
6192
// Runs CONVOLUTION_TRANSPOSE_FORWARD with the CPU_REF backend on NCHW host tensors and with
// the MPS backend on NCHW device tensors, then requires the two outputs to match within 1e-5.
TEST_CASE("mps forward convolution transpose in nchw format")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  // The hint is derived with b (the larger tensor) first and a second.
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, INPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < INPUT_DIM; i++)
    bias->data.f32[i] = (float)i / INPUT_DIM;
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, OUTPUT_DIM, OUTPUT_SIZE, OUTPUT_SIZE), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, INPUT_DIM), 0);
  ccv_nnc_cmd_t move = CMD_DATA_TRANSFER_FORWARD();
  move.backend = CCV_NNC_BACKEND_MPS;
  assert(move.backend >= 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(ga, gw, gbias), 0);
  // Reference result computed with the CPU_REF backend.
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);

  // No format transform command is needed here: the weights are already in an NCHW tensor.
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  // Execute outside of assert() so the command still runs when NDEBUG compiles asserts out.
  const int status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gw, gbias), TENSOR_LIST(gc), 0);
  assert(CCV_NNC_EXEC_SUCCESS == status);
  (void)status; // silences the unused-variable warning when asserts are disabled
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, BATCH_SIZE, INPUT_DIM, INPUT_SIZE, INPUT_SIZE), 0);
  ccv_nnc_cmd_exec(move, ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 1e-5, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gw);
  ccv_nnc_tensor_free(ga);
}
6245
6246
// Runs the transposed convolution in 32-bit float on the CPU reference backend
// and in 16-bit float on the MPS backend, then requires the MPS output
// (converted back to 32-bit) to agree with the CPU output within 5e-3.
TEST_CASE("mps forward convolution transpose in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_MPS));
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_TRANSPOSE_FORWARD(1, INPUT_DIM, 0, KERNEL_SIZE, KERNEL_SIZE, OUTPUT_DIM);
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
  assert(cmd.backend >= 0);
  // The hint is derived with b (the larger tensor) in the input position and a in the output position.
  ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, b->info, a->info);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, INPUT_DIM), 0);
  // configure the inlets.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
    w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
  for (i = 0; i < OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
    a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < INPUT_DIM; i++)
    bias->data.f32[i] = (float)i / INPUT_DIM;
  // Half-precision host copies of the inputs.
  ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* w1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, INPUT_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(a1, w1, bias1), 0);
  // Copy generated matrix values over to GPU.
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
  ccv_nnc_tensor_t* gw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
  ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
  ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, INPUT_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, w1, bias1), TENSOR_LIST(ga, gw, gbias), 0);
  // Reference result from the CPU backend (32-bit float).
  ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);

  // Re-lay the GPU weights from the NHWC tensor into the NCHW tensor; the NHWC copy is no longer needed afterwards.
  ccv_nnc_cmd_t transform = CMD_FORMAT_TRANSFORM_FORWARD();
  transform.backend = CCV_NNC_BACKEND_MPS;
  assert(transform.backend >= 0);
  ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_cmd_exec(transform, ccv_nnc_no_hint, 0, TENSOR_LIST(gw), TENSOR_LIST(gwo), stream_context);
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_tensor_free(gw);

  // Switch the same command over to MPS and let autotune pick the algorithm.
  cmd.backend = CCV_NNC_BACKEND_MPS;
  assert(cmd.backend >= 0);
  cmd.algorithm = -1;
  cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  // Execute outside of assert(): with NDEBUG defined the assert expression is
  // not evaluated, which would skip the GPU run entirely.
  const int exec_status = ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
  assert(exec_status == CCV_NNC_EXEC_SUCCESS);
  (void)exec_status; // keeps NDEBUG builds free of an unused-variable warning
  ccv_nnc_stream_context_wait(stream_context);
  ccv_nnc_stream_context_free(stream_context);
  // Bring the MPS output back to the host and widen it to 32-bit for comparison.
  ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * INPUT_DIM * INPUT_SIZE * INPUT_SIZE, 5e-3, "output from mps should match from CPU");
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gc);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(c1);
  ccv_nnc_tensor_free(bias1);
  ccv_nnc_tensor_free(w1);
  ccv_nnc_tensor_free(a1);
  ccv_nnc_tensor_free(gbias);
  ccv_nnc_tensor_free(gwo);
  ccv_nnc_tensor_free(ga);
}
6314
6315
// Evaluates tanh over a 20x10 32-bit float tensor through a compiled graph on
// GPU tensors and through a direct command on CPU tensors, then requires the
// two result tensors to be equal.
TEST_CASE("compare tanh with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_MPS));

  // Symbolic graph with a single tanh node reading "a" and writing "b".
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t input_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
  ccv_nnc_tensor_symbol_t output_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(input_symbol), TENSOR_SYMBOL_LIST(output_symbol), "tanh");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);

  // Compile into a concrete graph plus the arenas that back its tensors and execs.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Host-side tensors: random input, graph output, and CPU reference output.
  ccv_nnc_tensor_t* const host_input = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int element_count = 20 * 10;
  for (int k = 0; k < element_count; k++)
    host_input->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Upload the input, run the graph, and download its output.
  ccv_nnc_tensor_t* const device_input = ccv_nnc_tensor_from_symbol(tensor_arena, input_symbol);
  ccv_nnc_tensor_t* const device_output = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_input), TENSOR_LIST(device_input), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(device_output), TENSOR_LIST(y_tensor), 0);

  // Reference result computed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_input), TENSOR_LIST(ty), 0);
  REQUIRE_TENSOR_EQ(ty, y_tensor, "tanh from mps should match from CPU");

  ccv_nnc_tensor_free(host_input);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(ty);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
6352
6353
// Evaluates tanh over a 20x10 tensor in 16-bit float through a compiled graph
// on GPU tensors, widens the result to 32-bit, and requires it to agree within
// 1e-3 with a 32-bit tanh computed directly on CPU tensors.
TEST_CASE("compare tanh with mps in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_MPS));

  // Symbolic graph with a single tanh node reading "a" and writing "b".
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t input_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
  ccv_nnc_tensor_symbol_t output_symbol = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(input_symbol), TENSOR_SYMBOL_LIST(output_symbol), "tanh");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);

  // Compile into a concrete graph plus the arenas that back its tensors and execs.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Host-side tensors, in 32-bit and 16-bit flavours.
  ccv_nnc_tensor_t* const host_input = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const host_input16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
  ccv_nnc_tensor_t* const host_output16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const ty = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int element_count = 20 * 10;
  for (int k = 0; k < element_count; k++)
    host_input->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Narrow the input to 16-bit, upload it, and run the graph.
  ccv_nnc_tensor_t* const device_input = ccv_nnc_tensor_from_symbol(tensor_arena, input_symbol);
  ccv_nnc_tensor_t* const device_output = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_input), TENSOR_LIST(host_input16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_input16), TENSOR_LIST(device_input), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);

  // Download the output and widen it back to 32-bit.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(device_output), TENSOR_LIST(host_output16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_output16), TENSOR_LIST(y_tensor), 0);

  // Reference result computed directly on the 32-bit host tensors.
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_input), TENSOR_LIST(ty), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty->data.f32, y_tensor->data.f32, 20 * 10, 1e-3, "tanh from mps should match from CPU");

  ccv_nnc_tensor_free(host_input);
  ccv_nnc_tensor_free(host_input16);
  ccv_nnc_tensor_free(host_output16);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(ty);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
6396
6397
// Builds a tanh forward + backward graph on GPU tensors (10x100, 32-bit float),
// runs it, and requires both the forward output and the input gradient to
// equal the results of the same commands executed directly on CPU tensors.
TEST_CASE("compare tanh gradient with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_TANH_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Forward node y = tanh(x), then extend the graph with the backward pass of y w.r.t. x.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "x");
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 10, 100), "y");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "tanh");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);

  // Random forward input on the host.
  const int element_count = 10 * 100;
  ccv_nnc_tensor_t* const x_host = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int k = 0; k < element_count; k++)
    x_host->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Incoming gradient: 1 where the column index equals the row index (rows 0..9), 0 elsewhere.
  ccv_nnc_tensor_t* const dy_host = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  for (int k = 0; k < element_count; k++)
    dy_host->data.f32[k] = (k % 100 == k / 100) ? 1 : 0;
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_host), TENSOR_LIST(dyt), 0);

  // Compile with dy bound to the uploaded gradient tensor and y listed as an output.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Upload x and run forward + backward.
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_host), TENSOR_LIST(xt), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);

  // Download the gradient and the forward output.
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_tensor), 0);

  // Reference forward pass on the host tensors.
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_host), TENSOR_LIST(ty_tensor), 0);
  REQUIRE_TENSOR_EQ(ty_tensor, y_tensor, "forward pass should match");

  // Reference backward pass, fed with the gradient and the reference forward output.
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_TANH_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_host, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
  REQUIRE_TENSOR_EQ(tdx_tensor, dx_tensor, "backward pass should match");

  ccv_nnc_tensor_free(x_host);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dy_host);
  ccv_nnc_tensor_free(ty_tensor);
  ccv_nnc_tensor_free(tdx_tensor);
  ccv_nnc_tensor_free(dyt);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
6456
6457
// Builds a tanh forward + backward graph on GPU tensors in 16-bit float
// (10x100), runs it, widens the results to 32-bit, and requires both the
// forward output and the input gradient to agree within 1e-3 with the same
// commands executed directly on 32-bit CPU tensors.
TEST_CASE("compare tanh gradient with mps in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_TANH_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_TANH_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Forward node y = tanh(x), then extend the graph with the backward pass of y w.r.t. x.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "x");
  ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 10, 100), "y");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_TANH_FORWARD(), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "tanh");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_symbolic_graph_backward(symbolic_graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph));
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_tensor_symbol_t dy = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, y);
  ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(symbolic_graph, x);

  // Random forward input on the host (32-bit).
  const int element_count = 10 * 100;
  ccv_nnc_tensor_t* const x_host = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int k = 0; k < element_count; k++)
    x_host->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Incoming gradient: 1 where the column index equals the row index (rows 0..9), 0 elsewhere.
  ccv_nnc_tensor_t* const dy_host = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  for (int k = 0; k < element_count; k++)
    dy_host->data.f32[k] = (k % 100 == k / 100) ? 1 : 0;

  // Narrow the gradient to 16-bit and upload it.
  ccv_nnc_tensor_t* const dy_host16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
  ccv_nnc_tensor_t* const dyt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_host), TENSOR_LIST(dy_host16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_host16), TENSOR_LIST(dyt), 0);

  // Compile with dy bound to the uploaded gradient tensor and y listed as an output.
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, TENSOR_BIND_MAP(KV(dy, dyt)), TENSOR_SYMBOL_LIST(y), SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);

  // Narrow x to 16-bit, upload it, and run forward + backward.
  ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_from_symbol(tensor_arena, x);
  ccv_nnc_tensor_t* const x_host16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_host), TENSOR_LIST(x_host16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_host16), TENSOR_LIST(xt), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);

  // Download the gradient and the forward output, widening each to 32-bit.
  ccv_nnc_tensor_t* const dx_host16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
  ccv_nnc_tensor_t* const dx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const y_host16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 100), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const dxt = ccv_nnc_tensor_from_symbol(tensor_arena, dx);
  ccv_nnc_tensor_t* const yt = ccv_nnc_tensor_from_symbol(tensor_arena, y);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dxt), TENSOR_LIST(dx_host16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dx_host16), TENSOR_LIST(dx_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(yt), TENSOR_LIST(y_host16), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_host16), TENSOR_LIST(y_tensor), 0);

  // Reference forward pass on the 32-bit host tensors.
  ccv_nnc_tensor_t* const ty_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_TANH_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_host), TENSOR_LIST(ty_tensor), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ty_tensor->data.f32, y_tensor->data.f32, 10 * 100, 1e-3, "forward pass should match");

  // Reference backward pass, fed with the gradient and the reference forward output.
  ccv_nnc_tensor_t* const tdx_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
  ccv_nnc_cmd_exec(CMD_TANH_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dy_host, 0, ty_tensor), TENSOR_LIST(tdx_tensor), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdx_tensor->data.f32, dx_tensor->data.f32, 10 * 100, 1e-3, "backward pass should match");

  ccv_nnc_tensor_free(x_host);
  ccv_nnc_tensor_free(x_host16);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(y_host16);
  ccv_nnc_tensor_free(dx_tensor);
  ccv_nnc_tensor_free(dx_host16);
  ccv_nnc_tensor_free(dy_host);
  ccv_nnc_tensor_free(dy_host16);
  ccv_nnc_tensor_free(ty_tensor);
  ccv_nnc_tensor_free(tdx_tensor);
  ccv_nnc_tensor_free(dyt);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
6528
6529
#include "case_main.h"