Coverage Report

Created: 2025-05-08 13:16

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsblas.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
TEST_CASE("gemm no transpose")
15
1
{
16
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
17
0
  float ap[] = {
18
0
    1, 2,
19
0
    3, 4,
20
0
    5, 6,
21
0
    7, 8,
22
0
  };
23
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
24
0
  float bp[] = {
25
0
    7, 8, 9,
26
0
    10, 11, 12,
27
0
  };
28
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
29
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
30
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
31
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
32
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
33
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
34
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
35
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
36
0
  float ctp[] = {
37
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
38
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
39
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
40
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
41
0
  };
42
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
43
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
44
0
  ccv_nnc_tensor_free(a);
45
0
  ccv_nnc_tensor_free(b);
46
0
  ccv_nnc_tensor_free(c);
47
0
  ccv_nnc_tensor_free(ga);
48
0
  ccv_nnc_tensor_free(gb);
49
0
  ccv_nnc_tensor_free(gc);
50
0
}
51
52
TEST_CASE("gemm transpose a")
53
1
{
54
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
55
0
  float ap[] = {
56
0
    1, 3, 5, 7,
57
0
    2, 4, 6, 8,
58
0
  };
59
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
60
0
  float bp[] = {
61
0
    7, 8, 9,
62
0
    10, 11, 12,
63
0
  };
64
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
65
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
66
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
67
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
68
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
69
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
70
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
71
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
72
0
  float ctp[] = {
73
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
74
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
75
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
76
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
77
0
  };
78
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
79
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
80
0
  ccv_nnc_tensor_free(a);
81
0
  ccv_nnc_tensor_free(b);
82
0
  ccv_nnc_tensor_free(c);
83
0
  ccv_nnc_tensor_free(ga);
84
0
  ccv_nnc_tensor_free(gb);
85
0
  ccv_nnc_tensor_free(gc);
86
0
}
87
88
TEST_CASE("gemm transpose b")
89
1
{
90
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
91
0
  float ap[] = {
92
0
    1, 2,
93
0
    3, 4,
94
0
    5, 6,
95
0
    7, 8,
96
0
  };
97
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
98
0
  float bp[] = {
99
0
    7, 10,
100
0
    8, 11,
101
0
    9, 12,
102
0
  };
103
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
104
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
105
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
106
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
107
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
108
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
109
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
110
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
111
0
  float ctp[] = {
112
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
113
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
114
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
115
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
116
0
  };
117
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
118
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
119
0
  ccv_nnc_tensor_free(a);
120
0
  ccv_nnc_tensor_free(b);
121
0
  ccv_nnc_tensor_free(c);
122
0
  ccv_nnc_tensor_free(ga);
123
0
  ccv_nnc_tensor_free(gb);
124
0
  ccv_nnc_tensor_free(gc);
125
0
}
126
127
TEST_CASE("gemm transpose a and b")
128
1
{
129
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
130
0
  float ap[] = {
131
0
    1, 3, 5, 7,
132
0
    2, 4, 6, 8,
133
0
  };
134
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
135
0
  float bp[] = {
136
0
    7, 10,
137
0
    8, 11,
138
0
    9, 12,
139
0
  };
140
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
141
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
142
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
143
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
144
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
145
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
146
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
147
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
148
0
  float ctp[] = {
149
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
150
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
151
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
152
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
153
0
  };
154
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
155
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
156
0
  ccv_nnc_tensor_free(a);
157
0
  ccv_nnc_tensor_free(b);
158
0
  ccv_nnc_tensor_free(c);
159
0
  ccv_nnc_tensor_free(ga);
160
0
  ccv_nnc_tensor_free(gb);
161
0
  ccv_nnc_tensor_free(gc);
162
0
}
163
164
TEST_CASE("gemm no transpose with bias")
165
1
{
166
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
167
0
  float ap[] = {
168
0
    1, 2,
169
0
    3, 4,
170
0
    5, 6,
171
0
    7, 8,
172
0
  };
173
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
174
0
  float bp[] = {
175
0
    7, 8, 9,
176
0
    10, 11, 12,
177
0
  };
178
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
179
0
  float dp[] = {
180
0
    1, -1, 1,
181
0
    1, -1, 1,
182
0
    1, -1, 1,
183
0
    1, -1, 1,
184
0
  };
185
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
186
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
187
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
188
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
189
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
190
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
191
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
192
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
193
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
194
0
  float ctp[] = {
195
0
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
196
0
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
197
0
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
198
0
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
199
0
  };
200
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
201
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
202
0
  ccv_nnc_tensor_free(a);
203
0
  ccv_nnc_tensor_free(b);
204
0
  ccv_nnc_tensor_free(c);
205
0
  ccv_nnc_tensor_free(d);
206
0
  ccv_nnc_tensor_free(ga);
207
0
  ccv_nnc_tensor_free(gb);
208
0
  ccv_nnc_tensor_free(gc);
209
0
  ccv_nnc_tensor_free(gd);
210
0
}
211
212
TEST_CASE("gemm no transpose batch 2, no batch b")
213
1
{
214
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
215
0
  float ap[] = {
216
0
    1, 2,
217
0
    3, 4,
218
0
    5, 6,
219
0
    7, 8,
220
0
    2, 3,
221
0
    4, 5,
222
0
    6, 7,
223
0
    8, 9
224
0
  };
225
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
226
0
  float bp[] = {
227
0
    7, 8, 9,
228
0
    10, 11, 12,
229
0
  };
230
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
231
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
232
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
233
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
234
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
235
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
236
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
237
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
238
0
  float ctp[] = {
239
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
240
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
241
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
242
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
243
0
    2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
244
0
    4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
245
0
    6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
246
0
    8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
247
0
  };
248
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
249
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
250
0
  ccv_nnc_tensor_free(a);
251
0
  ccv_nnc_tensor_free(b);
252
0
  ccv_nnc_tensor_free(c);
253
0
  ccv_nnc_tensor_free(ga);
254
0
  ccv_nnc_tensor_free(gb);
255
0
  ccv_nnc_tensor_free(gc);
256
0
}
257
258
TEST_CASE("gemm no transpose batch 2")
259
1
{
260
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
261
0
  float ap[] = {
262
0
    1, 2,
263
0
    3, 4,
264
0
    5, 6,
265
0
    7, 8,
266
0
    2, 3,
267
0
    4, 5,
268
0
    6, 7,
269
0
    8, 9
270
0
  };
271
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
272
0
  float bp[] = {
273
0
    7, 8, 9,
274
0
    10, 11, 12,
275
0
    8, 9, 10,
276
0
    11, 12, 13,
277
0
  };
278
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
279
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
280
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
281
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
282
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
283
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
284
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
285
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
286
0
  float ctp[] = {
287
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
288
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
289
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
290
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
291
0
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
292
0
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
293
0
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
294
0
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
295
0
  };
296
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
297
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
298
0
  ccv_nnc_tensor_free(a);
299
0
  ccv_nnc_tensor_free(b);
300
0
  ccv_nnc_tensor_free(c);
301
0
  ccv_nnc_tensor_free(ga);
302
0
  ccv_nnc_tensor_free(gb);
303
0
  ccv_nnc_tensor_free(gc);
304
0
}
305
306
TEST_CASE("gemm transpose a batch 2, no batch b, with bias")
307
1
{
308
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
309
0
  float ap[] = {
310
0
    1, 3, 5, 7,
311
0
    2, 4, 6, 8,
312
0
    2, 4, 6, 8,
313
0
    3, 5, 7, 9,
314
0
  };
315
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
316
0
  float bp[] = {
317
0
    7, 8, 9,
318
0
    10, 11, 12,
319
0
  };
320
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
321
0
  float dp[] = {
322
0
    -1, 0, 1,
323
0
  };
324
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 3), 0);
325
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
326
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
327
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
328
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
329
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
330
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
331
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
332
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
333
0
  float ctp[] = {
334
0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
335
0
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
336
0
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
337
0
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
338
0
    2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
339
0
    4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
340
0
    6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
341
0
    8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
342
0
  };
343
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
344
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
345
0
  ccv_nnc_tensor_free(a);
346
0
  ccv_nnc_tensor_free(b);
347
0
  ccv_nnc_tensor_free(c);
348
0
  ccv_nnc_tensor_free(d);
349
0
  ccv_nnc_tensor_free(ga);
350
0
  ccv_nnc_tensor_free(gb);
351
0
  ccv_nnc_tensor_free(gc);
352
0
  ccv_nnc_tensor_free(gd);
353
0
}
354
355
TEST_CASE("gemm transpose a batch 2, with bias")
356
1
{
357
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
358
0
  float ap[] = {
359
0
    1, 3, 5, 7,
360
0
    2, 4, 6, 8,
361
0
    2, 4, 6, 8,
362
0
    3, 5, 7, 9,
363
0
  };
364
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
365
0
  float bp[] = {
366
0
    7, 8, 9,
367
0
    10, 11, 12,
368
0
    8, 9, 10,
369
0
    11, 12, 13,
370
0
  };
371
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
372
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
373
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
374
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
375
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
376
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
377
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
378
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
379
0
  float ctp[] = {
380
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
381
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
382
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
383
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
384
0
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
385
0
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
386
0
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
387
0
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
388
0
  };
389
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
390
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
391
0
  ccv_nnc_tensor_free(a);
392
0
  ccv_nnc_tensor_free(b);
393
0
  ccv_nnc_tensor_free(c);
394
0
  ccv_nnc_tensor_free(ga);
395
0
  ccv_nnc_tensor_free(gb);
396
0
  ccv_nnc_tensor_free(gc);
397
0
}
398
399
TEST_CASE("gemm transpose b batch 2, with bias")
400
1
{
401
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
402
0
  float ap[] = {
403
0
    1, 2,
404
0
    3, 4,
405
0
    5, 6,
406
0
    7, 8,
407
0
    2, 3,
408
0
    4, 5,
409
0
    6, 7,
410
0
    8, 9
411
0
  };
412
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
413
0
  float bp[] = {
414
0
    7, 10,
415
0
    8, 11,
416
0
    9, 12,
417
0
    80, 110,
418
0
    90, 120,
419
0
    10, 13,
420
0
  };
421
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
422
0
  float dp[] = {
423
0
    -1, 0, 1,
424
0
    2, 3, -4,
425
0
  };
426
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
427
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
428
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
429
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
430
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
431
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
432
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
433
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
434
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
435
0
  float ctp[] = {
436
0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
437
0
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
438
0
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
439
0
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
440
0
    2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
441
0
    4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
442
0
    6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
443
0
    8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
444
0
  };
445
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
446
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
447
0
  ccv_nnc_tensor_free(a);
448
0
  ccv_nnc_tensor_free(b);
449
0
  ccv_nnc_tensor_free(c);
450
0
  ccv_nnc_tensor_free(d);
451
0
  ccv_nnc_tensor_free(ga);
452
0
  ccv_nnc_tensor_free(gb);
453
0
  ccv_nnc_tensor_free(gc);
454
0
  ccv_nnc_tensor_free(gd);
455
0
}
456
457
TEST_CASE("gemm transpose b batch 2")
458
1
{
459
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
460
0
  float ap[] = {
461
0
    1, 2,
462
0
    3, 4,
463
0
    5, 6,
464
0
    7, 8,
465
0
    2, 3,
466
0
    4, 5,
467
0
    6, 7,
468
0
    8, 9
469
0
  };
470
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
471
0
  float bp[] = {
472
0
    7, 10,
473
0
    8, 11,
474
0
    9, 12,
475
0
    80, 110,
476
0
    90, 120,
477
0
    10, 13,
478
0
  };
479
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
480
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
481
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
482
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
483
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
484
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
485
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
486
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
487
0
  float ctp[] = {
488
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
489
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
490
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
491
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
492
0
    2 * 80 + 3 * 110, 2 * 90 + 3 * 120, 2 * 10 + 3 * 13,
493
0
    4 * 80 + 5 * 110, 4 * 90 + 5 * 120, 4 * 10 + 5 * 13,
494
0
    6 * 80 + 7 * 110, 6 * 90 + 7 * 120, 6 * 10 + 7 * 13,
495
0
    8 * 80 + 9 * 110, 8 * 90 + 9 * 120, 8 * 10 + 9 * 13,
496
0
  };
497
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
498
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
499
0
  ccv_nnc_tensor_free(a);
500
0
  ccv_nnc_tensor_free(b);
501
0
  ccv_nnc_tensor_free(c);
502
0
  ccv_nnc_tensor_free(ga);
503
0
  ccv_nnc_tensor_free(gb);
504
0
  ccv_nnc_tensor_free(gc);
505
0
}
506
507
TEST_CASE("mps forward gemm")
508
1
{
509
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
510
0
  dsfmt_t dsfmt;
511
0
  dsfmt_init_gen_rand(&dsfmt, 0);
512
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
513
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
514
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
515
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
516
517
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
518
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
519
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
520
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
521
0
  int i;
522
0
  for (i = 0; i < 64 * 128; i++)
523
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
524
0
  for (i = 0; i < 64; i++)
525
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
526
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
527
0
  for (i = 0; i < 10 * 128; i++)
528
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
529
0
  for (i = 0; i < 128; i++)
530
0
    ha->data.f32[i] = ha1->data.f32[i];
531
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(a, w, bias), 0);
532
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
533
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
534
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
535
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
536
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
537
0
  for (i = 0; i < 64; i++)
538
0
    tb1->data.f32[i] = tb->data.f32[i];
539
0
  REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
540
0
  ccv_nnc_tensor_free(a);
541
0
  ccv_nnc_tensor_free(w);
542
0
  ccv_nnc_tensor_free(bias);
543
0
  ccv_nnc_tensor_free(tb);
544
0
  ccv_nnc_tensor_free(b);
545
0
  ccv_nnc_tensor_free(ha);
546
0
  ccv_nnc_tensor_free(ha1);
547
0
  ccv_nnc_tensor_free(tb1);
548
0
  ccv_nnc_tensor_free(hw);
549
0
  ccv_nnc_tensor_free(hbias);
550
0
  ccv_nnc_tensor_free(hb);
551
0
}
552
553
TEST_CASE("mps forward gemm in half precision")
554
1
{
555
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
556
0
  dsfmt_t dsfmt;
557
0
  dsfmt_init_gen_rand(&dsfmt, 0);
558
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
559
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
560
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
561
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
562
563
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
564
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
565
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
566
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
567
0
  int i;
568
0
  for (i = 0; i < 64 * 128; i++)
569
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
570
0
  for (i = 0; i < 64; i++)
571
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
572
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
573
0
  for (i = 0; i < 10 * 128; i++)
574
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
575
0
  for (i = 0; i < 128; i++)
576
0
    ha->data.f32[i] = ha1->data.f32[i];
577
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
578
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
579
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
580
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
581
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
582
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
583
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
584
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
585
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
586
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
587
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
588
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-3, "GPU computed output should be the same as CPU computed ones");
589
0
  ccv_nnc_tensor_free(a);
590
0
  ccv_nnc_tensor_free(w);
591
0
  ccv_nnc_tensor_free(bias);
592
0
  ccv_nnc_tensor_free(b);
593
0
  ccv_nnc_tensor_free(tb);
594
0
  ccv_nnc_tensor_free(ha);
595
0
  ccv_nnc_tensor_free(ha1);
596
0
  ccv_nnc_tensor_free(tb1);
597
0
  ccv_nnc_tensor_free(hw);
598
0
  ccv_nnc_tensor_free(hbias);
599
0
  ccv_nnc_tensor_free(hb);
600
0
  ccv_nnc_tensor_free(ha2);
601
0
  ccv_nnc_tensor_free(hw2);
602
0
  ccv_nnc_tensor_free(hbias2);
603
0
}
604
605
TEST_CASE("mps forward gemv in half precision, variant 1")
606
1
{
607
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
608
0
  dsfmt_t dsfmt;
609
0
  dsfmt_init_gen_rand(&dsfmt, 0);
610
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
611
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
612
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
613
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
614
615
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
616
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
617
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
618
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
619
0
  int i;
620
0
  for (i = 0; i < 64 * 128; i++)
621
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
622
0
  for (i = 0; i < 64; i++)
623
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
624
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
625
0
  for (i = 0; i < 128; i++)
626
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
627
0
  for (i = 0; i < 128; i++)
628
0
    ha->data.f32[i] = ha1->data.f32[i];
629
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
630
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
631
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
632
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
633
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
634
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
635
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
636
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
637
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
638
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
639
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
640
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
641
0
  ccv_nnc_tensor_free(a);
642
0
  ccv_nnc_tensor_free(w);
643
0
  ccv_nnc_tensor_free(bias);
644
0
  ccv_nnc_tensor_free(b);
645
0
  ccv_nnc_tensor_free(tb);
646
0
  ccv_nnc_tensor_free(ha);
647
0
  ccv_nnc_tensor_free(ha1);
648
0
  ccv_nnc_tensor_free(tb1);
649
0
  ccv_nnc_tensor_free(hw);
650
0
  ccv_nnc_tensor_free(hbias);
651
0
  ccv_nnc_tensor_free(hb);
652
0
  ccv_nnc_tensor_free(ha2);
653
0
  ccv_nnc_tensor_free(hw2);
654
0
  ccv_nnc_tensor_free(hbias2);
655
0
}
656
657
TEST_CASE("mps forward gemm no bias")
658
1
{
659
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
660
0
  dsfmt_t dsfmt;
661
0
  dsfmt_init_gen_rand(&dsfmt, 0);
662
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
663
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
664
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
665
666
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
667
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
668
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
669
0
  int i;
670
0
  for (i = 0; i < 64 * 128; i++)
671
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
672
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
673
0
  for (i = 0; i < 10 * 128; i++)
674
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
675
0
  for (i = 0; i < 128; i++)
676
0
    ha->data.f32[i] = ha1->data.f32[i];
677
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(a, w), 0);
678
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
679
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
680
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
681
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
682
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
683
0
  for (i = 0; i < 64; i++)
684
0
    tb1->data.f32[i] = tb->data.f32[i];
685
0
  REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
686
0
  ccv_nnc_tensor_free(a);
687
0
  ccv_nnc_tensor_free(w);
688
0
  ccv_nnc_tensor_free(b);
689
0
  ccv_nnc_tensor_free(tb);
690
0
  ccv_nnc_tensor_free(ha);
691
0
  ccv_nnc_tensor_free(ha1);
692
0
  ccv_nnc_tensor_free(tb1);
693
0
  ccv_nnc_tensor_free(hw);
694
0
  ccv_nnc_tensor_free(hb);
695
0
}
696
697
TEST_CASE("mps forward gemm no bias in half precision")
698
1
{
699
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
700
0
  dsfmt_t dsfmt;
701
0
  dsfmt_init_gen_rand(&dsfmt, 0);
702
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
703
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
704
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
705
706
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
707
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
708
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
709
0
  int i;
710
0
  for (i = 0; i < 64 * 128; i++)
711
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
712
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
713
0
  for (i = 0; i < 10 * 128; i++)
714
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
715
0
  for (i = 0; i < 128; i++)
716
0
    ha->data.f32[i] = ha1->data.f32[i];
717
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
718
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
719
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
720
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
721
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
722
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
723
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
724
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
725
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
726
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
727
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
728
0
  ccv_nnc_tensor_free(a);
729
0
  ccv_nnc_tensor_free(w);
730
0
  ccv_nnc_tensor_free(b);
731
0
  ccv_nnc_tensor_free(tb);
732
0
  ccv_nnc_tensor_free(ha);
733
0
  ccv_nnc_tensor_free(ha1);
734
0
  ccv_nnc_tensor_free(tb1);
735
0
  ccv_nnc_tensor_free(hw);
736
0
  ccv_nnc_tensor_free(hb);
737
0
  ccv_nnc_tensor_free(ha2);
738
0
  ccv_nnc_tensor_free(hw2);
739
0
}
740
741
TEST_CASE("mps forward gemv in half precision no bias, variant 1")
742
1
{
743
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
744
0
  dsfmt_t dsfmt;
745
0
  dsfmt_init_gen_rand(&dsfmt, 0);
746
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
747
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
748
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
749
750
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
751
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
752
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
753
0
  int i;
754
0
  for (i = 0; i < 64 * 128; i++)
755
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
756
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
757
0
  for (i = 0; i < 128; i++)
758
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
759
0
  for (i = 0; i < 128; i++)
760
0
    ha->data.f32[i] = ha1->data.f32[i];
761
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
762
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
763
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
764
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
765
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
766
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
767
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
768
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
769
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
770
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
771
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
772
0
  ccv_nnc_tensor_free(a);
773
0
  ccv_nnc_tensor_free(w);
774
0
  ccv_nnc_tensor_free(b);
775
0
  ccv_nnc_tensor_free(tb);
776
0
  ccv_nnc_tensor_free(ha);
777
0
  ccv_nnc_tensor_free(ha1);
778
0
  ccv_nnc_tensor_free(tb1);
779
0
  ccv_nnc_tensor_free(hw);
780
0
  ccv_nnc_tensor_free(hb);
781
0
  ccv_nnc_tensor_free(ha2);
782
0
  ccv_nnc_tensor_free(hw2);
783
0
}
784
785
TEST_CASE("mps forward gemv in half precision no bias, variant 2")
786
1
{
787
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
788
0
  dsfmt_t dsfmt;
789
0
  dsfmt_init_gen_rand(&dsfmt, 0);
790
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
791
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 128, 1), 0);
792
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 1), 0);
793
794
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
795
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
796
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
797
0
  int i;
798
0
  for (i = 0; i < 64 * 128; i++)
799
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
800
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
801
0
  for (i = 0; i < 128; i++)
802
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
803
0
  for (i = 0; i < 128; i++)
804
0
    ha->data.f32[i] = ha1->data.f32[i];
805
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
806
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128, 1), 0);
807
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
808
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
809
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(hw, ha), TENSOR_LIST(hb), 0);
810
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(w, a), TENSOR_LIST(b), 0);
811
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 1), 0);
812
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
813
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
814
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
815
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
816
0
  ccv_nnc_tensor_free(a);
817
0
  ccv_nnc_tensor_free(w);
818
0
  ccv_nnc_tensor_free(b);
819
0
  ccv_nnc_tensor_free(tb);
820
0
  ccv_nnc_tensor_free(ha);
821
0
  ccv_nnc_tensor_free(ha1);
822
0
  ccv_nnc_tensor_free(tb1);
823
0
  ccv_nnc_tensor_free(hw);
824
0
  ccv_nnc_tensor_free(hb);
825
0
  ccv_nnc_tensor_free(ha2);
826
0
  ccv_nnc_tensor_free(hw2);
827
0
}
828
829
TEST_CASE("mps handle permute")
830
1
{
831
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
832
0
  dsfmt_t dsfmt;
833
0
  dsfmt_init_gen_rand(&dsfmt, 0);
834
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 2, 128), 0);
835
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 2, 128), 0);
836
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 2, 128), 0);
837
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 2, 128), 0);
838
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
839
840
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), 0);
841
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), 0);
842
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
843
0
  int i;
844
0
  for (i = 0; i < 2 * 64 * 128; i++)
845
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
846
0
  for (i = 0; i < 2 * 10 * 128; i++)
847
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
848
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
849
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(at), 0);
850
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wt), 0);
851
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
852
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
853
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
854
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
855
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
856
0
  ccv_nnc_tensor_t* hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
857
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, bt), TENSOR_LIST(hb, hbt), 0);
858
0
  REQUIRE_TENSOR_EQ(hb, hbt, "permute computed output should be the same as non-permute computed ones");
859
0
  ccv_nnc_tensor_free(ha);
860
0
  ccv_nnc_tensor_free(hw);
861
0
  ccv_nnc_tensor_free(a);
862
0
  ccv_nnc_tensor_free(w);
863
0
  ccv_nnc_tensor_free(b);
864
0
  ccv_nnc_tensor_view_free(av);
865
0
  ccv_nnc_tensor_view_free(wv);
866
0
  ccv_nnc_tensor_free(at);
867
0
  ccv_nnc_tensor_free(wt);
868
0
  ccv_nnc_tensor_free(bt);
869
0
  ccv_nnc_tensor_free(hb);
870
0
  ccv_nnc_tensor_free(hbt);
871
0
}
872
873
TEST_CASE("generalized batched gemm with batch (2, 4) compare mps")
874
1
{
875
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
876
  // This is a particular batched gemm which treat every dimensions other than the last two as batching.
877
0
  dsfmt_t dsfmt;
878
0
  dsfmt_init_gen_rand(&dsfmt, 0);
879
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
880
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
881
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
882
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
883
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
884
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
885
886
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
887
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
888
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
889
0
  int i;
890
0
  for (i = 0; i < 8 * 64 * 128; i++)
891
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
892
0
  for (i = 0; i < 8 * 10 * 128; i++)
893
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
894
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
895
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
896
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
897
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
898
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
899
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
900
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
901
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
902
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
903
0
  ccv_nnc_tensor_free(ha);
904
0
  ccv_nnc_tensor_free(hw);
905
0
  ccv_nnc_tensor_free(hb);
906
0
  ccv_nnc_tensor_free(a);
907
0
  ccv_nnc_tensor_free(w);
908
0
  ccv_nnc_tensor_free(b);
909
0
  ccv_nnc_tensor_view_free(av);
910
0
  ccv_nnc_tensor_view_free(wv);
911
0
  ccv_nnc_tensor_free(at);
912
0
  ccv_nnc_tensor_free(wt);
913
0
  ccv_nnc_tensor_free(bt);
914
0
}
915
916
TEST_CASE("generalized batched gemm with batch (2, 4) and broadcast compare mps")
917
1
{
918
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
919
  // This is a particular batched gemm which treat every dimensions other than the last two as batching.
920
0
  dsfmt_t dsfmt;
921
0
  dsfmt_init_gen_rand(&dsfmt, 0);
922
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
923
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
924
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
925
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
926
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
927
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
928
929
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
930
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
931
0
  int i;
932
0
  for (i = 0; i < 64 * 128; i++)
933
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
934
0
  for (i = 0; i < 8 * 10 * 128; i++)
935
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
936
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
937
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
938
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
939
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w), TENSOR_LIST(b), 0);
940
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw), TENSOR_LIST(bt), 0);
941
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
942
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
943
0
  ccv_nnc_tensor_free(ha);
944
0
  ccv_nnc_tensor_free(hw);
945
0
  ccv_nnc_tensor_free(hb);
946
0
  ccv_nnc_tensor_free(a);
947
0
  ccv_nnc_tensor_free(w);
948
0
  ccv_nnc_tensor_free(b);
949
0
  ccv_nnc_tensor_view_free(av);
950
0
  ccv_nnc_tensor_free(at);
951
0
  ccv_nnc_tensor_free(bt);
952
0
}
953
954
TEST_CASE("generalized batched gemm with batch (2, 4) with bias compare mps")
955
1
{
956
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
957
  // This is a particular batched gemm which treat every dimensions other than the last two as batching.
958
0
  dsfmt_t dsfmt;
959
0
  dsfmt_init_gen_rand(&dsfmt, 0);
960
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
961
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
962
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
963
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
964
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
965
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
966
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
967
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
968
969
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
970
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
971
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
972
0
  int i;
973
0
  for (i = 0; i < 8 * 64 * 128; i++)
974
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
975
0
  for (i = 0; i < 64; i++)
976
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
977
0
  for (i = 0; i < 8 * 10 * 128; i++)
978
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
979
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
980
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
981
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
982
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
983
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
984
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv, bias), TENSOR_LIST(b), 0);
985
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt, hbias), TENSOR_LIST(bt), 0);
986
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
987
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
988
0
  ccv_nnc_tensor_free(ha);
989
0
  ccv_nnc_tensor_free(hw);
990
0
  ccv_nnc_tensor_free(hbias);
991
0
  ccv_nnc_tensor_free(hb);
992
0
  ccv_nnc_tensor_free(a);
993
0
  ccv_nnc_tensor_free(w);
994
0
  ccv_nnc_tensor_free(bias);
995
0
  ccv_nnc_tensor_free(b);
996
0
  ccv_nnc_tensor_view_free(av);
997
0
  ccv_nnc_tensor_view_free(wv);
998
0
  ccv_nnc_tensor_free(at);
999
0
  ccv_nnc_tensor_free(wt);
1000
0
  ccv_nnc_tensor_free(bt);
1001
0
}
1002
1003
TEST_CASE("generalized batched gemm with batch (2, 4) with bias and broadcast compare mps")
1004
1
{
1005
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1006
  // This is a particular batched gemm which treat every dimensions other than the last two as batching.
1007
0
  dsfmt_t dsfmt;
1008
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1009
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1010
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1011
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1012
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1013
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1014
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1015
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1016
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1017
1018
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1019
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1020
0
  int i;
1021
0
  for (i = 0; i < 64 * 128; i++)
1022
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1023
0
  for (i = 0; i < 64; i++)
1024
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
1025
0
  for (i = 0; i < 8 * 10 * 128; i++)
1026
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1027
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1028
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
1029
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1030
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w, bias), TENSOR_LIST(b), 0);
1031
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw, hbias), TENSOR_LIST(bt), 0);
1032
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1033
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
1034
0
  ccv_nnc_tensor_free(ha);
1035
0
  ccv_nnc_tensor_free(hw);
1036
0
  ccv_nnc_tensor_free(hbias);
1037
0
  ccv_nnc_tensor_free(hb);
1038
0
  ccv_nnc_tensor_free(a);
1039
0
  ccv_nnc_tensor_free(w);
1040
0
  ccv_nnc_tensor_free(bias);
1041
0
  ccv_nnc_tensor_free(b);
1042
0
  ccv_nnc_tensor_view_free(av);
1043
0
  ccv_nnc_tensor_free(at);
1044
0
  ccv_nnc_tensor_free(bt);
1045
0
}
1046
1047
TEST_CASE("generalized batched backward gemm with batch (2, 4) compare mps")
1048
1
{
1049
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
1050
  // This is a particular batched gemm which treats every dimension other than the last two as batching.
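  // The backward command takes the incoming output gradient (the randomly filled b / hb below)
  // together with the forward inputs a and w, and produces the gradients da and dw; the
  // permuted strided views are exercised on both its inputs and its outputs.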
1051
0
  dsfmt_t dsfmt;
1052
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1053
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1054
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1055
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1056
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1057
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1058
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1059
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1060
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1061
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1062
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1063
1064
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1065
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1066
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1067
0
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1068
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1069
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1070
0
  int i;
1071
0
  for (i = 0; i < 8 * 64 * 128; i++)
1072
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1073
0
  for (i = 0; i < 8 * 10 * 128; i++)
1074
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1075
0
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
1076
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1077
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1078
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
1079
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
1080
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1081
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
1082
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1083
0
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
1084
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv), 0);
1085
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt), 0);
1086
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
1087
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
1088
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
1089
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
1090
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
1091
0
  ccv_nnc_tensor_free(ha);
1092
0
  ccv_nnc_tensor_free(hw);
1093
0
  ccv_nnc_tensor_free(hda);
1094
0
  ccv_nnc_tensor_free(hdw);
1095
0
  ccv_nnc_tensor_free(hb);
1096
0
  ccv_nnc_tensor_free(a);
1097
0
  ccv_nnc_tensor_free(w);
1098
0
  ccv_nnc_tensor_free(da);
1099
0
  ccv_nnc_tensor_free(dw);
1100
0
  ccv_nnc_tensor_free(b);
1101
0
  ccv_nnc_tensor_view_free(av);
1102
0
  ccv_nnc_tensor_view_free(wv);
1103
0
  ccv_nnc_tensor_view_free(dav);
1104
0
  ccv_nnc_tensor_view_free(dwv);
1105
0
  ccv_nnc_tensor_free(at);
1106
0
  ccv_nnc_tensor_free(wt);
1107
0
  ccv_nnc_tensor_free(dat);
1108
0
  ccv_nnc_tensor_free(tda);
1109
0
  ccv_nnc_tensor_free(dwt);
1110
0
  ccv_nnc_tensor_free(tdw);
1111
0
}
1112
1113
TEST_CASE("generalized batched backward gemm with batch (2, 4) and broadcast compare mps")
1114
1
{
1115
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
1116
  // This is a particular batched gemm which treats every dimension other than the last two as batching.
1117
0
  dsfmt_t dsfmt;
1118
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1119
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1120
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1121
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1122
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1123
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1124
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1125
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1126
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1127
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1128
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1129
1130
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1131
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1132
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1133
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1134
0
  int i;
1135
0
  for (i = 0; i < 64 * 128; i++)
1136
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1137
0
  for (i = 0; i < 8 * 10 * 128; i++)
1138
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1139
0
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
1140
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1141
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1142
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
1143
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1144
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1145
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw), 0);
1146
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw), TENSOR_LIST(dat, tdw), 0);
1147
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
1148
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
1149
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
1150
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
1151
0
  ccv_nnc_tensor_free(ha);
1152
0
  ccv_nnc_tensor_free(hw);
1153
0
  ccv_nnc_tensor_free(hda);
1154
0
  ccv_nnc_tensor_free(hdw);
1155
0
  ccv_nnc_tensor_free(hb);
1156
0
  ccv_nnc_tensor_free(a);
1157
0
  ccv_nnc_tensor_free(w);
1158
0
  ccv_nnc_tensor_free(da);
1159
0
  ccv_nnc_tensor_free(dw);
1160
0
  ccv_nnc_tensor_free(b);
1161
0
  ccv_nnc_tensor_view_free(av);
1162
0
  ccv_nnc_tensor_view_free(dav);
1163
0
  ccv_nnc_tensor_free(at);
1164
0
  ccv_nnc_tensor_free(dat);
1165
0
  ccv_nnc_tensor_free(tda);
1166
0
  ccv_nnc_tensor_free(tdw);
1167
0
}
1168
1169
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias compare mps")
1170
1
{
1171
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
1172
  // This is a particular batched gemm which treats every dimension other than the last two as batching.
1173
0
  dsfmt_t dsfmt;
1174
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1175
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1176
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1177
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1178
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1179
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1180
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1181
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1182
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1183
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1184
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1185
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1186
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1187
1188
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1189
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1190
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1191
0
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1192
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1193
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1194
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1195
0
  int i;
1196
0
  for (i = 0; i < 8 * 64 * 128; i++)
1197
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1198
0
  for (i = 0; i < 8 * 10 * 128; i++)
1199
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1200
0
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
1201
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1202
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1203
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
1204
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
1205
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1206
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
1207
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1208
0
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
1209
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv, dbias), 0);
1210
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt, tdbias), 0);
1211
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
1212
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
1213
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
1214
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
1215
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
1216
0
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
1217
0
  ccv_nnc_tensor_free(ha);
1218
0
  ccv_nnc_tensor_free(hw);
1219
0
  ccv_nnc_tensor_free(hda);
1220
0
  ccv_nnc_tensor_free(hdw);
1221
0
  ccv_nnc_tensor_free(hdbias);
1222
0
  ccv_nnc_tensor_free(hb);
1223
0
  ccv_nnc_tensor_free(a);
1224
0
  ccv_nnc_tensor_free(w);
1225
0
  ccv_nnc_tensor_free(da);
1226
0
  ccv_nnc_tensor_free(dw);
1227
0
  ccv_nnc_tensor_free(dbias);
1228
0
  ccv_nnc_tensor_free(b);
1229
0
  ccv_nnc_tensor_view_free(av);
1230
0
  ccv_nnc_tensor_view_free(wv);
1231
0
  ccv_nnc_tensor_view_free(dav);
1232
0
  ccv_nnc_tensor_view_free(dwv);
1233
0
  ccv_nnc_tensor_free(at);
1234
0
  ccv_nnc_tensor_free(wt);
1235
0
  ccv_nnc_tensor_free(dat);
1236
0
  ccv_nnc_tensor_free(dwt);
1237
0
  ccv_nnc_tensor_free(tda);
1238
0
  ccv_nnc_tensor_free(tdw);
1239
0
  ccv_nnc_tensor_free(tdbias);
1240
0
}
1241
1242
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias and broadcast compare mps")
1243
1
{
1244
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
1245
  // This is a particular batched gemm which treats every dimension other than the last two as batching.
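  // Unlike the earlier backward tests, the single [64, 128] weight is shared (broadcast)
  // across the (2, 4) batch, and the bias gradient is requested as an additional output.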
1246
0
  dsfmt_t dsfmt;
1247
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1248
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1249
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1250
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1251
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1252
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1253
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1254
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1255
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1256
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1257
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1258
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1259
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1260
1261
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1262
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1263
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1264
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1265
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1266
0
  int i;
1267
0
  for (i = 0; i < 64 * 128; i++)
1268
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1269
0
  for (i = 0; i < 8 * 10 * 128; i++)
1270
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1271
0
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
1272
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1273
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1274
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
1275
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1276
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1277
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w, dbias), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw, dbias), 0);
1278
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw, hdbias), TENSOR_LIST(dat, tdw, tdbias), 0);
1279
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
1280
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
1281
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
1282
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
1283
0
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
1284
0
  ccv_nnc_tensor_free(ha);
1285
0
  ccv_nnc_tensor_free(hw);
1286
0
  ccv_nnc_tensor_free(hda);
1287
0
  ccv_nnc_tensor_free(hdw);
1288
0
  ccv_nnc_tensor_free(hdbias);
1289
0
  ccv_nnc_tensor_free(hb);
1290
0
  ccv_nnc_tensor_free(a);
1291
0
  ccv_nnc_tensor_free(w);
1292
0
  ccv_nnc_tensor_free(da);
1293
0
  ccv_nnc_tensor_free(dw);
1294
0
  ccv_nnc_tensor_free(dbias);
1295
0
  ccv_nnc_tensor_free(b);
1296
0
  ccv_nnc_tensor_view_free(av);
1297
0
  ccv_nnc_tensor_view_free(dav);
1298
0
  ccv_nnc_tensor_free(at);
1299
0
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(tda);
1300
0
  ccv_nnc_tensor_free(tdw);
1301
0
  ccv_nnc_tensor_free(tdbias);
1302
0
}
1303
1304
TEST_CASE("ewdiv forward with reciprocal")
1305
1
{
1306
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
1307
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1308
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1309
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1310
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1311
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1312
0
  dsfmt_t dsfmt;
1313
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1314
0
  int i;
1315
0
  for (i = 0; i < 1000; i++)
1316
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
1317
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
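  // Per the test name, a null first operand exercises the reciprocal path, i.e. b = 1 / a.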
1318
0
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a), TENSOR_LIST(b), 0);
1319
0
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ha), TENSOR_LIST(bt), 0);
1320
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1321
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1322
0
  ccv_nnc_tensor_free(a);
1323
0
  ccv_nnc_tensor_free(b);
1324
0
  ccv_nnc_tensor_free(ha);
1325
0
  ccv_nnc_tensor_free(hb);
1326
0
  ccv_nnc_tensor_free(bt);
1327
0
}
1328
1329
TEST_CASE("ewdiv forward")
1330
1
{
1331
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
1332
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1333
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1334
0
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1335
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1336
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1337
0
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1338
0
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1339
0
  dsfmt_t dsfmt;
1340
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1341
0
  int i;
1342
0
  for (i = 0; i < 1000; i++)
1343
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
1344
0
  for (i = 0; i < 1000; i++)
1345
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
1346
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
1347
0
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
1348
0
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(ct), 0);
1349
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
1350
0
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");
1351
0
  ccv_nnc_tensor_free(a);
1352
0
  ccv_nnc_tensor_free(b);
1353
0
  ccv_nnc_tensor_free(c);
1354
0
  ccv_nnc_tensor_free(ha);
1355
0
  ccv_nnc_tensor_free(hb);
1356
0
  ccv_nnc_tensor_free(hc);
1357
0
  ccv_nnc_tensor_free(ct);
1358
0
}
1359
1360
TEST_CASE("exp forward")
1361
1
{
1362
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_MPS));
1363
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1364
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1365
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1366
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1367
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1368
0
  dsfmt_t dsfmt;
1369
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1370
0
  int i;
1371
0
  for (i = 0; i < 1000; i++)
1372
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
1373
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1374
0
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1375
0
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1376
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1377
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1378
0
  ccv_nnc_tensor_free(a);
1379
0
  ccv_nnc_tensor_free(b);
1380
0
  ccv_nnc_tensor_free(ha);
1381
0
  ccv_nnc_tensor_free(hb);
1382
0
  ccv_nnc_tensor_free(bt);
1383
0
}
1384
1385
TEST_CASE("ewlog forward")
1386
1
{
1387
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_MPS));
1388
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1389
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1390
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1391
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1392
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1393
0
  dsfmt_t dsfmt;
1394
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1395
0
  int i;
1396
0
  for (i = 0; i < 1000; i++)
1397
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
1398
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1399
0
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1400
0
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1401
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1402
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1403
0
  ccv_nnc_tensor_free(a);
1404
0
  ccv_nnc_tensor_free(b);
1405
0
  ccv_nnc_tensor_free(ha);
1406
0
  ccv_nnc_tensor_free(hb);
1407
0
  ccv_nnc_tensor_free(bt);
1408
0
}
1409
1410
TEST_CASE("ewsqrt forward")
1411
1
{
1412
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_MPS));
1413
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1414
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1415
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1416
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1417
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1418
0
  dsfmt_t dsfmt;
1419
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1420
0
  int i;
1421
0
  for (i = 0; i < 1000; i++)
1422
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
1423
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1424
0
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1425
0
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1426
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1427
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1428
0
  ccv_nnc_tensor_free(a);
1429
0
  ccv_nnc_tensor_free(b);
1430
0
  ccv_nnc_tensor_free(ha);
1431
0
  ccv_nnc_tensor_free(hb);
1432
0
  ccv_nnc_tensor_free(bt);
1433
0
}
1434
1435
TEST_CASE("ewabs forward")
1436
1
{
1437
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_MPS));
1438
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1439
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1440
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1441
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1442
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1443
0
  dsfmt_t dsfmt;
1444
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1445
0
  int i;
1446
0
  for (i = 0; i < 1000; i++)
1447
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5 + 0.0001;
1448
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1449
0
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1450
0
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1451
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1452
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1453
0
  ccv_nnc_tensor_free(a);
1454
0
  ccv_nnc_tensor_free(b);
1455
0
  ccv_nnc_tensor_free(ha);
1456
0
  ccv_nnc_tensor_free(hb);
1457
0
  ccv_nnc_tensor_free(bt);
1458
0
}
1459
1460
TEST_CASE("clamp forward")
1461
1
{
1462
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
1463
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1464
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1465
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1466
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1467
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1468
0
  dsfmt_t dsfmt;
1469
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1470
0
  int i;
1471
0
  for (i = 0; i < 1000; i++)
1472
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
1473
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1474
0
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1475
0
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1476
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1477
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1478
0
  ccv_nnc_tensor_free(a);
1479
0
  ccv_nnc_tensor_free(b);
1480
0
  ccv_nnc_tensor_free(ha);
1481
0
  ccv_nnc_tensor_free(hb);
1482
0
  ccv_nnc_tensor_free(bt);
1483
0
}
1484
1485
TEST_CASE("clamp forward with only max")
1486
1
{
1487
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
1488
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1489
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1490
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1491
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1492
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1493
0
  dsfmt_t dsfmt;
1494
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1495
0
  int i;
1496
0
  for (i = 0; i < 1000; i++)
1497
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
1498
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
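  // Per the test name, the NAN lower bound leaves the minimum unbounded, so only the max of 6 is applied.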
1499
0
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1500
0
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1501
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1502
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1503
0
  ccv_nnc_tensor_free(a);
1504
0
  ccv_nnc_tensor_free(b);
1505
0
  ccv_nnc_tensor_free(ha);
1506
0
  ccv_nnc_tensor_free(hb);
1507
0
  ccv_nnc_tensor_free(bt);
1508
0
}
1509
1510
TEST_CASE("clamp forward with only min")
1511
1
{
1512
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
1513
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1514
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
1515
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1516
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1517
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
1518
0
  dsfmt_t dsfmt;
1519
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1520
0
  int i;
1521
0
  for (i = 0; i < 1000; i++)
1522
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
1523
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
1524
0
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
1525
0
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
1526
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1527
0
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
1528
0
  ccv_nnc_tensor_free(a);
1529
0
  ccv_nnc_tensor_free(b);
1530
0
  ccv_nnc_tensor_free(ha);
1531
0
  ccv_nnc_tensor_free(hb);
1532
0
  ccv_nnc_tensor_free(bt);
1533
0
}
1534
1535
TEST_CASE("compare set with mps")
1536
1
{
1537
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
1538
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 11, 10, 9, 8), 0);
1539
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 11, 10, 9, 8), 0);
1540
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 11, 10, 9, 8), 0);
1541
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(a), 0);
1542
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(ga), 0);
1543
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ha), 0);
1544
0
  REQUIRE_TENSOR_EQ(ha, ga, "set forward result should be the same");
1545
0
  ccv_nnc_tensor_free(a);
1546
0
  ccv_nnc_tensor_free(ha);
1547
0
  ccv_nnc_tensor_free(ga);
1548
0
}
1549
1550
TEST_CASE("scaled dot product attention with mps")
1551
1
{
1552
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
1553
  // Bypass error: variable-sized object may not be initialized
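  // (The sizes are #define'd rather than declared const int because a C const int is not a
  // compile-time constant: the candidate arrays below would become variable-length arrays,
  // which cannot take brace initializers.)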
1554
0
#define num_long_trials 4
1555
0
#define num_short_trials 2
1556
0
#define num_trials (num_long_trials + num_short_trials)
1557
1558
0
  for (int trial = 0; trial < num_long_trials; ++trial) {
1559
0
    int B_candidates[num_trials] =         {  32, 1,  32,   3, 2, 1 };
1560
0
    int R_candidates[num_trials] =         { 128, 4096, 128,  61, 6, 2 };
1561
0
    int C_candidates[num_trials] =         { 128, 4096, 128,  49, 2, 1 };
1562
0
    int Hq_candidates[num_trials] =        {   8, 32,  32,  13, 3, 1 };
1563
0
    int Hk_candidates[num_trials] =        {   8, 32,   8,  13, 3, 1 };
1564
0
    int D_candidates[num_trials] =         {  64, 32, 128, 191, 4, 8 };
1565
0
    int is_causal_candidates[num_trials] = {   0, 0,   1,   0, 1, 0 };
1566
1567
0
    int B = B_candidates[trial];
1568
0
    int R = R_candidates[trial];
1569
0
    int C = C_candidates[trial];
1570
0
    int Hq = Hq_candidates[trial];
1571
0
    int Hk = Hk_candidates[trial];
1572
0
    int D = D_candidates[trial];
1573
0
    int is_causal = is_causal_candidates[trial];
1574
0
    float scale = 1.0 / sqrt((float)D);
1575
1576
0
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
1577
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1578
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1579
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1580
1581
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
1582
0
      q_tensor->data.f32[i] = (float)(i) / (float)(ccv_min(B * R * Hq * D, 131072));
1583
0
    }
1584
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
1585
0
      k_tensor->data.f32[i] = (float)(i) / (float)(ccv_min(B * C * Hk * D, 131072));
1586
0
    }
1587
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
1588
0
      v_tensor->data.f32[i] = (float)(i) / (float)(ccv_min(B * C * Hk * D, 131072));
1589
0
    }
1590
1591
0
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1592
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
1593
1594
    // Why is there 000 at the beginning of the argument list for GPU_TENSOR_NHWC?
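    // (It appears to be the compute-device index: these GPU_TENSOR_* macros take the device
    // as their first argument, so 000 selects GPU device 0.)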
1595
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
1596
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
1597
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
1598
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
1599
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
1600
1601
0
    if (is_causal)
1602
0
    {
1603
0
      ccv_nnc_tensor_t* const causal_mask = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, R, C), 0);
1604
0
      ccv_nnc_tensor_t* const gpu_causal_mask = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, R, C), 0);
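      // Build an explicit causal mask: row i masks key positions j > i - (R - C) with -FLT_MAX
      // (the diagonal is aligned to the bottom-right corner when R != C), everything else stays 0.
      // It is then fed to the GPU kernel with is_causal = 0 so the result can be compared against
      // the CPU path above, which used is_causal directly.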
1605
0
      for (int i = 0; i < R; i++)
1606
0
        for (int j = 0; j < C; j++)
1607
0
          causal_mask->data.f32[i * C + j] = 0;
1608
0
      for (int i = 0; i < R - 1; i++)
1609
0
        for (int j = i - R + C + 1; j < C; j++)
1610
0
          causal_mask->data.f32[i * C + j] = -FLT_MAX;
1611
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(causal_mask), TENSOR_LIST(gpu_causal_mask), 0);
1612
0
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_causal_mask), TENSOR_LIST(gpu_o_tensor), 0);
1613
0
      ccv_nnc_tensor_free(gpu_causal_mask);
1614
0
      ccv_nnc_tensor_free(causal_mask);
1615
0
    } else {
1616
0
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
1617
0
    }
1618
1619
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1620
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
1621
1622
0
    REQUIRE_TENSOR_EQ(copy_of_gpu_o_tensor, o_tensor, "scaled dot product attention result should be the same");
1623
1624
0
    ccv_nnc_tensor_free(o_tensor);
1625
0
    ccv_nnc_tensor_free(gpu_o_tensor);
1626
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
1627
0
    ccv_nnc_tensor_free(q_tensor);
1628
0
    ccv_nnc_tensor_free(k_tensor);
1629
0
    ccv_nnc_tensor_free(v_tensor);
1630
0
    ccv_nnc_tensor_free(gpu_q_tensor);
1631
0
    ccv_nnc_tensor_free(gpu_k_tensor);
1632
0
    ccv_nnc_tensor_free(gpu_v_tensor);
1633
0
  }
1634
0
#undef num_long_trials
1635
0
#undef num_short_trials
1636
0
#undef num_trials
1637
0
}
1638
1639
TEST_CASE("scaled dot product attention + unify head with mps")
1640
1
{
1641
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
1642
0
  ccv_nnc_symbolic_graph_t* const sdp_symbolic_graph = ccv_nnc_symbolic_graph_new();
1643
0
  ccv_nnc_tensor_symbol_t q = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
1644
0
  ccv_nnc_tensor_symbol_t k = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
1645
0
  ccv_nnc_tensor_symbol_t v = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
1646
0
  ccv_nnc_tensor_symbol_t w = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
1647
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
1648
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
1649
0
  ccv_nnc_tensor_symbol_t r = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
1650
0
  ccv_nnc_graph_exec_symbol_new(sdp_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q, k, v, NO_TENSOR_SYMBOL, w, bias), TENSOR_SYMBOL_LIST(r, NO_TENSOR_SYMBOL, c), "scaled_dot_product_attention");
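  // Per the test name, supplying w and bias as extra inputs asks the op to also unify the heads
  // with an output projection, producing r of shape [32, 128, 512] alongside the per-head
  // attention output c.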
1651
0
  ccv_nnc_graph_exec_symbol_autogen(sdp_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1652
0
  ccv_nnc_graph_t* sdp_graph = 0;
1653
0
  ccv_nnc_tensor_arena_t* sdp_tensor_arena = 0;
1654
0
  ccv_nnc_graph_exec_arena_t* sdp_graph_exec_arena = 0;
1655
0
  ccv_nnc_symbolic_graph_compile(sdp_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(sdp_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(sdp_symbolic_graph), &sdp_graph, &sdp_tensor_arena, &sdp_graph_exec_arena);
1656
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, q);
1657
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, k);
1658
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, v);
1659
0
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, w);
1660
0
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, bias);
1661
0
  dsfmt_t dsfmt;
1662
0
  int i;
1663
0
  dsfmt_init_gen_rand(&dsfmt, 1);
1664
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
1665
0
    q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1666
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
1667
0
    k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1668
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
1669
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1670
0
  for (i = 0; i < 512 * 512; i++)
1671
0
    w_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1672
0
  for (i = 0; i < 512; i++)
1673
0
    bias_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1674
0
  ccv_nnc_symbolic_graph_t* const g_symbolic_graph = ccv_nnc_symbolic_graph_new();
1675
0
  ccv_nnc_tensor_symbol_t gq = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "q");
1676
0
  ccv_nnc_tensor_symbol_t gk = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "k");
1677
0
  ccv_nnc_tensor_symbol_t gv = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "v");
1678
0
  ccv_nnc_tensor_symbol_t gw = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512, 512), "w");
1679
0
  ccv_nnc_tensor_symbol_t gbias = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512), "bias");
1680
0
  ccv_nnc_tensor_symbol_t gc = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "c");
1681
0
  ccv_nnc_tensor_symbol_t gr = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 512), "r");
1682
0
  ccv_nnc_graph_exec_symbol_new(g_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq, gk, gv, NO_TENSOR_SYMBOL, gw, gbias), TENSOR_SYMBOL_LIST(gr, NO_TENSOR_SYMBOL, gc), "scaled_dot_product_attention");
1683
0
  ccv_nnc_graph_exec_symbol_autogen(g_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1684
0
  ccv_nnc_graph_t* g_graph = 0;
1685
0
  ccv_nnc_tensor_arena_t* g_tensor_arena = 0;
1686
0
  ccv_nnc_graph_exec_arena_t* g_graph_exec_arena = 0;
1687
0
  ccv_nnc_symbolic_graph_compile(g_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(g_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(g_symbolic_graph), &g_graph, &g_tensor_arena, &g_graph_exec_arena);
1688
0
  ccv_nnc_tensor_t* const gq_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gq);
1689
0
  ccv_nnc_tensor_t* const gk_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gk);
1690
0
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gv);
1691
0
  ccv_nnc_tensor_t* const gw_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gw);
1692
0
  ccv_nnc_tensor_t* const gbias_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gbias);
1693
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, w_tensor, bias_tensor), TENSOR_LIST(gq_tensor, gk_tensor, gv_tensor, gw_tensor, gbias_tensor), 0);
1694
0
  ccv_nnc_graph_run(sdp_graph, 0, TRAVERSE_FULL, 0, 0);
1695
0
  ccv_nnc_graph_run(g_graph, 0, TRAVERSE_FULL, 0, 0);
1696
0
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, r);
1697
0
  ccv_nnc_tensor_t* const gr_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gr);
1698
0
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
1699
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gr_tensor), TENSOR_LIST(hr), 0);
1700
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, r_tensor->data.f32, hr->data.f32, 32 * 128 * 512, 1e-4, "graph computed result should match scaled dot product attention op result");
1701
0
  ccv_nnc_symbolic_graph_free(sdp_symbolic_graph);
1702
0
  ccv_nnc_tensor_arena_free(sdp_tensor_arena);
1703
0
  ccv_nnc_graph_exec_arena_free(sdp_graph_exec_arena);
1704
0
  ccv_nnc_graph_free(sdp_graph);
1705
0
  ccv_nnc_symbolic_graph_free(g_symbolic_graph);
1706
0
  ccv_nnc_tensor_arena_free(g_tensor_arena);
1707
0
  ccv_nnc_graph_exec_arena_free(g_graph_exec_arena);
1708
0
  ccv_nnc_graph_free(g_graph);
1709
0
  ccv_nnc_tensor_free(hr);
1710
0
}
1711
1712
TEST_CASE("scaled dot product attention gradient with mps")
1713
1
{
1714
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
1715
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
1716
0
#define num_long_trials 2
1717
0
#define num_short_trials 2
1718
0
#define num_trials (num_long_trials + num_short_trials)
1719
1720
0
  dsfmt_t dsfmt;
1721
0
  dsfmt_init_gen_rand(&dsfmt, 10);
1722
0
  for (int trial = 0; trial < num_trials; ++trial) {
1723
0
    int B_candidates[num_trials] = {  32,   3, 2, 1 };
1724
0
    int R_candidates[num_trials] = { 128,  61, 6, 2 };
1725
0
    int C_candidates[num_trials] = { 128,  49, 2, 1 };
1726
0
    int H_candidates[num_trials] = {   8,  13, 3, 1 };
1727
0
    int D_candidates[num_trials] = {  64, 191, 4, 8 };
1728
1729
0
    int B = B_candidates[trial];
1730
0
    int R = R_candidates[trial];
1731
0
    int C = C_candidates[trial];
1732
0
    int H = H_candidates[trial];
1733
0
    int D = D_candidates[trial];
1734
0
    float scale = 1.0 / sqrt((float)D);
1735
1736
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
1737
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
1738
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
1739
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
1740
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
1741
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
1742
1743
0
    for (int i = 0; i < B * R * H * D; ++i) {
1744
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1745
0
    }
1746
0
    for (int i = 0; i < B * C * H * D; ++i) {
1747
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1748
0
    }
1749
0
    for (int i = 0; i < B * C * H * D; ++i) {
1750
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1751
0
    }
1752
1753
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
1754
0
    for (int i = 0; i < B * R * H * D; ++i) {
1755
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1756
0
    }
1757
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
1758
1759
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
1760
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
1761
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
1762
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
1763
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
1764
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
1765
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
1766
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
1767
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
1768
1769
0
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
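    // The forward pass also emits a per-row softmax statistic (presumably the log-sum-exp,
    // shape [B, H, R]); it is kept so the backward command below can reuse it.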
1770
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
1771
1772
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
1773
1774
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
1775
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
1776
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
1777
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
1778
1779
0
    REQUIRE_TENSOR_EQ(copy_of_gpu_dv_tensor, dv_tensor, "scaled dot product attention result should be the same");
1780
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * H * D, 1e-6, "scaled dot product attention result should be the same");
1781
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * H * D, 1e-6, "scaled dot product attention result should be the same");
1782
1783
0
    ccv_nnc_tensor_free(do_tensor);
1784
0
    ccv_nnc_tensor_free(gpu_do_tensor);
1785
0
    ccv_nnc_tensor_free(gpu_o_tensor);
1786
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
1787
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
1788
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
1789
0
    ccv_nnc_tensor_free(q_tensor);
1790
0
    ccv_nnc_tensor_free(k_tensor);
1791
0
    ccv_nnc_tensor_free(v_tensor);
1792
0
    ccv_nnc_tensor_free(gpu_q_tensor);
1793
0
    ccv_nnc_tensor_free(gpu_k_tensor);
1794
0
    ccv_nnc_tensor_free(gpu_v_tensor);
1795
0
    ccv_nnc_tensor_free(dq_tensor);
1796
0
    ccv_nnc_tensor_free(dk_tensor);
1797
0
    ccv_nnc_tensor_free(dv_tensor);
1798
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
1799
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
1800
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
1801
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
1802
0
  }
1803
0
#undef num_long_trials
1804
0
#undef num_short_trials
1805
0
#undef num_trials
1806
0
}
1807
1808
TEST_CASE("scaled dot product attention gradient with mps in half-precision")
1809
1
{
1810
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
1811
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
1812
0
#define num_long_trials 8
1813
0
#define num_short_trials 4
1814
0
#define num_trials (num_long_trials + num_short_trials)
1815
1816
0
  dsfmt_t dsfmt;
1817
0
  dsfmt_init_gen_rand(&dsfmt, 10);
1818
0
  for (int trial = 0; trial < num_trials; ++trial) {
1819
0
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
1820
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
1821
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
1822
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
1823
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 64, 40, 160, 192, 256, 128 };
1824
1825
0
    const int B = B_candidates[trial];
1826
0
    const int R = R_candidates[trial];
1827
0
    const int C = C_candidates[trial];
1828
0
    const int Hq = Hq_candidates[trial];
1829
0
    const int Hk = Hq_candidates[trial];
1830
0
    const int D = D_candidates[trial];
1831
0
    const int is_causal = 0;
1832
0
    const float scale = 1.0 / sqrt((float)D);
1833
1834
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1835
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1836
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1837
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1838
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1839
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1840
1841
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
1842
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1843
0
    }
1844
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
1845
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1846
0
    }
1847
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
1848
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1849
0
    }
1850
1851
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1852
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
1853
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1854
0
    }
1855
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
1856
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
1857
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
1858
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
1859
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
1860
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);
1861
1862
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
1863
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
1864
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
1865
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
1866
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
1867
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
1868
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
1869
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
1870
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
1871
1872
0
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
1873
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
1874
1875
0
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
1876
0
    cmd.info.scaled_dot_product_attention.deterministic = 0;
1877
0
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
1878
1879
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
1880
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
1881
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
1882
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
1883
1884
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
1885
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1886
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
1887
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
1888
1889
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");
1890
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 3e-3, "scaled dot product attention result should be the same");
1891
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 6e-3, "GPU computed output should be the same as CPU computed ones");
1892
1893
0
    ccv_nnc_tensor_free(do_tensor);
1894
0
    ccv_nnc_tensor_free(gpu_do_tensor);
1895
0
    ccv_nnc_tensor_free(gpu_o_tensor);
1896
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
1897
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
1898
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
1899
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
1900
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
1901
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
1902
0
    ccv_nnc_tensor_free(q_tensor);
1903
0
    ccv_nnc_tensor_free(k_tensor);
1904
0
    ccv_nnc_tensor_free(v_tensor);
1905
0
    ccv_nnc_tensor_free(q_tensor_f16);
1906
0
    ccv_nnc_tensor_free(k_tensor_f16);
1907
0
    ccv_nnc_tensor_free(v_tensor_f16);
1908
0
    ccv_nnc_tensor_free(do_tensor_f16);
1909
0
    ccv_nnc_tensor_free(gpu_q_tensor);
1910
0
    ccv_nnc_tensor_free(gpu_k_tensor);
1911
0
    ccv_nnc_tensor_free(gpu_v_tensor);
1912
0
    ccv_nnc_tensor_free(dq_tensor);
1913
0
    ccv_nnc_tensor_free(dk_tensor);
1914
0
    ccv_nnc_tensor_free(dv_tensor);
1915
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
1916
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
1917
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
1918
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
1919
0
  }
1920
0
#undef num_long_trials
1921
0
#undef num_short_trials
1922
0
#undef num_trials
1923
0
}
1924
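// Backward GEMM on MPS with no transpose: the input gradient h, weight gradient db, and dbias are checked against hand-computed values.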
1925
TEST_CASE("backward gemm with no transpose")
1926
1
{
1927
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
1928
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
1929
0
  float gp[] = {
1930
0
    1, 2, 3,
1931
0
    4, 5, 6,
1932
0
    7, 8, 9,
1933
0
    10, 11, 12,
1934
0
  };
1935
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1936
1937
0
  float ap[] = {
1938
0
    13, 14,
1939
0
    15, 16,
1940
0
    17, 18,
1941
0
    19, 20,
1942
0
  };
1943
1944
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1945
1946
0
  float bp[] = {
1947
0
    21, 22, 23,
1948
0
    24, 25, 26,
1949
0
  };
1950
1951
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1952
1953
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1954
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1955
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1956
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
1957
1958
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1959
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1960
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1961
0
  ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
1962
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
1963
0
  cmd.algorithm = 1; // Pin a fixed algorithm for the MPS backend selected above.
1964
1965
0
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(h, db, dbias), 0);
1966
1967
0
  ccv_nnc_tensor_t* const ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1968
0
  ccv_nnc_tensor_t* const cdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1969
0
  ccv_nnc_tensor_t* const cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1970
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(h, db, dbias), TENSOR_LIST(ch, cdb, cdbias), 0);
1971
1972
0
  float dbiastp[] = {
1973
0
    22, 26, 30,
1974
0
  };
1975
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
1976
1977
0
  REQUIRE_TENSOR_EQ(cdbias, &dbiast, "bias should be equal");
1978
0
  float htp[] = {
1979
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
1980
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
1981
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
1982
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
1983
0
  };
1984
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1985
1986
0
  REQUIRE_TENSOR_EQ(ch, &ht, "h should be equal");
1987
0
  float dbtp[] = {
1988
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
1989
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
1990
0
  };
1991
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1992
0
  REQUIRE_TENSOR_EQ(cdb, &dbt, "db should be equal");
1993
0
  ccv_nnc_tensor_free(g);
1994
0
  ccv_nnc_tensor_free(a);
1995
0
  ccv_nnc_tensor_free(b);
1996
0
  ccv_nnc_tensor_free(h);
1997
0
  ccv_nnc_tensor_free(db);
1998
0
  ccv_nnc_tensor_free(dbias);
  // Release the GPU inputs and the CPU staging copies as well.
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(ch);
  ccv_nnc_tensor_free(cdb);
  ccv_nnc_tensor_free(cdbias);
1999
0
}
2000
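// Same backward GEMM check, but with a stored transposed (2x4) and TRANSPOSE(0, 1) applied to a.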
2001
TEST_CASE("backward gemm with transpose a")
2002
1
{
2003
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2004
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2005
0
  float gp[] = {
2006
0
    1, 2, 3,
2007
0
    4, 5, 6,
2008
0
    7, 8, 9,
2009
0
    10, 11, 12,
2010
0
  };
2011
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
2012
0
  float ap[] = {
2013
0
    13, 15, 17, 19,
2014
0
    14, 16, 18, 20,
2015
0
  };
2016
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
2017
0
  float bp[] = {
2018
0
    21, 22, 23,
2019
0
    24, 25, 26,
2020
0
  };
2021
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2022
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
2023
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2024
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
2025
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
2026
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
2027
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
2028
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
2029
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
2030
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
2031
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2032
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2033
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2034
0
  float dbiastp[] = {
2035
0
    22, 26, 30,
2036
0
  };
2037
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
2038
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2039
0
  float htp[] = {
2040
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
2041
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
2042
0
  };
2043
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
2044
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2045
0
  float dbtp[] = {
2046
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
2047
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2048
0
  };
2049
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2050
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2051
0
  ccv_nnc_tensor_free(g);
2052
0
  ccv_nnc_tensor_free(a);
2053
0
  ccv_nnc_tensor_free(b);
2054
0
  ccv_nnc_tensor_free(h);
2055
0
  ccv_nnc_tensor_free(db);
2056
0
  ccv_nnc_tensor_free(dbias);
2057
0
  ccv_nnc_tensor_free(gg);
2058
0
  ccv_nnc_tensor_free(ga);
2059
0
  ccv_nnc_tensor_free(gb);
2060
0
  ccv_nnc_tensor_free(gh);
2061
0
  ccv_nnc_tensor_free(gdb);
2062
0
  ccv_nnc_tensor_free(gdbias);
2063
0
}
2064
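// Backward GEMM with b stored transposed (3x2) and TRANSPOSE(0, 1) applied to b.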
2065
TEST_CASE("backward gemm with transpose b")
2066
1
{
2067
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2068
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2069
0
  float gp[] = {
2070
0
    1, 2, 3,
2071
0
    4, 5, 6,
2072
0
    7, 8, 9,
2073
0
    10, 11, 12,
2074
0
  };
2075
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
2076
0
  float ap[] = {
2077
0
    13, 14,
2078
0
    15, 16,
2079
0
    17, 18,
2080
0
    19, 20,
2081
0
  };
2082
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
2083
0
  float bp[] = {
2084
0
    21, 24,
2085
0
    22, 25,
2086
0
    23, 26,
2087
0
  };
2088
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2089
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
2090
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2091
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
2092
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
2093
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
2094
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
2095
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
2096
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
2097
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
2098
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2099
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2100
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2101
0
  float dbiastp[] = {
2102
0
    22, 26, 30,
2103
0
  };
2104
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
2105
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2106
0
  float htp[] = {
2107
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
2108
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
2109
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
2110
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
2111
0
  };
2112
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
2113
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2114
0
  float dbtp[] = {
2115
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
2116
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
2117
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2118
0
  };
2119
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2120
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2121
0
  ccv_nnc_tensor_free(g);
2122
0
  ccv_nnc_tensor_free(a);
2123
0
  ccv_nnc_tensor_free(b);
2124
0
  ccv_nnc_tensor_free(h);
2125
0
  ccv_nnc_tensor_free(db);
2126
0
  ccv_nnc_tensor_free(dbias);
2127
0
  ccv_nnc_tensor_free(gg);
2128
0
  ccv_nnc_tensor_free(ga);
2129
0
  ccv_nnc_tensor_free(gb);
2130
0
  ccv_nnc_tensor_free(gh);
2131
0
  ccv_nnc_tensor_free(gdb);
2132
0
  ccv_nnc_tensor_free(gdbias);
2133
0
}
2134
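// Backward GEMM with both a and b stored transposed.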
2135
TEST_CASE("backward gemm with transpose a and b")
2136
1
{
2137
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2138
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2139
0
  float gp[] = {
2140
0
    1, 2, 3,
2141
0
    4, 5, 6,
2142
0
    7, 8, 9,
2143
0
    10, 11, 12,
2144
0
  };
2145
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
2146
0
  float ap[] = {
2147
0
    13, 15, 17, 19,
2148
0
    14, 16, 18, 20,
2149
0
  };
2150
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
2151
0
  float bp[] = {
2152
0
    21, 24,
2153
0
    22, 25,
2154
0
    23, 26,
2155
0
  };
2156
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2157
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
2158
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2159
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
2160
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
2161
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
2162
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
2163
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
2164
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
2165
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
2166
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2167
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2168
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2169
0
  float dbiastp[] = {
2170
0
    22, 26, 30,
2171
0
  };
2172
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
2173
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2174
0
  float htp[] = {
2175
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
2176
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
2177
0
  };
2178
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
2179
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2180
0
  float dbtp[] = {
2181
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
2182
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
2183
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2184
0
  };
2185
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2186
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2187
0
  ccv_nnc_tensor_free(g);
2188
0
  ccv_nnc_tensor_free(a);
2189
0
  ccv_nnc_tensor_free(b);
2190
0
  ccv_nnc_tensor_free(h);
2191
0
  ccv_nnc_tensor_free(db);
2192
0
  ccv_nnc_tensor_free(dbias);
2193
0
  ccv_nnc_tensor_free(gg);
2194
0
  ccv_nnc_tensor_free(ga);
2195
0
  ccv_nnc_tensor_free(gb);
2196
0
  ccv_nnc_tensor_free(gh);
2197
0
  ccv_nnc_tensor_free(gdb);
2198
0
  ccv_nnc_tensor_free(gdbias);
2199
0
}
2200
2201
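// Backward GEMM on random 10x128 inputs and 64x128 weights: GPU forward/backward results are compared against the CPU reference.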
2202
TEST_CASE("backward gemm large data set")
2203
1
{
2204
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2205
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2206
0
  dsfmt_t dsfmt;
2207
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2208
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2209
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2210
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2211
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2212
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2213
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2214
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2215
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2216
2217
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2218
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2219
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2220
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2221
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2222
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2223
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2224
0
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2225
0
  int i;
2226
0
  for (i = 0; i < 64 * 128; i++)
2227
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2228
0
  for (i = 0; i < 64; i++)
2229
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2230
0
  for (i = 0; i < 10 * 128; i++)
2231
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2232
0
  for (i = 0; i < 10 * 64; i++)
2233
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2234
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
2235
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2236
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
2237
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2238
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
2239
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2240
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2241
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2242
0
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2243
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
2244
0
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
2245
0
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
2246
0
  REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
2247
0
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
2248
0
  ccv_nnc_tensor_free(a);
2249
0
  ccv_nnc_tensor_free(w);
2250
0
  ccv_nnc_tensor_free(bias);
2251
0
  ccv_nnc_tensor_free(b);
2252
0
  ccv_nnc_tensor_free(g);
2253
0
  ccv_nnc_tensor_free(dw);
2254
0
  ccv_nnc_tensor_free(dbias);
2255
0
  ccv_nnc_tensor_free(h);
2256
0
  ccv_nnc_tensor_free(ha);
2257
0
  ccv_nnc_tensor_free(hw);
2258
0
  ccv_nnc_tensor_free(hbias);
2259
0
  ccv_nnc_tensor_free(hb);
2260
0
  ccv_nnc_tensor_free(hg);
2261
0
  ccv_nnc_tensor_free(hdw);
2262
0
  ccv_nnc_tensor_free(hdbias);
2263
0
  ccv_nnc_tensor_free(hh);
2264
0
  ccv_nnc_tensor_free(tb);
2265
0
  ccv_nnc_tensor_free(th);
2266
0
  ccv_nnc_tensor_free(tdw);
2267
0
  ccv_nnc_tensor_free(tdbias);
2268
0
}
2269
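// Same random-data check without a bias tensor, so no dbias is produced.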
2270
TEST_CASE("backward gemm no bias")
2271
1
{
2272
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2273
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2274
0
  dsfmt_t dsfmt;
2275
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2276
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2277
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2278
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2279
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2280
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2281
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2282
2283
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2284
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2285
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2286
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2287
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2288
0
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2289
0
  int i;
2290
0
  for (i = 0; i < 64 * 128; i++)
2291
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2292
0
  for (i = 0; i < 10 * 128; i++)
2293
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2294
0
  for (i = 0; i < 10 * 64; i++)
2295
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2296
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);
2297
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2298
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
2299
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2300
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
2301
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2302
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2303
0
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2304
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
2305
0
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
2306
0
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
2307
0
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
2308
0
  ccv_nnc_tensor_free(a);
2309
0
  ccv_nnc_tensor_free(w);
2310
0
  ccv_nnc_tensor_free(b);
2311
0
  ccv_nnc_tensor_free(g);
2312
0
  ccv_nnc_tensor_free(dw);
2313
0
  ccv_nnc_tensor_free(h);
2314
0
  ccv_nnc_tensor_free(ha);
2315
0
  ccv_nnc_tensor_free(hw);
2316
0
  ccv_nnc_tensor_free(hb);
2317
0
  ccv_nnc_tensor_free(hg);
2318
0
  ccv_nnc_tensor_free(hdw);
2319
0
  ccv_nnc_tensor_free(hh);
2320
0
  ccv_nnc_tensor_free(tb);
2321
0
  ccv_nnc_tensor_free(th);
2322
0
  ccv_nnc_tensor_free(tdw);
2323
0
}
2324
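// Backward GEMM that skips the input gradient h and only produces dw and dbias.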
2325
TEST_CASE("backward gemm no h")
2326
1
{
2327
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2328
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2329
0
  dsfmt_t dsfmt;
2330
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2331
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2332
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2333
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2334
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2335
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2336
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2337
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2338
2339
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2340
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2341
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2342
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2343
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2344
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2345
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2346
0
  int i;
2347
0
  for (i = 0; i < 64 * 128; i++)
2348
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2349
0
  for (i = 0; i < 64; i++)
2350
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2351
0
  for (i = 0; i < 10 * 128; i++)
2352
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2353
0
  for (i = 0; i < 10 * 64; i++)
2354
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2355
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
2356
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2357
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(0, hdw, hdbias), 0);
2358
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2359
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(0, dw, dbias), 0);
2360
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2361
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2362
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2363
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, 0), TENSOR_LIST(tb, tdw, tdbias, 0), 0);
2364
0
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
2365
0
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
2366
0
  REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
2367
0
  ccv_nnc_tensor_free(a);
2368
0
  ccv_nnc_tensor_free(w);
2369
0
  ccv_nnc_tensor_free(bias);
2370
0
  ccv_nnc_tensor_free(b);
2371
0
  ccv_nnc_tensor_free(g);
2372
0
  ccv_nnc_tensor_free(dw);
2373
0
  ccv_nnc_tensor_free(dbias);
2374
0
  ccv_nnc_tensor_free(ha);
2375
0
  ccv_nnc_tensor_free(hw);
2376
0
  ccv_nnc_tensor_free(hbias);
2377
0
  ccv_nnc_tensor_free(hb);
2378
0
  ccv_nnc_tensor_free(hg);
2379
0
  ccv_nnc_tensor_free(hdw);
2380
0
  ccv_nnc_tensor_free(hdbias);
2381
0
  ccv_nnc_tensor_free(tb);
2382
0
  ccv_nnc_tensor_free(tdw);
2383
0
  ccv_nnc_tensor_free(tdbias);
2384
0
}
2385
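// Backward GEMM that skips the weight gradient dw and only produces h and dbias.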
2386
TEST_CASE("backward gemm no dw")
2387
1
{
2388
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2389
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2390
0
  dsfmt_t dsfmt;
2391
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2392
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2393
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2394
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2395
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2396
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2397
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2398
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2399
2400
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2401
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2402
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2403
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2404
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2405
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2406
0
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2407
0
  int i;
2408
0
  for (i = 0; i < 64 * 128; i++)
2409
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2410
0
  for (i = 0; i < 64; i++)
2411
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2412
0
  for (i = 0; i < 10 * 128; i++)
2413
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2414
0
  for (i = 0; i < 10 * 64; i++)
2415
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2416
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
2417
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2418
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, 0, hdbias), 0);
2419
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2420
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, 0, dbias), 0);
2421
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2422
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2423
0
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2424
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, 0, dbias, h), TENSOR_LIST(tb, 0, tdbias, th), 0);
2425
0
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
2426
0
  REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
2427
0
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
2428
0
  ccv_nnc_tensor_free(a);
2429
0
  ccv_nnc_tensor_free(w);
2430
0
  ccv_nnc_tensor_free(bias);
2431
0
  ccv_nnc_tensor_free(b);
2432
0
  ccv_nnc_tensor_free(g);
2433
0
  ccv_nnc_tensor_free(dbias);
2434
0
  ccv_nnc_tensor_free(h);
2435
0
  ccv_nnc_tensor_free(ha);
2436
0
  ccv_nnc_tensor_free(hw);
2437
0
  ccv_nnc_tensor_free(hbias);
2438
0
  ccv_nnc_tensor_free(hb);
2439
0
  ccv_nnc_tensor_free(hg);
2440
0
  ccv_nnc_tensor_free(hdbias);
2441
0
  ccv_nnc_tensor_free(hh);
2442
0
  ccv_nnc_tensor_free(tb);
2443
0
  ccv_nnc_tensor_free(th);
2444
0
  ccv_nnc_tensor_free(tdbias);
2445
0
}
2446
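// Batched backward GEMM (batch of 2) sharing a single b: db and dbias accumulate over both batch items.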
2447
TEST_CASE("backwar gemm with no transpose batch 2, same b")
2448
1
{
2449
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2450
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2451
0
  float gp[] = {
2452
0
    1, 2, 3,
2453
0
    4, 5, 6,
2454
0
    7, 8, 9,
2455
0
    10, 11, 12,
2456
0
    10, 20, 30,
2457
0
    40, 50, 60,
2458
0
    70, 80, 90,
2459
0
    100, 110, 120,
2460
0
  };
2461
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2462
0
  float ap[] = {
2463
0
    13, 14,
2464
0
    15, 16,
2465
0
    17, 18,
2466
0
    19, 20,
2467
0
    131, 141,
2468
0
    151, 161,
2469
0
    171, 181,
2470
0
    191, 201,
2471
0
  };
2472
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2473
0
  float bp[] = {
2474
0
    21, 22, 23,
2475
0
    24, 25, 26,
2476
0
  };
2477
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2478
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2479
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2480
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
2481
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2482
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2483
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
2484
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2485
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
2486
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
2487
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2488
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2489
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2490
0
  float dbiastp[] = {
2491
0
    22 + 220, 26 + 260, 30 + 300,
2492
0
  };
2493
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
2494
  
2495
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2496
0
  float htp[] = {
2497
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
2498
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
2499
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
2500
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
2501
0
    10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
2502
0
    40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
2503
0
    70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
2504
0
    100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
2505
0
  };
2506
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2507
  
2508
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2509
0
  float dbtp[] = {
2510
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
2511
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2512
0
  };
2513
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2514
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2515
0
  ccv_nnc_tensor_free(g);
2516
0
  ccv_nnc_tensor_free(a);
2517
0
  ccv_nnc_tensor_free(b);
2518
0
  ccv_nnc_tensor_free(h);
2519
0
  ccv_nnc_tensor_free(db);
2520
0
  ccv_nnc_tensor_free(dbias);
2521
0
  ccv_nnc_tensor_free(gg);
2522
0
  ccv_nnc_tensor_free(ga);
2523
0
  ccv_nnc_tensor_free(gb);
2524
0
  ccv_nnc_tensor_free(gh);
2525
0
  ccv_nnc_tensor_free(gdb);
2526
0
  ccv_nnc_tensor_free(gdbias);
2527
0
}
2528
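// Batched backward GEMM with a per-batch b, so db and dbias are produced per batch item.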
2529
TEST_CASE("backward gemm with no transpose batch 2, batched b")
2530
1
{
2531
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2532
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2533
0
  float gp[] = {
2534
0
    1, 2, 3,
2535
0
    4, 5, 6,
2536
0
    7, 8, 9,
2537
0
    10, 11, 12,
2538
0
    10, 20, 30,
2539
0
    40, 50, 60,
2540
0
    70, 80, 90,
2541
0
    100, 110, 120,
2542
0
  };
2543
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2544
0
  float ap[] = {
2545
0
    13, 14,
2546
0
    15, 16,
2547
0
    17, 18,
2548
0
    19, 20,
2549
0
    131, 141,
2550
0
    151, 161,
2551
0
    171, 181,
2552
0
    191, 201,
2553
0
  };
2554
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2555
0
  float bp[] = {
2556
0
    21, 22, 23,
2557
0
    24, 25, 26,
2558
0
    212, 222, 232,
2559
0
    242, 252, 262,
2560
0
  };
2561
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
2562
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2563
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
2564
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
2565
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2566
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2567
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
2568
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2569
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
2570
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
2571
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2572
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2573
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2574
0
  float dbiastp[] = {
2575
0
    22, 26, 30,
2576
0
    220, 260, 300,
2577
0
  };
2578
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
2579
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2580
0
  float htp[] = {
2581
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
2582
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
2583
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
2584
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
2585
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
2586
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
2587
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
2588
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
2589
0
  };
2590
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2591
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2592
0
  float dbtp[] = {
2593
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
2594
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2595
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
2596
0
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2597
0
  };
2598
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
2599
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2600
0
  ccv_nnc_tensor_free(g);
2601
0
  ccv_nnc_tensor_free(a);
2602
0
  ccv_nnc_tensor_free(b);
2603
0
  ccv_nnc_tensor_free(h);
2604
0
  ccv_nnc_tensor_free(db);
2605
0
  ccv_nnc_tensor_free(dbias);
2606
0
  ccv_nnc_tensor_free(gg);
2607
0
  ccv_nnc_tensor_free(ga);
2608
0
  ccv_nnc_tensor_free(gb);
2609
0
  ccv_nnc_tensor_free(gh);
2610
0
  ccv_nnc_tensor_free(gdb);
2611
0
  ccv_nnc_tensor_free(gdbias);
2612
0
}
2613
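// Batched backward GEMM with a stored transposed within each batch item (TRANSPOSE(1, 2)) and a shared b.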
2614
TEST_CASE("backward gemm with transpose a batch 2, same b")
2615
1
{
2616
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2617
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2618
0
  float gp[] = {
2619
0
    1, 2, 3,
2620
0
    4, 5, 6,
2621
0
    7, 8, 9,
2622
0
    10, 11, 12,
2623
0
    10, 20, 30,
2624
0
    40, 50, 60,
2625
0
    70, 80, 90,
2626
0
    100, 110, 120,
2627
0
  };
2628
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2629
0
  float ap[] = {
2630
0
    13, 15, 17, 19,
2631
0
    14, 16, 18, 20,
2632
0
    131, 151, 171, 191,
2633
0
    141, 161, 181, 201,
2634
0
  };
2635
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
2636
0
  float bp[] = {
2637
0
    21, 22, 23,
2638
0
    24, 25, 26,
2639
0
  };
2640
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2641
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
2642
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2643
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
2644
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2645
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
2646
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
2647
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
2648
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
2649
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
2650
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2651
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2652
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2653
0
  float dbiastp[] = {
2654
0
    22 + 220, 26 + 260, 30 + 300,
2655
0
  };
2656
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
2657
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2658
0
  float htp[] = {
2659
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
2660
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
2661
0
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
2662
0
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
2663
0
  };
2664
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
2665
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2666
0
  float dbtp[] = {
2667
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
2668
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2669
0
  };
2670
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
2671
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2672
0
  ccv_nnc_tensor_free(g);
2673
0
  ccv_nnc_tensor_free(a);
2674
0
  ccv_nnc_tensor_free(b);
2675
0
  ccv_nnc_tensor_free(h);
2676
0
  ccv_nnc_tensor_free(db);
2677
0
  ccv_nnc_tensor_free(dbias);
2678
0
  ccv_nnc_tensor_free(gg);
2679
0
  ccv_nnc_tensor_free(ga);
2680
0
  ccv_nnc_tensor_free(gb);
2681
0
  ccv_nnc_tensor_free(gh);
2682
0
  ccv_nnc_tensor_free(gdb);
2683
0
  ccv_nnc_tensor_free(gdbias);
2684
0
}
2685
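// Batched backward GEMM with a per-batch b stored transposed (TRANSPOSE(1, 2) on b).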
2686
TEST_CASE("backward gemm with transpose b batch 2, batched b")
2687
1
{
2688
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2689
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2690
0
  float gp[] = {
2691
0
    1, 2, 3,
2692
0
    4, 5, 6,
2693
0
    7, 8, 9,
2694
0
    10, 11, 12,
2695
0
    10, 20, 30,
2696
0
    40, 50, 60,
2697
0
    70, 80, 90,
2698
0
    100, 110, 120,
2699
0
  };
2700
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2701
0
  float ap[] = {
2702
0
    13, 14,
2703
0
    15, 16,
2704
0
    17, 18,
2705
0
    19, 20,
2706
0
    131, 141,
2707
0
    151, 161,
2708
0
    171, 181,
2709
0
    191, 201,
2710
0
  };
2711
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2712
0
  float bp[] = {
2713
0
    21, 24,
2714
0
    22, 25,
2715
0
    23, 26,
2716
0
    212, 242,
2717
0
    222, 252,
2718
0
    232, 262,
2719
0
  };
2720
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
2721
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2722
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
2723
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
2724
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2725
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2726
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
2727
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2728
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
2729
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
2730
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2731
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2732
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2733
0
  float dbiastp[] = {
2734
0
    22, 26, 30,
2735
0
    220, 260, 300,
2736
0
  };
2737
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
2738
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2739
0
  float htp[] = {
2740
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
2741
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
2742
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
2743
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
2744
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
2745
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
2746
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
2747
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
2748
0
  };
2749
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2750
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2751
0
  float dbtp[] = {
2752
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
2753
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
2754
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2755
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
2756
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
2757
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2758
0
  };
2759
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
2760
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2761
0
  ccv_nnc_tensor_free(g);
2762
0
  ccv_nnc_tensor_free(a);
2763
0
  ccv_nnc_tensor_free(b);
2764
0
  ccv_nnc_tensor_free(h);
2765
0
  ccv_nnc_tensor_free(db);
2766
0
  ccv_nnc_tensor_free(dbias);
2767
0
  ccv_nnc_tensor_free(gg);
2768
0
  ccv_nnc_tensor_free(ga);
2769
0
  ccv_nnc_tensor_free(gb);
2770
0
  ccv_nnc_tensor_free(gh);
2771
0
  ccv_nnc_tensor_free(gdb);
2772
0
  ccv_nnc_tensor_free(gdbias);
2773
0
}
2774
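// Batched backward GEMM with a transposed within each batch item and a shared transposed b.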
2775
TEST_CASE("backward gemm with transpose a and b batch 2, same b")
2776
1
{
2777
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2778
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2779
0
  float gp[] = {
2780
0
    1, 2, 3,
2781
0
    4, 5, 6,
2782
0
    7, 8, 9,
2783
0
    10, 11, 12,
2784
0
    10, 20, 30,
2785
0
    40, 50, 60,
2786
0
    70, 80, 90,
2787
0
    100, 110, 120,
2788
0
  };
2789
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2790
0
  float ap[] = {
2791
0
    13, 15, 17, 19,
2792
0
    14, 16, 18, 20,
2793
0
    131, 151, 171, 191,
2794
0
    141, 161, 181, 201,
2795
0
  };
2796
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
2797
0
  float bp[] = {
2798
0
    21, 24,
2799
0
    22, 25,
2800
0
    23, 26,
2801
0
  };
2802
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2803
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
2804
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2805
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
2806
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2807
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
2808
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
2809
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
2810
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
2811
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
2812
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2813
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
2814
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
2815
0
  float dbiastp[] = {
2816
0
    22 + 220, 26 + 260, 30 + 300,
2817
0
  };
2818
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
2819
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
2820
0
  float htp[] = {
2821
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
2822
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
2823
0
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
2824
0
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
2825
0
  };
2826
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
2827
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2828
0
  float dbtp[] = {
2829
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
2830
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
2831
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2832
0
  };
2833
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
2834
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2835
0
  ccv_nnc_tensor_free(g);
2836
0
  ccv_nnc_tensor_free(a);
2837
0
  ccv_nnc_tensor_free(b);
2838
0
  ccv_nnc_tensor_free(h);
2839
0
  ccv_nnc_tensor_free(db);
2840
0
  ccv_nnc_tensor_free(dbias);
2841
0
  ccv_nnc_tensor_free(gg);
2842
0
  ccv_nnc_tensor_free(ga);
2843
0
  ccv_nnc_tensor_free(gb);
2844
0
  ccv_nnc_tensor_free(gh);
2845
0
  ccv_nnc_tensor_free(gdb);
2846
0
  ccv_nnc_tensor_free(gdbias);
2847
0
}
2848
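The expected arrays dbiastp, htp, and dbtp above are hand-expanded instances of the usual GEMM backward identities for the forward pass C_n = A_n^T B^T + bias, with A_n of shape 2x4, B of shape 3x2 shared across the batch of two, and G_n = dL/dC_n of shape 4x3 (a sketch of the math the test encodes, not a statement about the MPS kernel internals):

\[
\partial\mathrm{bias}_k = \sum_n \sum_i G_{n,i,k}, \qquad
\partial A_{n,m,j} = \sum_k G_{n,j,k}\, B_{k,m}, \qquad
\partial B_{k,m} = \sum_n \sum_j G_{n,j,k}\, A_{n,m,j}.
\]

Since B is shared, its gradient accumulates over both batch items and dbias reduces over batch and rows; e.g. dbiastp[0] = (1 + 4 + 7 + 10) + (10 + 40 + 70 + 100) = 22 + 220.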
2849
TEST_CASE("backward gemm with no transpose batch 2, batched b, no bias")
2850
1
{
2851
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2852
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2853
0
  float gp[] = {
2854
0
    1, 2, 3,
2855
0
    4, 5, 6,
2856
0
    7, 8, 9,
2857
0
    10, 11, 12,
2858
0
    10, 20, 30,
2859
0
    40, 50, 60,
2860
0
    70, 80, 90,
2861
0
    100, 110, 120,
2862
0
  };
2863
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2864
0
  float ap[] = {
2865
0
    13, 14,
2866
0
    15, 16,
2867
0
    17, 18,
2868
0
    19, 20,
2869
0
    131, 141,
2870
0
    151, 161,
2871
0
    171, 181,
2872
0
    191, 201,
2873
0
  };
2874
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2875
0
  float bp[] = {
2876
0
    21, 22, 23,
2877
0
    24, 25, 26,
2878
0
    212, 222, 232,
2879
0
    242, 252, 262,
2880
0
  };
2881
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
2882
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2883
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
2884
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2885
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2886
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
2887
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2888
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
2889
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2890
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
2891
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
2892
0
  float htp[] = {
2893
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
2894
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
2895
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
2896
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
2897
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
2898
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
2899
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
2900
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
2901
0
  };
2902
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2903
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2904
0
  float dbtp[] = {
2905
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
2906
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2907
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
2908
0
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2909
0
  };
2910
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
2911
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2912
0
  ccv_nnc_tensor_free(g);
2913
0
  ccv_nnc_tensor_free(a);
2914
0
  ccv_nnc_tensor_free(b);
2915
0
  ccv_nnc_tensor_free(h);
2916
0
  ccv_nnc_tensor_free(db);
2917
0
  ccv_nnc_tensor_free(gg);
2918
0
  ccv_nnc_tensor_free(ga);
2919
0
  ccv_nnc_tensor_free(gb);
2920
0
  ccv_nnc_tensor_free(gh);
2921
0
  ccv_nnc_tensor_free(gdb);
2922
0
}
2923
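In the batched, no-transpose case above (no bias output requested), the expected htp and dbtp follow from dA_n = G_n B_n^T and dB_n = A_n^T G_n, applied per batch item. Below is a minimal CPU reference sketch in plain C, independent of the ccv_nnc API, with the array values copied from the test; it reproduces two representative entries:

#include <stdio.h>

int main(void)
{
  /* Values copied from gp, ap and bp in the test above. */
  float g[2][4][3] = {
    { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, { 10, 11, 12 } },
    { { 10, 20, 30 }, { 40, 50, 60 }, { 70, 80, 90 }, { 100, 110, 120 } },
  };
  float a[2][4][2] = {
    { { 13, 14 }, { 15, 16 }, { 17, 18 }, { 19, 20 } },
    { { 131, 141 }, { 151, 161 }, { 171, 181 }, { 191, 201 } },
  };
  float b[2][2][3] = {
    { { 21, 22, 23 }, { 24, 25, 26 } },
    { { 212, 222, 232 }, { 242, 252, 262 } },
  };
  float h[2][4][2] = { 0 };  /* dA_n = G_n * B_n^T */
  float db[2][2][3] = { 0 }; /* dB_n = A_n^T * G_n */
  int n, i, k, m;
  for (n = 0; n < 2; n++)
    for (i = 0; i < 4; i++)
      for (k = 0; k < 3; k++)
        for (m = 0; m < 2; m++)
        {
          h[n][i][m] += g[n][i][k] * b[n][m][k];
          db[n][m][k] += a[n][i][m] * g[n][i][k];
        }
  /* These match the first entries of htp (134) and dbtp (382). */
  printf("h[0][0][0] = %g, expected %d\n", h[0][0][0], 1 * 21 + 2 * 22 + 3 * 23);
  printf("db[0][0][0] = %g, expected %d\n", db[0][0][0], 1 * 13 + 4 * 15 + 7 * 17 + 10 * 19);
  return 0;
}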
2924
TEST_CASE("backward gemm with transpose b batch 2, batched b, no bias")
2925
1
{
2926
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
2927
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2928
0
  float gp[] = {
2929
0
    1, 2, 3,
2930
0
    4, 5, 6,
2931
0
    7, 8, 9,
2932
0
    10, 11, 12,
2933
0
    10, 20, 30,
2934
0
    40, 50, 60,
2935
0
    70, 80, 90,
2936
0
    100, 110, 120,
2937
0
  };
2938
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2939
0
  float ap[] = {
2940
0
    13, 14,
2941
0
    15, 16,
2942
0
    17, 18,
2943
0
    19, 20,
2944
0
    131, 141,
2945
0
    151, 161,
2946
0
    171, 181,
2947
0
    191, 201,
2948
0
  };
2949
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2950
0
  float bp[] = {
2951
0
    21, 24,
2952
0
    22, 25,
2953
0
    23, 26,
2954
0
    212, 242,
2955
0
    222, 252,
2956
0
    232, 262,
2957
0
  };
2958
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
2959
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2960
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
2961
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2962
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2963
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
2964
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2965
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
2966
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
2967
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
2968
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
2969
0
  float htp[] = {
2970
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
2971
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
2972
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
2973
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
2974
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
2975
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
2976
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
2977
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
2978
0
  };
2979
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
2980
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
2981
0
  float dbtp[] = {
2982
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
2983
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
2984
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
2985
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
2986
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
2987
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
2988
0
  };
2989
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
2990
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
2991
0
  ccv_nnc_tensor_free(g);
2992
0
  ccv_nnc_tensor_free(a);
2993
0
  ccv_nnc_tensor_free(b);
2994
0
  ccv_nnc_tensor_free(h);
2995
0
  ccv_nnc_tensor_free(db);
2996
0
  ccv_nnc_tensor_free(gg);
2997
0
  ccv_nnc_tensor_free(ga);
2998
0
  ccv_nnc_tensor_free(gb);
2999
0
  ccv_nnc_tensor_free(gh);
3000
0
  ccv_nnc_tensor_free(gdb);
3001
0
}
3002
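In the transpose-b variant above, the forward pass is C_n = A_n B_n^T per batch item, so the expected gradients expand, per batch item n, as

\[
\partial A_{n,i,m} = \sum_k G_{n,i,k}\, B_{n,k,m}, \qquad
\partial B_{n,k,m} = \sum_i G_{n,i,k}\, A_{n,i,m},
\]

i.e. dA_n = G_n B_n and dB_n = G_n^T A_n. That is why htp is identical to the no-transpose case (b here holds the same data, just stored transposed) while dbtp is the per-batch transpose of the earlier db, laid out as 2x3x2.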
3003
TEST_CASE("backward gemm with transpose a and b batch 2, batch b, no bias")
3004
1
{
3005
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
3006
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
3007
0
  float gp[] = {
3008
0
    1, 2, 3,
3009
0
    4, 5, 6,
3010
0
    7, 8, 9,
3011
0
    10, 11, 12,
3012
0
    10, 20, 30,
3013
0
    40, 50, 60,
3014
0
    70, 80, 90,
3015
0
    100, 110, 120,
3016
0
  };
3017
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
3018
0
  float ap[] = {
3019
0
    13, 15, 17, 19,
3020
0
    14, 16, 18, 20,
3021
0
    131, 151, 171, 191,
3022
0
    141, 161, 181, 201,
3023
0
  };
3024
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
3025
0
  float bp[] = {
3026
0
    21, 24,
3027
0
    22, 25,
3028
0
    23, 26,
3029
0
    212, 242,
3030
0
    222, 252,
3031
0
    232, 262,
3032
0
  };
3033
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
3034
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
3035
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
3036
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
3037
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
3038
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
3039
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
3040
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
3041
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
3042
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
3043
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
3044
0
  float htp[] = {
3045
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
3046
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
3047
0
    10 * 212 + 20 * 222 + 30 * 232, 40 * 212 + 50 * 222 + 60 * 232, 70 * 212 + 80 * 222 + 90 * 232, 100 * 212 + 110 * 222 + 120 * 232,
3048
0
    10 * 242 + 20 * 252 + 30 * 262, 40 * 242 + 50 * 252 + 60 * 262, 70 * 242 + 80 * 252 + 90 * 262, 100 * 242 + 110 * 252 + 120 * 262,
3049
0
  };
3050
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
3051
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
3052
0
  float dbtp[] = {
3053
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
3054
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
3055
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
3056
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
3057
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
3058
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
3059
0
  };
3060
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
3061
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
3062
0
  ccv_nnc_tensor_free(g);
3063
0
  ccv_nnc_tensor_free(a);
3064
0
  ccv_nnc_tensor_free(b);
3065
0
  ccv_nnc_tensor_free(h);
3066
0
  ccv_nnc_tensor_free(db);
3067
0
  ccv_nnc_tensor_free(gg);
3068
0
  ccv_nnc_tensor_free(ga);
3069
0
  ccv_nnc_tensor_free(gb);
3070
0
  ccv_nnc_tensor_free(gh);
3071
0
  ccv_nnc_tensor_free(gdb);
3072
0
}
3073
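For this final variant the forward pass is C_n = A_n^T B_n^T with both operands batched, so the expected values expand as

\[
\partial A_{n,m,j} = \sum_k G_{n,j,k}\, B_{n,k,m}, \qquad
\partial B_{n,k,m} = \sum_j G_{n,j,k}\, A_{n,m,j},
\]

i.e. dA_n = B_n^T G_n^T (a 2x4 per batch item, the transpose of the gradient in the previous case) and dB_n = G_n^T A_n^T. dbtp therefore matches the transpose-b case, since a here holds the same data as before, just stored transposed.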
3074
#include "case_main.h"