Coverage Report

Created: 2026-04-14 19:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/cublas.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
TEST_CASE("gemm no transpose")
15
1
{
16
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
17
1
  float ap[] = {
18
1
    1, 2,
19
1
    3, 4,
20
1
    5, 6,
21
1
    7, 8,
22
1
  };
23
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
24
1
  float bp[] = {
25
1
    7, 8, 9,
26
1
    10, 11, 12,
27
1
  };
28
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
29
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
30
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
31
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
32
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
33
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
34
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
35
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
36
1
  float ctp[] = {
37
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
38
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
39
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
40
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
41
1
  };
42
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
43
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
44
1
  ccv_nnc_tensor_free(a);
45
1
  ccv_nnc_tensor_free(b);
46
1
  ccv_nnc_tensor_free(c);
47
1
  ccv_nnc_tensor_free(ga);
48
1
  ccv_nnc_tensor_free(gb);
49
1
  ccv_nnc_tensor_free(gc);
50
1
}
51
52
TEST_CASE("gemm transpose a")
53
1
{
54
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
55
1
  float ap[] = {
56
1
    1, 3, 5, 7,
57
1
    2, 4, 6, 8,
58
1
  };
59
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
60
1
  float bp[] = {
61
1
    7, 8, 9,
62
1
    10, 11, 12,
63
1
  };
64
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
65
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
66
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
67
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
68
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
69
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
70
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
71
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
72
1
  float ctp[] = {
73
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
74
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
75
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
76
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
77
1
  };
78
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
79
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
80
1
  ccv_nnc_tensor_free(a);
81
1
  ccv_nnc_tensor_free(b);
82
1
  ccv_nnc_tensor_free(c);
83
1
  ccv_nnc_tensor_free(ga);
84
1
  ccv_nnc_tensor_free(gb);
85
1
  ccv_nnc_tensor_free(gc);
86
1
}
87
88
TEST_CASE("gemm transpose b")
89
1
{
90
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
91
1
  float ap[] = {
92
1
    1, 2,
93
1
    3, 4,
94
1
    5, 6,
95
1
    7, 8,
96
1
  };
97
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
98
1
  float bp[] = {
99
1
    7, 10,
100
1
    8, 11,
101
1
    9, 12,
102
1
  };
103
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
104
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
105
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
106
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
107
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
108
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
109
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
110
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
111
1
  float ctp[] = {
112
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
113
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
114
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
115
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
116
1
  };
117
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
118
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
119
1
  ccv_nnc_tensor_free(a);
120
1
  ccv_nnc_tensor_free(b);
121
1
  ccv_nnc_tensor_free(c);
122
1
  ccv_nnc_tensor_free(ga);
123
1
  ccv_nnc_tensor_free(gb);
124
1
  ccv_nnc_tensor_free(gc);
125
1
}
126
127
TEST_CASE("gemm transpose a and b")
128
1
{
129
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
130
1
  float ap[] = {
131
1
    1, 3, 5, 7,
132
1
    2, 4, 6, 8,
133
1
  };
134
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
135
1
  float bp[] = {
136
1
    7, 10,
137
1
    8, 11,
138
1
    9, 12,
139
1
  };
140
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
141
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
142
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
143
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
144
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
145
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
146
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
147
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
148
1
  float ctp[] = {
149
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
150
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
151
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
152
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
153
1
  };
154
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
155
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
156
1
  ccv_nnc_tensor_free(a);
157
1
  ccv_nnc_tensor_free(b);
158
1
  ccv_nnc_tensor_free(c);
159
1
  ccv_nnc_tensor_free(ga);
160
1
  ccv_nnc_tensor_free(gb);
161
1
  ccv_nnc_tensor_free(gc);
162
1
}
163
164
TEST_CASE("gemm no transpose with bias")
165
1
{
166
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
167
1
  float ap[] = {
168
1
    1, 2,
169
1
    3, 4,
170
1
    5, 6,
171
1
    7, 8,
172
1
  };
173
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
174
1
  float bp[] = {
175
1
    7, 8, 9,
176
1
    10, 11, 12,
177
1
  };
178
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
179
1
  float dp[] = {
180
1
    1, -1, 1,
181
1
    1, -1, 1,
182
1
    1, -1, 1,
183
1
    1, -1, 1,
184
1
  };
185
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
186
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
187
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
188
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
189
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
190
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
191
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
192
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
193
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
194
1
  float ctp[] = {
195
1
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
196
1
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
197
1
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
198
1
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
199
1
  };
200
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
201
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
202
1
  ccv_nnc_tensor_free(a);
203
1
  ccv_nnc_tensor_free(b);
204
1
  ccv_nnc_tensor_free(c);
205
1
  ccv_nnc_tensor_free(d);
206
1
  ccv_nnc_tensor_free(ga);
207
1
  ccv_nnc_tensor_free(gb);
208
1
  ccv_nnc_tensor_free(gc);
209
1
  ccv_nnc_tensor_free(gd);
210
1
}
211
212
TEST_CASE("gemm no transpose with bias and palettize weights")
213
1
{
214
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
215
1
  float ap[] = {
216
1
    1, 2,
217
1
    3, 4,
218
1
    5, 6,
219
1
    7, 8,
220
1
  };
221
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
222
1
  float bp[] = {
223
1
    7, 8, 9,
224
1
    10, 11, 12,
225
1
  };
226
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
227
1
  float dp[] = {
228
1
    1, -1, 1,
229
1
    1, -1, 1,
230
1
    1, -1, 1,
231
1
    1, -1, 1,
232
1
  };
233
1
  ccv_nnc_tensor_t* const pb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NHWC(32F, 2, 3), 4, 128), 0);
234
1
  (void)ccv_nnc_palettize(b->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, 6, 4, 128, pb->data.u8, ccv_nnc_tensor_data_size_without_padding(pb->info));
235
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
236
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
237
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
238
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NHWC(000, 32F, 2, 3), 4, 128), 0);
239
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
240
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
241
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, pb, d), TENSOR_LIST(ga, gb, gd), 0);
242
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
243
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
244
1
  float ctp[] = {
245
1
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
246
1
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
247
1
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
248
1
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
249
1
  };
250
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
251
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
252
1
  ccv_nnc_tensor_free(a);
253
1
  ccv_nnc_tensor_free(b);
254
1
  ccv_nnc_tensor_free(pb);
255
1
  ccv_nnc_tensor_free(c);
256
1
  ccv_nnc_tensor_free(d);
257
1
  ccv_nnc_tensor_free(ga);
258
1
  ccv_nnc_tensor_free(gb);
259
1
  ccv_nnc_tensor_free(gc);
260
1
  ccv_nnc_tensor_free(gd);
261
1
}
262
263
TEST_CASE("gemm no transpose with bias and row-wise int8 weights")
264
1
{
265
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
266
1
  float ap[] = {
267
1
    1, 2,
268
1
    3, 4,
269
1
    5, 6,
270
1
    7, 8,
271
1
  };
272
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
273
1
  float bp[] = {
274
1
    7, 8, 9,
275
1
    10, 11, 12,
276
1
  };
277
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
278
1
  ccv_nnc_tensor_t* const qb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 2, 3)), 0);
279
1
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(b->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, 6, 3, qb->data.u8, ccv_nnc_tensor_data_size_without_padding(qb->info));
280
1
  ccv_nnc_tensor_t* const dq_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
281
1
  ccv_nnc_dequantize_8i_rowwise(qb->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, qsize, 3, dq_b->data.u8, 6);
282
1
  float dp[] = {
283
1
    1, -1, 1,
284
1
    1, -1, 1,
285
1
    1, -1, 1,
286
1
    1, -1, 1,
287
1
  };
288
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
289
1
  ccv_nnc_tensor_t* const ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
290
1
  ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD();
291
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
292
1
  assert(cmd.backend >= 0);
293
1
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, dq_b, d), TENSOR_LIST(ct), 0);
294
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
295
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
296
1
  ccv_nnc_tensor_t* gqb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, 2, 3)), 0);
297
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
298
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
299
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, qb, d), TENSOR_LIST(ga, gqb, gd), 0);
300
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gqb, gd), TENSOR_LIST(gc), 0);
301
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
302
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ct->data.f32, c->data.f32, 12, 1e-5, "result should match CPU with dequantized row-wise weights");
303
1
  ccv_nnc_tensor_free(a);
304
1
  ccv_nnc_tensor_free(b);
305
1
  ccv_nnc_tensor_free(qb);
306
1
  ccv_nnc_tensor_free(dq_b);
307
1
  ccv_nnc_tensor_free(d);
308
1
  ccv_nnc_tensor_free(ct);
309
1
  ccv_nnc_tensor_free(c);
310
1
  ccv_nnc_tensor_free(ga);
311
1
  ccv_nnc_tensor_free(gqb);
312
1
  ccv_nnc_tensor_free(gd);
313
1
  ccv_nnc_tensor_free(gc);
314
1
}
315
316
TEST_CASE("backward gemm with no transpose")
317
1
{
318
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
319
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
320
1
  float gp[] = {
321
1
    1, 2, 3,
322
1
    4, 5, 6,
323
1
    7, 8, 9,
324
1
    10, 11, 12,
325
1
  };
326
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
327
1
  float ap[] = {
328
1
    13, 14,
329
1
    15, 16,
330
1
    17, 18,
331
1
    19, 20,
332
1
  };
333
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
334
1
  float bp[] = {
335
1
    21, 22, 23,
336
1
    24, 25, 26,
337
1
  };
338
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
339
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
340
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
341
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
342
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
343
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
344
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
345
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
346
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
347
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
348
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
349
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
350
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
351
1
  float dbiastp[] = {
352
1
    22, 26, 30,
353
1
  };
354
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
355
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
356
1
  float htp[] = {
357
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
358
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
359
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
360
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
361
1
  };
362
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
363
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
364
1
  float dbtp[] = {
365
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
366
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
367
1
  };
368
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
369
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
370
1
  ccv_nnc_tensor_free(g);
371
1
  ccv_nnc_tensor_free(a);
372
1
  ccv_nnc_tensor_free(b);
373
1
  ccv_nnc_tensor_free(h);
374
1
  ccv_nnc_tensor_free(db);
375
1
  ccv_nnc_tensor_free(dbias);
376
1
  ccv_nnc_tensor_free(gg);
377
1
  ccv_nnc_tensor_free(ga);
378
1
  ccv_nnc_tensor_free(gb);
379
1
  ccv_nnc_tensor_free(gh);
380
1
  ccv_nnc_tensor_free(gdb);
381
1
  ccv_nnc_tensor_free(gdbias);
382
1
}
383
384
TEST_CASE("backward gemm with no transpose and palettize weights")
385
1
{
386
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
387
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
388
1
  float gp[] = {
389
1
    1, 2, 3,
390
1
    4, 5, 6,
391
1
    7, 8, 9,
392
1
    10, 11, 12,
393
1
  };
394
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
395
1
  float ap[] = {
396
1
    13, 14,
397
1
    15, 16,
398
1
    17, 18,
399
1
    19, 20,
400
1
  };
401
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
402
1
  float bp[] = {
403
1
    21, 22, 23,
404
1
    24, 25, 26,
405
1
  };
406
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
407
1
  ccv_nnc_tensor_t* const pb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NHWC(32F, 2, 3), 4, 128), 0);
408
1
  (void)ccv_nnc_palettize(b->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, 6, 4, 128, pb->data.u8, ccv_nnc_tensor_data_size_without_padding(pb->info));
409
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
410
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
411
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
412
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
413
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
414
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NHWC(000, 32F, 2, 3), 4, 128), 0);
415
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
416
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
417
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
418
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, pb), TENSOR_LIST(gg, ga, gb), 0);
419
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
420
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
421
1
  float dbiastp[] = {
422
1
    22, 26, 30,
423
1
  };
424
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
425
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
426
1
  float htp[] = {
427
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
428
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
429
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
430
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
431
1
  };
432
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
433
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
434
1
  float dbtp[] = {
435
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
436
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
437
1
  };
438
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
439
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
440
1
  ccv_nnc_tensor_free(g);
441
1
  ccv_nnc_tensor_free(a);
442
1
  ccv_nnc_tensor_free(b);
443
1
  ccv_nnc_tensor_free(pb);
444
1
  ccv_nnc_tensor_free(h);
445
1
  ccv_nnc_tensor_free(db);
446
1
  ccv_nnc_tensor_free(dbias);
447
1
  ccv_nnc_tensor_free(gg);
448
1
  ccv_nnc_tensor_free(ga);
449
1
  ccv_nnc_tensor_free(gb);
450
1
  ccv_nnc_tensor_free(gh);
451
1
  ccv_nnc_tensor_free(gdb);
452
1
  ccv_nnc_tensor_free(gdbias);
453
1
}
454
455
TEST_CASE("backward gemm with no transpose and row-wise int8 weights")
456
1
{
457
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
458
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
459
1
  float gp[] = {
460
1
    1, 2, 3,
461
1
    4, 5, 6,
462
1
    7, 8, 9,
463
1
    10, 11, 12,
464
1
  };
465
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
466
1
  float ap[] = {
467
1
    13, 14,
468
1
    15, 16,
469
1
    17, 18,
470
1
    19, 20,
471
1
  };
472
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
473
1
  float bp[] = {
474
1
    21, 22, 23,
475
1
    24, 25, 26,
476
1
  };
477
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
478
1
  ccv_nnc_tensor_t* const qb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 2, 3)), 0);
479
1
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(b->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, 6, 3, qb->data.u8, ccv_nnc_tensor_data_size_without_padding(qb->info));
480
1
  ccv_nnc_tensor_t* const dq_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
481
1
  ccv_nnc_dequantize_8i_rowwise(qb->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, qsize, 3, dq_b->data.u8, 6);
482
1
  ccv_nnc_tensor_t* const ht = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
483
1
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
484
1
  ccv_nnc_tensor_t* const dbiast = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
485
1
  ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
486
1
  cmd.backend = CCV_NNC_BACKEND_CPU_REF;
487
1
  assert(cmd.backend >= 0);
488
1
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, dq_b), TENSOR_LIST(ht, dbt, dbiast), 0);
489
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
490
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
491
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
492
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
493
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
494
1
  ccv_nnc_tensor_t* gqb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, 2, 3)), 0);
495
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
496
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
497
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
498
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, qb), TENSOR_LIST(gg, ga, gqb), 0);
499
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gqb), TENSOR_LIST(gh, gdb, gdbias), 0);
500
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
501
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ht->data.f32, h->data.f32, 8, 1e-5, "h should match CPU with dequantized row-wise weights");
502
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbt->data.f32, db->data.f32, 6, 1e-5, "db should match CPU with dequantized row-wise weights");
503
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbiast->data.f32, dbias->data.f32, 3, 1e-5, "dbias should match CPU with dequantized row-wise weights");
504
1
  ccv_nnc_tensor_free(g);
505
1
  ccv_nnc_tensor_free(a);
506
1
  ccv_nnc_tensor_free(b);
507
1
  ccv_nnc_tensor_free(qb);
508
1
  ccv_nnc_tensor_free(dq_b);
509
1
  ccv_nnc_tensor_free(ht);
510
1
  ccv_nnc_tensor_free(dbt);
511
1
  ccv_nnc_tensor_free(dbiast);
512
1
  ccv_nnc_tensor_free(h);
513
1
  ccv_nnc_tensor_free(db);
514
1
  ccv_nnc_tensor_free(dbias);
515
1
  ccv_nnc_tensor_free(gg);
516
1
  ccv_nnc_tensor_free(ga);
517
1
  ccv_nnc_tensor_free(gqb);
518
1
  ccv_nnc_tensor_free(gh);
519
1
  ccv_nnc_tensor_free(gdb);
520
1
  ccv_nnc_tensor_free(gdbias);
521
1
}
522
523
TEST_CASE("backward gemm with transpose a")
524
1
{
525
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
526
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
527
1
  float gp[] = {
528
1
    1, 2, 3,
529
1
    4, 5, 6,
530
1
    7, 8, 9,
531
1
    10, 11, 12,
532
1
  };
533
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
534
1
  float ap[] = {
535
1
    13, 15, 17, 19,
536
1
    14, 16, 18, 20,
537
1
  };
538
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
539
1
  float bp[] = {
540
1
    21, 22, 23,
541
1
    24, 25, 26,
542
1
  };
543
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
544
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
545
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
546
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
547
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
548
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
549
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
550
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
551
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
552
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
553
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
554
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
555
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
556
1
  float dbiastp[] = {
557
1
    22, 26, 30,
558
1
  };
559
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
560
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
561
1
  float htp[] = {
562
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
563
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
564
1
  };
565
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
566
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
567
1
  float dbtp[] = {
568
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
569
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
570
1
  };
571
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
572
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
573
1
  ccv_nnc_tensor_free(g);
574
1
  ccv_nnc_tensor_free(a);
575
1
  ccv_nnc_tensor_free(b);
576
1
  ccv_nnc_tensor_free(h);
577
1
  ccv_nnc_tensor_free(db);
578
1
  ccv_nnc_tensor_free(dbias);
579
1
  ccv_nnc_tensor_free(gg);
580
1
  ccv_nnc_tensor_free(ga);
581
1
  ccv_nnc_tensor_free(gb);
582
1
  ccv_nnc_tensor_free(gh);
583
1
  ccv_nnc_tensor_free(gdb);
584
1
  ccv_nnc_tensor_free(gdbias);
585
1
}
586
587
TEST_CASE("backward gemm with transpose b")
588
1
{
589
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
590
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
591
1
  float gp[] = {
592
1
    1, 2, 3,
593
1
    4, 5, 6,
594
1
    7, 8, 9,
595
1
    10, 11, 12,
596
1
  };
597
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
598
1
  float ap[] = {
599
1
    13, 14,
600
1
    15, 16,
601
1
    17, 18,
602
1
    19, 20,
603
1
  };
604
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
605
1
  float bp[] = {
606
1
    21, 24,
607
1
    22, 25,
608
1
    23, 26,
609
1
  };
610
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
611
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
612
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
613
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
614
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
615
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
616
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
617
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
618
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
619
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
620
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
621
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
622
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
623
1
  float dbiastp[] = {
624
1
    22, 26, 30,
625
1
  };
626
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
627
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
628
1
  float htp[] = {
629
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
630
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
631
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
632
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
633
1
  };
634
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
635
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
636
1
  float dbtp[] = {
637
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
638
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
639
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
640
1
  };
641
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
642
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
643
1
  ccv_nnc_tensor_free(g);
644
1
  ccv_nnc_tensor_free(a);
645
1
  ccv_nnc_tensor_free(b);
646
1
  ccv_nnc_tensor_free(h);
647
1
  ccv_nnc_tensor_free(db);
648
1
  ccv_nnc_tensor_free(dbias);
649
1
  ccv_nnc_tensor_free(gg);
650
1
  ccv_nnc_tensor_free(ga);
651
1
  ccv_nnc_tensor_free(gb);
652
1
  ccv_nnc_tensor_free(gh);
653
1
  ccv_nnc_tensor_free(gdb);
654
1
  ccv_nnc_tensor_free(gdbias);
655
1
}
656
657
TEST_CASE("backward gemm with transpose a and b")
658
1
{
659
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
660
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
661
1
  float gp[] = {
662
1
    1, 2, 3,
663
1
    4, 5, 6,
664
1
    7, 8, 9,
665
1
    10, 11, 12,
666
1
  };
667
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
668
1
  float ap[] = {
669
1
    13, 15, 17, 19,
670
1
    14, 16, 18, 20,
671
1
  };
672
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
673
1
  float bp[] = {
674
1
    21, 24,
675
1
    22, 25,
676
1
    23, 26,
677
1
  };
678
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
679
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
680
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
681
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
682
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
683
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
684
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
685
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
686
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
687
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
688
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
689
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
690
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
691
1
  float dbiastp[] = {
692
1
    22, 26, 30,
693
1
  };
694
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
695
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
696
1
  float htp[] = {
697
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
698
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
699
1
  };
700
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
701
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
702
1
  float dbtp[] = {
703
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
704
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
705
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
706
1
  };
707
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
708
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
709
1
  ccv_nnc_tensor_free(g);
710
1
  ccv_nnc_tensor_free(a);
711
1
  ccv_nnc_tensor_free(b);
712
1
  ccv_nnc_tensor_free(h);
713
1
  ccv_nnc_tensor_free(db);
714
1
  ccv_nnc_tensor_free(dbias);
715
1
  ccv_nnc_tensor_free(gg);
716
1
  ccv_nnc_tensor_free(ga);
717
1
  ccv_nnc_tensor_free(gb);
718
1
  ccv_nnc_tensor_free(gh);
719
1
  ccv_nnc_tensor_free(gdb);
720
1
  ccv_nnc_tensor_free(gdbias);
721
1
}
722
723
TEST_CASE("gemm no transpose batch 2")
724
1
{
725
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
726
1
  float ap[] = {
727
1
    1, 2,
728
1
    3, 4,
729
1
    5, 6,
730
1
    7, 8,
731
1
    2, 3,
732
1
    4, 5,
733
1
    6, 7,
734
1
    8, 9
735
1
  };
736
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
737
1
  float bp[] = {
738
1
    7, 8, 9,
739
1
    10, 11, 12,
740
1
  };
741
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
742
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
743
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
744
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
745
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
746
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
747
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
748
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
749
1
  float ctp[] = {
750
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
751
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
752
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
753
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
754
1
    2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
755
1
    4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
756
1
    6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
757
1
    8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
758
1
  };
759
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
760
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
761
1
  ccv_nnc_tensor_free(a);
762
1
  ccv_nnc_tensor_free(b);
763
1
  ccv_nnc_tensor_free(c);
764
1
  ccv_nnc_tensor_free(ga);
765
1
  ccv_nnc_tensor_free(gb);
766
1
  ccv_nnc_tensor_free(gc);
767
1
}
768
769
TEST_CASE("gemm transpose a batch 2")
770
1
{
771
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
772
1
  float ap[] = {
773
1
    1, 3, 5, 7,
774
1
    2, 4, 6, 8,
775
1
    2, 4, 6, 8,
776
1
    3, 5, 7, 9,
777
1
  };
778
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
779
1
  float bp[] = {
780
1
    7, 8, 9,
781
1
    10, 11, 12,
782
1
  };
783
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
784
1
  float dp[] = {
785
1
    -1, 0, 1,
786
1
  };
787
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 3), 0);
788
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
789
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
790
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
791
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
792
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
793
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
794
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
795
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
796
1
  float ctp[] = {
797
1
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
798
1
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
799
1
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
800
1
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
801
1
    2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
802
1
    4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
803
1
    6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
804
1
    8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
805
1
  };
806
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
807
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
808
1
  ccv_nnc_tensor_free(a);
809
1
  ccv_nnc_tensor_free(b);
810
1
  ccv_nnc_tensor_free(c);
811
1
  ccv_nnc_tensor_free(d);
812
1
  ccv_nnc_tensor_free(ga);
813
1
  ccv_nnc_tensor_free(gb);
814
1
  ccv_nnc_tensor_free(gc);
815
1
  ccv_nnc_tensor_free(gd);
816
1
}
817
818
TEST_CASE("gemm transpose b batch 2")
819
1
{
820
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
821
1
  float ap[] = {
822
1
    1, 2,
823
1
    3, 4,
824
1
    5, 6,
825
1
    7, 8,
826
1
    2, 3,
827
1
    4, 5,
828
1
    6, 7,
829
1
    8, 9
830
1
  };
831
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
832
1
  float bp[] = {
833
1
    7, 10,
834
1
    8, 11,
835
1
    9, 12,
836
1
    80, 110,
837
1
    90, 120,
838
1
    10, 13,
839
1
  };
840
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
841
1
  float dp[] = {
842
1
    -1, 0, 1,
843
1
    2, 3, -4,
844
1
  };
845
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
846
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
847
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
848
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
849
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
850
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
851
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
852
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
853
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
854
1
  float ctp[] = {
855
1
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
856
1
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
857
1
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
858
1
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
859
1
    2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
860
1
    4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
861
1
    6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
862
1
    8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
863
1
  };
864
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
865
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
866
1
  ccv_nnc_tensor_free(a);
867
1
  ccv_nnc_tensor_free(b);
868
1
  ccv_nnc_tensor_free(c);
869
1
  ccv_nnc_tensor_free(d);
870
1
  ccv_nnc_tensor_free(ga);
871
1
  ccv_nnc_tensor_free(gb);
872
1
  ccv_nnc_tensor_free(gc);
873
1
  ccv_nnc_tensor_free(gd);
874
1
}
875
876
TEST_CASE("backward gemm with no transpose batch 2, same b")
877
1
{
878
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
879
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
880
1
  float gp[] = {
881
1
    1, 2, 3,
882
1
    4, 5, 6,
883
1
    7, 8, 9,
884
1
    10, 11, 12,
885
1
    10, 20, 30,
886
1
    40, 50, 60,
887
1
    70, 80, 90,
888
1
    100, 110, 120,
889
1
  };
890
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
891
1
  float ap[] = {
892
1
    13, 14,
893
1
    15, 16,
894
1
    17, 18,
895
1
    19, 20,
896
1
    131, 141,
897
1
    151, 161,
898
1
    171, 181,
899
1
    191, 201,
900
1
  };
901
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
902
1
  float bp[] = {
903
1
    21, 22, 23,
904
1
    24, 25, 26,
905
1
  };
906
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
907
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
908
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
909
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
910
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
911
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
912
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
913
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
914
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
915
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
916
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
917
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
918
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
919
1
  float dbiastp[] = {
920
1
    22 + 220, 26 + 260, 30 + 300,
921
1
  };
922
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
923
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
924
1
  float htp[] = {
925
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
926
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
927
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
928
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
929
1
    10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
930
1
    40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
931
1
    70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
932
1
    100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
933
1
  };
934
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
935
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
936
1
  float dbtp[] = {
937
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
938
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
939
1
  };
940
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
941
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
942
1
  ccv_nnc_tensor_free(g);
943
1
  ccv_nnc_tensor_free(a);
944
1
  ccv_nnc_tensor_free(b);
945
1
  ccv_nnc_tensor_free(h);
946
1
  ccv_nnc_tensor_free(db);
947
1
  ccv_nnc_tensor_free(dbias);
948
1
  ccv_nnc_tensor_free(gg);
949
1
  ccv_nnc_tensor_free(ga);
950
1
  ccv_nnc_tensor_free(gb);
951
1
  ccv_nnc_tensor_free(gh);
952
1
  ccv_nnc_tensor_free(gdb);
953
1
  ccv_nnc_tensor_free(gdbias);
954
1
}
955
956
TEST_CASE("backward gemm with no transpose batch 2, batched b")
957
1
{
958
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
959
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
960
1
  float gp[] = {
961
1
    1, 2, 3,
962
1
    4, 5, 6,
963
1
    7, 8, 9,
964
1
    10, 11, 12,
965
1
    10, 20, 30,
966
1
    40, 50, 60,
967
1
    70, 80, 90,
968
1
    100, 110, 120,
969
1
  };
970
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
971
1
  float ap[] = {
972
1
    13, 14,
973
1
    15, 16,
974
1
    17, 18,
975
1
    19, 20,
976
1
    131, 141,
977
1
    151, 161,
978
1
    171, 181,
979
1
    191, 201,
980
1
  };
981
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
982
1
  float bp[] = {
983
1
    21, 22, 23,
984
1
    24, 25, 26,
985
1
    212, 222, 232,
986
1
    242, 252, 262,
987
1
  };
988
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
989
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
990
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
991
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
992
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
993
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
994
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
995
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
996
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
997
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
998
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
999
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
1000
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
1001
1
  float dbiastp[] = {
1002
1
    22, 26, 30,
1003
1
    220, 260, 300,
1004
1
  };
1005
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1006
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
1007
1
  float htp[] = {
1008
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
1009
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
1010
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
1011
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
1012
1
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
1013
1
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
1014
1
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
1015
1
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
1016
1
  };
1017
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1018
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
1019
1
  float dbtp[] = {
1020
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
1021
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
1022
1
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
1023
1
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
1024
1
  };
1025
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
1026
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
1027
1
  ccv_nnc_tensor_free(g);
1028
1
  ccv_nnc_tensor_free(a);
1029
1
  ccv_nnc_tensor_free(b);
1030
1
  ccv_nnc_tensor_free(h);
1031
1
  ccv_nnc_tensor_free(db);
1032
1
  ccv_nnc_tensor_free(dbias);
1033
1
  ccv_nnc_tensor_free(gg);
1034
1
  ccv_nnc_tensor_free(ga);
1035
1
  ccv_nnc_tensor_free(gb);
1036
1
  ccv_nnc_tensor_free(gh);
1037
1
  ccv_nnc_tensor_free(gdb);
1038
1
  ccv_nnc_tensor_free(gdbias);
1039
1
}
1040
1041
TEST_CASE("backward gemm with transpose a batch 2, same b")
1042
1
{
1043
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1044
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1045
1
  float gp[] = {
1046
1
    1, 2, 3,
1047
1
    4, 5, 6,
1048
1
    7, 8, 9,
1049
1
    10, 11, 12,
1050
1
    10, 20, 30,
1051
1
    40, 50, 60,
1052
1
    70, 80, 90,
1053
1
    100, 110, 120,
1054
1
  };
1055
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1056
1
  float ap[] = {
1057
1
    13, 15, 17, 19,
1058
1
    14, 16, 18, 20,
1059
1
    131, 151, 171, 191,
1060
1
    141, 161, 181, 201,
1061
1
  };
1062
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1063
1
  float bp[] = {
1064
1
    21, 22, 23,
1065
1
    24, 25, 26,
1066
1
  };
1067
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1068
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1069
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1070
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1071
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1072
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1073
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1074
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1075
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1076
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1077
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
1078
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
1079
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
1080
1
  float dbiastp[] = {
1081
1
    22 + 220, 26 + 260, 30 + 300,
1082
1
  };
1083
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
1084
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
1085
1
  float htp[] = {
1086
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
1087
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
1088
1
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
1089
1
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
1090
1
  };
1091
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1092
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
1093
1
  float dbtp[] = {
1094
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
1095
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
1096
1
  };
1097
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1098
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
1099
1
  ccv_nnc_tensor_free(g);
1100
1
  ccv_nnc_tensor_free(a);
1101
1
  ccv_nnc_tensor_free(b);
1102
1
  ccv_nnc_tensor_free(h);
1103
1
  ccv_nnc_tensor_free(db);
1104
1
  ccv_nnc_tensor_free(dbias);
1105
1
  ccv_nnc_tensor_free(gg);
1106
1
  ccv_nnc_tensor_free(ga);
1107
1
  ccv_nnc_tensor_free(gb);
1108
1
  ccv_nnc_tensor_free(gh);
1109
1
  ccv_nnc_tensor_free(gdb);
1110
1
  ccv_nnc_tensor_free(gdbias);
1111
1
}
1112
1113
TEST_CASE("backward gemm with transpose b batch 2, batched b")
1114
1
{
1115
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1116
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1117
1
  float gp[] = {
1118
1
    1, 2, 3,
1119
1
    4, 5, 6,
1120
1
    7, 8, 9,
1121
1
    10, 11, 12,
1122
1
    10, 20, 30,
1123
1
    40, 50, 60,
1124
1
    70, 80, 90,
1125
1
    100, 110, 120,
1126
1
  };
1127
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1128
1
  float ap[] = {
1129
1
    13, 14,
1130
1
    15, 16,
1131
1
    17, 18,
1132
1
    19, 20,
1133
1
    131, 141,
1134
1
    151, 161,
1135
1
    171, 181,
1136
1
    191, 201,
1137
1
  };
1138
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1139
1
  float bp[] = {
1140
1
    21, 24,
1141
1
    22, 25,
1142
1
    23, 26,
1143
1
    212, 242,
1144
1
    222, 252,
1145
1
    232, 262,
1146
1
  };
1147
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1148
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1149
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1150
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1151
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1152
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1153
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1154
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1155
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1156
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
1157
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
1158
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
1159
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
1160
1
  float dbiastp[] = {
1161
1
    22, 26, 30,
1162
1
    220, 260, 300,
1163
1
  };
1164
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1165
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
1166
1
  float htp[] = {
1167
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
1168
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
1169
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
1170
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
1171
1
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
1172
1
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
1173
1
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
1174
1
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
1175
1
  };
1176
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1177
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
1178
1
  float dbtp[] = {
1179
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
1180
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
1181
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
1182
1
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
1183
1
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
1184
1
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
1185
1
  };
1186
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1187
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
1188
1
  ccv_nnc_tensor_free(g);
1189
1
  ccv_nnc_tensor_free(a);
1190
1
  ccv_nnc_tensor_free(b);
1191
1
  ccv_nnc_tensor_free(h);
1192
1
  ccv_nnc_tensor_free(db);
1193
1
  ccv_nnc_tensor_free(dbias);
1194
1
  ccv_nnc_tensor_free(gg);
1195
1
  ccv_nnc_tensor_free(ga);
1196
1
  ccv_nnc_tensor_free(gb);
1197
1
  ccv_nnc_tensor_free(gh);
1198
1
  ccv_nnc_tensor_free(gdb);
1199
1
  ccv_nnc_tensor_free(gdbias);
1200
1
}
1201
1202
TEST_CASE("backward gemm with transpose a and b batch 2, same b")
1203
1
{
1204
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1205
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1206
1
  float gp[] = {
1207
1
    1, 2, 3,
1208
1
    4, 5, 6,
1209
1
    7, 8, 9,
1210
1
    10, 11, 12,
1211
1
    10, 20, 30,
1212
1
    40, 50, 60,
1213
1
    70, 80, 90,
1214
1
    100, 110, 120,
1215
1
  };
1216
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1217
1
  float ap[] = {
1218
1
    13, 15, 17, 19,
1219
1
    14, 16, 18, 20,
1220
1
    131, 151, 171, 191,
1221
1
    141, 161, 181, 201,
1222
1
  };
1223
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1224
1
  float bp[] = {
1225
1
    21, 24,
1226
1
    22, 25,
1227
1
    23, 26,
1228
1
  };
1229
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1230
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1231
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1232
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1233
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1234
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1235
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
1236
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1237
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
1238
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1239
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
1240
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
1241
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
1242
1
  float dbiastp[] = {
1243
1
    22 + 220, 26 + 260, 30 + 300,
1244
1
  };
1245
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
1246
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
1247
1
  float htp[] = {
1248
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
1249
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
1250
1
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
1251
1
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
1252
1
  };
1253
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1254
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
1255
1
  float dbtp[] = {
1256
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
1257
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
1258
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
1259
1
  };
1260
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1261
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
1262
1
  ccv_nnc_tensor_free(g);
1263
1
  ccv_nnc_tensor_free(a);
1264
1
  ccv_nnc_tensor_free(b);
1265
1
  ccv_nnc_tensor_free(h);
1266
1
  ccv_nnc_tensor_free(db);
1267
1
  ccv_nnc_tensor_free(dbias);
1268
1
  ccv_nnc_tensor_free(gg);
1269
1
  ccv_nnc_tensor_free(ga);
1270
1
  ccv_nnc_tensor_free(gb);
1271
1
  ccv_nnc_tensor_free(gh);
1272
1
  ccv_nnc_tensor_free(gdb);
1273
1
  ccv_nnc_tensor_free(gdbias);
1274
1
}
1275
1276
// Biased fp32 GEMM: a 10x128 input is multiplied with a transposed 64x128
// weight, once on CPU tensors and once on GPU tensors. The complete 10x64
// outputs must agree, so every row of the batched GPU result is verified
// rather than only the first one.
TEST_CASE("cublas forward gemm")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0); // fixed seed keeps the inputs reproducible
  // GPU-side operands and output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);

  // CPU-side operands and the full-size reference output.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  int i;
  // Weights are scaled down by the weight element count to keep the sums small.
  for (i = 0; i < 64 * 128; i++)
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (i = 0; i < 64; i++)
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 10 * 128; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the very same inputs that feed the CPU reference.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  // Reference result from CPU tensors, covering all 10 rows.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  // Result under test from GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  // Bring the GPU output back so it can be compared on the host.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hb);
}
1321
1322
// Biased GEMM in fp16 on GPU tensors. Inputs are generated in fp32, converted
// to half precision for the GPU run, and the first 64 output values (the first
// output row) are compared within 1e-3 against an fp32 result computed from
// CPU tensors for the first input row only.
TEST_CASE("cublas forward gemm in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // Half-precision GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);

  // fp32 CPU tensors for the single-row reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  for (k = 0; k < 64; k++)
  {
    ref_bias->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  // Full 10-row input in fp32; its first row doubles as the reference input.
  ccv_nnc_tensor_t* const full_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  for (k = 0; k < 10 * 128; k++)
  {
    full_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = full_a->data.f32[k];
  }
  // Convert to fp16 on the host, then upload.
  ccv_nnc_tensor_t* const half_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* const half_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* const half_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(full_a, ref_w, ref_bias), TENSOR_LIST(half_a, half_w, half_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_a, half_w, half_bias), TENSOR_LIST(dev_a, dev_w, dev_bias), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w, ref_bias), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w, dev_bias), TENSOR_LIST(dev_b), 0);
  // Download the fp16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_half = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_half), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_half), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_half);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(full_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(half_a);
  ccv_nnc_tensor_free(half_w);
  ccv_nnc_tensor_free(half_bias);
}
1373
1374
// Biased GEMM in bfloat16 on GPU tensors. Inputs are generated in fp32,
// converted to 16BF for the GPU run, and the first 64 output values (the first
// output row) are compared within 1e-2 against an fp32 result computed from
// CPU tensors for the first input row only.
TEST_CASE("cublas forward gemm in bfloat precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // bfloat16 GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);

  // fp32 CPU tensors for the single-row reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  for (k = 0; k < 64; k++)
  {
    ref_bias->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  // Full 10-row input in fp32; its first row doubles as the reference input.
  ccv_nnc_tensor_t* const full_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  for (k = 0; k < 10 * 128; k++)
  {
    full_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = full_a->data.f32[k];
  }
  // Convert to bfloat16 on the host, then upload.
  ccv_nnc_tensor_t* const bf_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
  ccv_nnc_tensor_t* const bf_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const bf_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(full_a, ref_w, ref_bias), TENSOR_LIST(bf_a, bf_w, bf_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bf_a, bf_w, bf_bias), TENSOR_LIST(dev_a, dev_w, dev_bias), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w, ref_bias), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w, dev_bias), TENSOR_LIST(dev_b), 0);
  // Download the bfloat16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_bf = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_bf), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_bf), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-2, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_bf);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(full_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(bf_a);
  ccv_nnc_tensor_free(bf_w);
  ccv_nnc_tensor_free(bf_bias);
}
1425
1426
// Matrix-vector case in fp16 with bias: a 1x128 input against a transposed
// 64x128 weight. The GPU fp16 result is widened to fp32 and compared within
// 1e-3 against an fp32 result computed from CPU tensors.
TEST_CASE("cublas forward gemv in half precision, variant 1")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // Half-precision GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);

  // fp32 CPU tensors for the reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  for (k = 0; k < 64; k++)
  {
    ref_bias->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  // Random fp32 input vector, mirrored into the reference input.
  ccv_nnc_tensor_t* const src_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  for (k = 0; k < 128; k++)
  {
    src_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = src_a->data.f32[k];
  }
  // Convert to fp16 on the host, then upload.
  ccv_nnc_tensor_t* const half_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
  ccv_nnc_tensor_t* const half_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* const half_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src_a, ref_w, ref_bias), TENSOR_LIST(half_a, half_w, half_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_a, half_w, half_bias), TENSOR_LIST(dev_a, dev_w, dev_bias), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w, ref_bias), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w, dev_bias), TENSOR_LIST(dev_b), 0);
  // Download the fp16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_half = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_half), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_half), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_half);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(src_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(half_a);
  ccv_nnc_tensor_free(half_w);
  ccv_nnc_tensor_free(half_bias);
}
1477
1478
// Matrix-vector case in bfloat16 with bias: a 1x128 input against a transposed
// 64x128 weight. The GPU 16BF result is widened to fp32 and compared within
// 1e-2 against an fp32 result computed from CPU tensors.
TEST_CASE("cublas forward gemv in bfloat precision, variant 1")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // bfloat16 GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 64), 0);

  // fp32 CPU tensors for the reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  for (k = 0; k < 64; k++)
  {
    ref_bias->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  // Random fp32 input vector, mirrored into the reference input.
  ccv_nnc_tensor_t* const src_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  for (k = 0; k < 128; k++)
  {
    src_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = src_a->data.f32[k];
  }
  // Convert to bfloat16 on the host, then upload.
  ccv_nnc_tensor_t* const bf_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 128), 0);
  ccv_nnc_tensor_t* const bf_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const bf_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src_a, ref_w, ref_bias), TENSOR_LIST(bf_a, bf_w, bf_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bf_a, bf_w, bf_bias), TENSOR_LIST(dev_a, dev_w, dev_bias), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w, ref_bias), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w, dev_bias), TENSOR_LIST(dev_b), 0);
  // Download the bfloat16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_bf = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_bf), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_bf), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-2, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_bf);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(src_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(bf_a);
  ccv_nnc_tensor_free(bf_w);
  ccv_nnc_tensor_free(bf_bias);
}
1529
1530
// fp32 GEMM without a bias term: a 10x128 input against a transposed 64x128
// weight on GPU tensors. Only the first output row is checked, against a
// result computed from CPU tensors for the first input row.
TEST_CASE("cublas forward gemm no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);

  // CPU tensors for the single-row reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  // Full 10-row input; its first row doubles as the reference input.
  ccv_nnc_tensor_t* const full_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  for (k = 0; k < 10 * 128; k++)
  {
    full_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = full_a->data.f32[k];
  }
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(full_a, ref_w), TENSOR_LIST(dev_a, dev_w), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w), TENSOR_LIST(dev_b), 0);
  // Download the whole GPU output, then carve out its first row.
  ccv_nnc_tensor_t* const got_all = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_all), 0);
  ccv_nnc_tensor_t* const got_row = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  for (k = 0; k < 64; k++)
  {
    got_row->data.f32[k] = got_all->data.f32[k];
  }
  REQUIRE_TENSOR_EQ(got_row, ref_b, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_all);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(full_a);
  ccv_nnc_tensor_free(got_row);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_b);
}
1569
1570
// fp16 GEMM without a bias term on GPU tensors. Inputs are generated in fp32
// and converted to half precision for the GPU run; the first 64 output values
// (the first output row) are compared within 1e-3 against an fp32 result
// computed from CPU tensors for the first input row only.
TEST_CASE("cublas forward gemm no bias in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // Half-precision GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);

  // fp32 CPU tensors for the single-row reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  // Full 10-row input in fp32; its first row doubles as the reference input.
  ccv_nnc_tensor_t* const full_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  for (k = 0; k < 10 * 128; k++)
  {
    full_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = full_a->data.f32[k];
  }
  // Convert to fp16 on the host, then upload.
  ccv_nnc_tensor_t* const half_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* const half_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(full_a, ref_w), TENSOR_LIST(half_a, half_w), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_a, half_w), TENSOR_LIST(dev_a, dev_w), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w), TENSOR_LIST(dev_b), 0);
  // Download the fp16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_half = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_half), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_half), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_half);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(full_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(half_a);
  ccv_nnc_tensor_free(half_w);
}
1613
1614
// Matrix-vector case in fp16 without bias, vector on the left: a 1x128 input
// against a transposed 64x128 weight. The GPU fp16 result is widened to fp32
// and compared within 1e-3 against an fp32 result computed from CPU tensors.
TEST_CASE("cublas forward gemv in half precision no bias, variant 1")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // Half-precision GPU operands and output.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);

  // fp32 CPU tensors for the reference.
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  // Random fp32 input vector, mirrored into the reference input.
  ccv_nnc_tensor_t* const src_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  for (k = 0; k < 128; k++)
  {
    src_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = src_a->data.f32[k];
  }
  // Convert to fp16 on the host, then upload.
  ccv_nnc_tensor_t* const half_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
  ccv_nnc_tensor_t* const half_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src_a, ref_w), TENSOR_LIST(half_a, half_w), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_a, half_w), TENSOR_LIST(dev_a, dev_w), 0);
  // Reference first, then the GPU run under test.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_a, ref_w), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_w), TENSOR_LIST(dev_b), 0);
  // Download the fp16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_half = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_half), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_half), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_half);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(src_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(half_a);
  ccv_nnc_tensor_free(half_w);
}
1657
1658
// Matrix-vector case in fp16 without bias, vector on the right: a 64x128
// matrix times a 128x1 column with neither operand transposed. The GPU fp16
// result is widened to fp32 and compared within 1e-3 against an fp32 result
// computed from CPU tensors.
TEST_CASE("cublas forward gemv in half precision no bias, variant 2")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // Half-precision GPU operands and output.
  ccv_nnc_tensor_t* const dev_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 128, 1), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 1), 0);

  // fp32 CPU tensors for the reference.
  ccv_nnc_tensor_t* const ref_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
  ccv_nnc_tensor_t* const ref_b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
  {
    ref_w->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  }
  // Random fp32 column vector, mirrored into the reference input.
  ccv_nnc_tensor_t* const src_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
  for (k = 0; k < 128; k++)
  {
    src_a->data.f32[k] = dsfmt_genrand_open_close(&rng);
  }
  for (k = 0; k < 128; k++)
  {
    ref_a->data.f32[k] = src_a->data.f32[k];
  }
  // Convert to fp16 on the host, then upload.
  ccv_nnc_tensor_t* const half_w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* const half_a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src_a, ref_w), TENSOR_LIST(half_a, half_w), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_a, half_w), TENSOR_LIST(dev_a, dev_w), 0);
  // Reference first, then the GPU run under test; the matrix is the left operand.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_w, ref_a), TENSOR_LIST(ref_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_w, dev_a), TENSOR_LIST(dev_b), 0);
  // Download the fp16 output and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const got_half = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_b), TENSOR_LIST(got_half), 0);
  ccv_nnc_tensor_t* const got = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(got_half), TENSOR_LIST(got), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, got->data.f32, ref_b->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_w);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(got_half);
  ccv_nnc_tensor_free(ref_a);
  ccv_nnc_tensor_free(src_a);
  ccv_nnc_tensor_free(got);
  ccv_nnc_tensor_free(ref_w);
  ccv_nnc_tensor_free(ref_b);
  ccv_nnc_tensor_free(half_a);
  ccv_nnc_tensor_free(half_w);
}
1701
1702
TEST_CASE("cublas backward gemm")
1703
1
{
1704
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1705
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1706
1
  dsfmt_t dsfmt;
1707
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1708
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1709
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1710
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1711
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1712
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1713
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1714
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1715
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1716
1717
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1718
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1719
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1720
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1721
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1722
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1723
1
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1724
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1725
1
  int i;
1726
8.19k
  for (i = 0; i < 64 * 128; 
i++8.19k
)
1727
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1728
65
  for (i = 0; i < 64; 
i++64
)
1729
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1730
1.28k
  for (i = 0; i < 10 * 128; 
i++1.28k
)
1731
1.28k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1732
641
  for (i = 0; i < 10 * 64; 
i++640
)
1733
640
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1734
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
1735
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1736
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
1737
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1738
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
1739
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1740
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1741
1
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1742
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1743
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
1744
1
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
1745
1
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
1746
1
  REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
1747
1
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
1748
1
  ccv_nnc_tensor_free(a);
1749
1
  ccv_nnc_tensor_free(w);
1750
1
  ccv_nnc_tensor_free(bias);
1751
1
  ccv_nnc_tensor_free(b);
1752
1
  ccv_nnc_tensor_free(g);
1753
1
  ccv_nnc_tensor_free(dw);
1754
1
  ccv_nnc_tensor_free(dbias);
1755
1
  ccv_nnc_tensor_free(h);
1756
1
  ccv_nnc_tensor_free(ha);
1757
1
  ccv_nnc_tensor_free(hw);
1758
1
  ccv_nnc_tensor_free(hbias);
1759
1
  ccv_nnc_tensor_free(hb);
1760
1
  ccv_nnc_tensor_free(hg);
1761
1
  ccv_nnc_tensor_free(hdw);
1762
1
  ccv_nnc_tensor_free(hdbias);
1763
1
  ccv_nnc_tensor_free(hh);
1764
1
  ccv_nnc_tensor_free(tb);
1765
1
  ccv_nnc_tensor_free(th);
1766
1
  ccv_nnc_tensor_free(tdw);
1767
1
  ccv_nnc_tensor_free(tdbias);
1768
1
}
1769
1770
// Half-precision GEMM forward + backward with bias. The GPU side runs on 16F
// tensors, the CPU reference runs on 32F tensors; GPU results are converted back
// to 32F and compared against the reference element-wise with a tolerance.
TEST_CASE("cublas backward gemm in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  // The same two commands are executed once on CPU tensors and once on GPU tensors.
  const ccv_nnc_cmd_t forward = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
  const ccv_nnc_cmd_t backward = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));

  // GPU tensors, all 16F.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);

  // CPU reference tensors, all 32F.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Random inputs. The generator is drawn in the order w, bias, a, g; that order
  // decides which values land in which tensor, so it must not change.
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const bias_vals = hbias->data.f32;
  for (int k = 0; k < 64; k++)
    bias_vals[k] = dsfmt_genrand_open_close(&rng);
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);
  float* const g_vals = hg->data.f32;
  for (int k = 0; k < 10 * 64; k++)
    g_vals[k] = dsfmt_genrand_open_close(&rng);

  // Stage the inputs as 16F on the CPU, then copy the staged tensors to the GPU.
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
  ccv_nnc_tensor_t* hg2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(ha2, hw2, hbias2, hg2), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2, hg2), TENSOR_LIST(a, w, bias, g), 0);

  // Reference run on the 32F CPU tensors.
  ccv_nnc_cmd_exec(forward, ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
  // Same commands on the 16F GPU tensors.
  ccv_nnc_cmd_exec(forward, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);

  // Copy the GPU results back as 16F, then widen them to 32F for the comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* th1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb, tdw, tdbias, th), TENSOR_LIST(tb1, tdw1, tdbias1, th1), 0);

  // The weight and bias gradients are checked with a looser tolerance (1e-2)
  // than the forward output and the input gradient (1e-3).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 10 * 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw1->data.f32, hdw->data.f32, 64 * 128, 1e-2, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias1->data.f32, hdbias->data.f32, 64, 1e-2, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th1->data.f32, hh->data.f32, 10 * 128, 1e-3, "GPU computed output should be the same as CPU computed ones");

  // Release every tensor allocated above.
  ccv_nnc_tensor_t* const owned[] = {
    a, w, bias, b, g, dw, dbias, h,
    ha, hw, hbias, hb, hg, hdw, hdbias, hh,
    tb, th, tdw, tdbias,
    ha2, hw2, hbias2, hg2,
    tb1, tdw1, tdbias1, th1,
  };
  const int owned_count = (int)(sizeof(owned) / sizeof(owned[0]));
  for (int k = 0; k < owned_count; k++)
    ccv_nnc_tensor_free(owned[k]);
}
1855
1856
// GEMM forward + backward without a bias term, all tensors 32F. The same
// commands run on CPU tensors and on GPU tensors, and the GPU results are
// required to equal the CPU ones.
TEST_CASE("cublas backward gemm no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t forward = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
  const ccv_nnc_cmd_t backward = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));

  // GPU tensors.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // CPU reference tensors.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Random inputs, drawn in the order w, a, g (the order fixes the values).
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);
  float* const g_vals = hg->data.f32;
  for (int k = 0; k < 10 * 64; k++)
    g_vals[k] = dsfmt_genrand_open_close(&rng);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);

  // Reference run on the CPU tensors; the third backward output (bias gradient) is omitted.
  ccv_nnc_cmd_exec(forward, ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
  // Same commands on the GPU tensors.
  ccv_nnc_cmd_exec(forward, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);

  // Copy the GPU results back to the CPU and compare with the reference.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");

  // Release every tensor allocated above.
  ccv_nnc_tensor_t* const owned[] = {
    a, w, b, g, dw, h,
    ha, hw, hb, hg, hdw, hh,
    tb, th, tdw,
  };
  const int owned_count = (int)(sizeof(owned) / sizeof(owned[0]));
  for (int k = 0; k < owned_count; k++)
    ccv_nnc_tensor_free(owned[k]);
}
1910
1911
// Half-precision GEMM forward + backward without a bias term. The GPU side runs
// on 16F tensors, the CPU reference on 32F tensors; GPU results are converted
// back to 32F and compared against the reference with a tolerance.
TEST_CASE("cublas backward gemm no bias in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t forward = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
  const ccv_nnc_cmd_t backward = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));

  // GPU tensors, all 16F.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);

  // CPU reference tensors, all 32F.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Random inputs, drawn in the order w, a, g (the order fixes the values).
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);
  float* const g_vals = hg->data.f32;
  for (int k = 0; k < 10 * 64; k++)
    g_vals[k] = dsfmt_genrand_open_close(&rng);

  // Stage the inputs as 16F on the CPU, then copy the staged tensors to the GPU.
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* hg2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(ha2, hw2, hg2), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hg2), TENSOR_LIST(a, w, g), 0);

  // Reference run on the 32F CPU tensors; the bias gradient output is omitted.
  ccv_nnc_cmd_exec(forward, ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
  // Same commands on the 16F GPU tensors.
  ccv_nnc_cmd_exec(forward, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);

  // Copy the GPU results back as 16F, then widen them to 32F for the comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* th1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb, tdw, th), TENSOR_LIST(tb1, tdw1, th1), 0);

  // The weight gradient is checked with a looser tolerance (1e-2) than the
  // forward output and the input gradient (1e-3).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 10 * 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw1->data.f32, hdw->data.f32, 64 * 128, 1e-2, "GPU computed output should be the same as CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th1->data.f32, hh->data.f32, 10 * 128, 1e-3, "GPU computed output should be the same as CPU computed ones");

  // Release every tensor allocated above.
  ccv_nnc_tensor_t* const owned[] = {
    a, w, b, g, dw, h,
    ha, hw, hb, hg, hdw, hh,
    tb, th, tdw,
    ha2, hw2, hg2,
    tb1, tdw1, th1,
  };
  const int owned_count = (int)(sizeof(owned) / sizeof(owned[0]));
  for (int k = 0; k < owned_count; k++)
    ccv_nnc_tensor_free(owned[k]);
}
1979
1980
// Runs the same batched GEMM on the GPU in two ways and requires equal results:
//  - on at / wt, which are produced from a / w by an explicit transpose command;
//  - on av / wv, tensor views created from a / w and declared with the
//    transposed shape plus a DIM_ALLOC(...) layout argument.
TEST_CASE("cublas handle permute")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t swap_01 = CMD_TRANSPOSE_FORWARD(0, 1);
  const ccv_nnc_cmd_t gemm = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2));

  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 2, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 2, 128), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 2, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 2, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);

  // Destinations for the explicitly transposed inputs and their GEMM result.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);

  // Random inputs on the CPU, drawn in the order w then a, then copied to the GPU.
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 2 * 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 2 * 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);

  // Path 1: transpose on the GPU, then GEMM on the transposed tensors.
  ccv_nnc_cmd_exec(swap_01, ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(swap_01, ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);

  // Path 2: GEMM directly on views of the untransposed tensors.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);

  // Bring both results back to the CPU and compare them.
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
  ccv_nnc_tensor_t* hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, bt), TENSOR_LIST(hb, hbt), 0);
  REQUIRE_TENSOR_EQ(hb, hbt, "permute computed output should be the same as non-permute computed ones");

  // Cleanup, in the original release order (views sit between the two groups).
  ccv_nnc_tensor_t* const first_group[] = { ha, hw, a, w, b };
  const int first_count = (int)(sizeof(first_group) / sizeof(first_group[0]));
  for (int k = 0; k < first_count; k++)
    ccv_nnc_tensor_free(first_group[k]);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_t* const second_group[] = { at, wt, bt, hb, hbt };
  const int second_count = (int)(sizeof(second_group) / sizeof(second_group[0]));
  for (int k = 0; k < second_count; k++)
    ccv_nnc_tensor_free(second_group[k]);
}
2023
2024
// Batched GEMM in which every dimension except the last two acts as a batch
// dimension (here 2 x 4). The GPU computes on views of a / w declared with the
// permuted shape; the CPU computes on explicitly transposed copies at / wt.
// Both results must be equal.
TEST_CASE("generalized batched gemm with batch (2, 4) compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t swap_12 = CMD_TRANSPOSE_FORWARD(1, 2);
  const ccv_nnc_cmd_t gemm = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3));

  // CPU inputs and the CPU landing tensor for the GPU result.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // GPU counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference: transposed inputs and their GEMM result.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Random inputs, drawn in the order w then a.
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 8 * 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 8 * 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);

  // Build the transposed CPU copies, then ship the untransposed inputs to the GPU.
  ccv_nnc_cmd_exec(swap_12, ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(swap_12, ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);

  // Views of the GPU inputs declared with the permuted shape.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));

  // GPU result from the views, CPU result from the transposed copies.
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Cleanup, in the original release order (views sit between the two groups).
  ccv_nnc_tensor_t* const first_group[] = { ha, hw, hb, a, w, b };
  const int first_count = (int)(sizeof(first_group) / sizeof(first_group[0]));
  for (int k = 0; k < first_count; k++)
    ccv_nnc_tensor_free(first_group[k]);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_t* const second_group[] = { at, wt, bt };
  const int second_count = (int)(sizeof(second_group) / sizeof(second_group[0]));
  for (int k = 0; k < second_count; k++)
    ccv_nnc_tensor_free(second_group[k]);
}
2066
2067
// Batched GEMM in which every dimension except the last two acts as a batch
// dimension (here 2 x 4), with a single 2-D weight (64 x 128) used against the
// 4-D input. The GPU computes on a view of a declared with the permuted shape;
// the CPU computes on the explicitly transposed copy at. Both results must be equal.
TEST_CASE("generalized batched gemm with batch (2, 4) and broadcast compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t gemm = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));

  // CPU inputs and the CPU landing tensor for the GPU result.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // GPU counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference: transposed input and its GEMM result.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Random inputs, drawn in the order w then a.
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 8 * 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);

  // Build the transposed CPU copy, then ship the untransposed inputs to the GPU.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);

  // View of the GPU input declared with the permuted shape.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));

  // GPU result from the view, CPU result from the transposed copy.
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Cleanup, in the original release order (the view sits between the two groups).
  ccv_nnc_tensor_t* const first_group[] = { ha, hw, hb, a, w, b };
  const int first_count = (int)(sizeof(first_group) / sizeof(first_group[0]));
  for (int k = 0; k < first_count; k++)
    ccv_nnc_tensor_free(first_group[k]);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(bt);
}
2104
2105
// Batched GEMM with a bias term, in which every dimension except the last two
// acts as a batch dimension (here 2 x 4). The GPU computes on views of a / w
// declared with the permuted shape; the CPU computes on explicitly transposed
// copies at / wt. Both results must be equal.
TEST_CASE("generalized batched gemm with batch (2, 4) with bias compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t swap_12 = CMD_TRANSPOSE_FORWARD(1, 2);
  const ccv_nnc_cmd_t gemm = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3));

  // CPU inputs and the CPU landing tensor for the GPU result.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // GPU counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference: transposed inputs and their GEMM result.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Random inputs, drawn in the order w, bias, a.
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 8 * 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const bias_vals = hbias->data.f32;
  for (int k = 0; k < 64; k++)
    bias_vals[k] = dsfmt_genrand_open_close(&rng) / 64;
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 8 * 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);

  // Build the transposed CPU copies, then ship the untransposed inputs to the GPU.
  ccv_nnc_cmd_exec(swap_12, ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(swap_12, ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);

  // Views of the GPU inputs declared with the permuted shape.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));

  // GPU result from the views, CPU result from the transposed copies.
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt, hbias), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Cleanup, in the original release order (views sit between the two groups).
  ccv_nnc_tensor_t* const first_group[] = { ha, hw, hbias, hb, a, w, bias, b };
  const int first_count = (int)(sizeof(first_group) / sizeof(first_group[0]));
  for (int k = 0; k < first_count; k++)
    ccv_nnc_tensor_free(first_group[k]);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_t* const second_group[] = { at, wt, bt };
  const int second_count = (int)(sizeof(second_group) / sizeof(second_group[0]));
  for (int k = 0; k < second_count; k++)
    ccv_nnc_tensor_free(second_group[k]);
}
2153
2154
// Batched GEMM with a bias term, in which every dimension except the last two
// acts as a batch dimension (here 2 x 4), with a single 2-D weight (64 x 128)
// used against the 4-D input. The GPU computes on a view of a declared with the
// permuted shape; the CPU computes on the explicitly transposed copy at. Both
// results must be equal.
TEST_CASE("generalized batched gemm with batch (2, 4) with bias and broadcast compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const ccv_nnc_cmd_t gemm = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));

  // CPU inputs and the CPU landing tensor for the GPU result.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // GPU counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference: transposed input and its GEMM result.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Random inputs, drawn in the order w, bias, a.
  float* const w_vals = hw->data.f32;
  for (int k = 0; k < 64 * 128; k++)
    w_vals[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  float* const bias_vals = hbias->data.f32;
  for (int k = 0; k < 64; k++)
    bias_vals[k] = dsfmt_genrand_open_close(&rng) / 64;
  float* const a_vals = ha->data.f32;
  for (int k = 0; k < 8 * 10 * 128; k++)
    a_vals[k] = dsfmt_genrand_open_close(&rng);

  // Build the transposed CPU copy, then ship the untransposed inputs to the GPU.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);

  // View of the GPU input declared with the permuted shape.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));

  // GPU result from the view, CPU result from the transposed copy.
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(gemm, ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw, hbias), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Cleanup, in the original release order (the view sits between the two groups).
  ccv_nnc_tensor_t* const first_group[] = { ha, hw, hbias, hb, a, w, bias, b };
  const int first_count = (int)(sizeof(first_group) / sizeof(first_group[0]));
  for (int k = 0; k < first_count; k++)
    ccv_nnc_tensor_free(first_group[k]);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(bt);
}
2197
2198
TEST_CASE("generalized batched backward gemm with batch (2, 4) compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  // Batched GEMM backward where every dimension except the last two is a batch dimension.
  // The GPU pass reads / writes strided views that swap axes 1 and 2 of the underlying
  // tensors, while the CPU pass works on explicitly transposed tensors. Once the CPU
  // gradients are transposed back, both passes have to agree.
  const int a_count = 2 * 10 * 4 * 128;
  const int w_count = 2 * 64 * 4 * 128;
  const int b_count = 2 * 4 * 10 * 64;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side inputs plus the buffers the GPU gradients are copied back into.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference tensors: transposed inputs, their gradients, and the gradients
  // transposed back to the original layout.
  ccv_nnc_tensor_t* const at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* const wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* const dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* const tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  // Random fill; the draw order (w, a, b) determines the values, so it must not change.
  for (int k = 0; k < w_count; k++)
  {
    hw->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (int k = 0; k < a_count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (int k = 0; k < b_count; k++)
  {
    hb->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  // Materialize the axis-swapped inputs for the CPU reference.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  // Upload the untransposed inputs to the device.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hb),
    TENSOR_LIST(a, w, b), 0);
  // Strided views presenting the device tensors with axes 1 and 2 swapped.
  ccv_nnc_tensor_view_t* const av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  // GPU pass on the views.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv),
    TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv), 0);
  // CPU reference pass on the transposed tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hb, at, wt),
    TENSOR_LIST(dat, dwt), 0);
  // Bring the reference gradients back to the original axis order.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
  // Download the GPU gradients and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(da, dw),
    TENSOR_LIST(hda, hdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_view_free(dwv);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(wt);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(dwt);
  ccv_nnc_tensor_free(tdw);
}
2263
2264
TEST_CASE("generalized batched backward gemm with batch (2, 4) and broadcast compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  // Batched GEMM backward where every dimension except the last two is a batch dimension,
  // and the weight is a plain 64x128 matrix shared by all batches. The GPU pass uses
  // strided views of a / da with axes 1 and 2 swapped; the CPU pass uses an explicitly
  // transposed copy. Both passes have to agree.
  const int w_count = 64 * 128;
  const int a_count = 2 * 10 * 4 * 128;
  const int b_count = 2 * 4 * 10 * 64;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side inputs plus the buffers the GPU gradients are copied back into.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference tensors.
  ccv_nnc_tensor_t* const at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* const tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  // Random fill; the draw order (w, a, b) determines the values, so it must not change.
  for (int k = 0; k < w_count; k++)
  {
    hw->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (int k = 0; k < a_count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (int k = 0; k < b_count; k++)
  {
    hb->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  // Materialize the axis-swapped input for the CPU reference.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  // Upload the untransposed inputs to the device.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hb),
    TENSOR_LIST(a, w, b), 0);
  // Strided views presenting a / da with axes 1 and 2 swapped.
  ccv_nnc_tensor_view_t* const av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // GPU pass on the views.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w),
    TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw), 0);
  // CPU reference pass; the weight gradient needs no transpose and lands in tdw directly.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hb, at, hw),
    TENSOR_LIST(dat, tdw), 0);
  // Bring the reference input gradient back to the original axis order.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  // Download the GPU gradients and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(da, dw),
    TENSOR_LIST(hda, hdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
}
2319
2320
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  // Batched GEMM backward where every dimension except the last two is a batch dimension,
  // additionally requesting the bias gradient. The GPU pass uses strided views with axes
  // 1 and 2 swapped; the CPU pass uses explicitly transposed tensors. Input, weight and
  // bias gradients all have to agree.
  const int w_count = 2 * 64 * 4 * 128;
  const int a_count = 2 * 10 * 4 * 128;
  const int b_count = 2 * 4 * 10 * 64;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side inputs plus the buffers the GPU gradients are copied back into.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference tensors: transposed inputs, their gradients, and the gradients
  // transposed back to the original layout.
  ccv_nnc_tensor_t* const at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* const wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* const dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* const tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* const tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* const tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  // Random fill; the draw order (w, a, b) determines the values, so it must not change.
  for (int k = 0; k < w_count; k++)
  {
    hw->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (int k = 0; k < a_count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (int k = 0; k < b_count; k++)
  {
    hb->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  // Materialize the axis-swapped inputs for the CPU reference.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  // Upload the untransposed inputs to the device.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hb),
    TENSOR_LIST(a, w, b), 0);
  // Strided views presenting the device tensors with axes 1 and 2 swapped.
  ccv_nnc_tensor_view_t* const av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* const dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  // GPU pass on the views; the third output receives the bias gradient.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv),
    TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv, dbias), 0);
  // CPU reference pass on the transposed tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hb, at, wt),
    TENSOR_LIST(dat, dwt, tdbias), 0);
  // Download the GPU gradients.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(da, dw, dbias),
    TENSOR_LIST(hda, hdw, hdbias), 0);
  // Bring the reference gradients back to the original axis order.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_view_free(dwv);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(wt);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(dwt);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
2392
2393
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias and broadcast compare cublas")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
  // Batched GEMM backward where every dimension except the last two is a batch dimension.
  // The weight is a plain 64x128 matrix shared by all batches and the bias gradient is
  // requested as well. The GPU pass uses strided views of a / da with axes 1 and 2
  // swapped; the CPU pass uses an explicitly transposed copy. All three gradients
  // (input, weight, bias) have to agree.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side inputs plus the buffers the GPU gradients are copied back into.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // CPU reference tensors.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  // Random fill; the draw order (w, a, b) determines the values, so it must not change.
  int i;
  for (i = 0; i < 64 * 128; i++)
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (i = 0; i < 8 * 10 * 128; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Materialize the axis-swapped input for the CPU reference, then upload the
  // untransposed inputs to the device.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Strided views presenting a / da with axes 1 and 2 swapped.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // Unlike the non-broadcast bias test, a fourth input (the bias-shaped tensor) is passed here.
  // NOTE(review): dbias / hdbias are not initialized before being used as inputs; presumably
  // only their presence / shape matters to the backward pass -- confirm.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w, dbias), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw, hdbias), TENSOR_LIST(dat, tdw, tdbias), 0);
  // Download the GPU gradients, and bring the reference input gradient back to the
  // original axis order before comparing.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
  // Cleanup: every tensor and view allocated above is released exactly once.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(dat);
  // tda was previously leaked: it was allocated and compared against but never freed.
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
2454
2455
TEST_CASE("ewdiv forward with reciprocal")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Runs EWDIV with the first input omitted (passed as 0) on both GPU and CPU
  // and checks that the two results match.
  const int count = 10 * 100;
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Random input scaled by 0.01.
  for (int k = 0; k < count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  // GPU path: upload, compute, then (below) download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(0, a),
    TENSOR_LIST(b), 0);
  // CPU reference.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(0, ha),
    TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
2479
2480
TEST_CASE("ewdiv forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Runs EWDIV on two random inputs on both GPU and CPU and checks the results match.
  const int count = 10 * 100;
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Random inputs scaled by 0.01; a is drawn before b.
  for (int k = 0; k < count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  for (int k = 0; k < count; k++)
  {
    hb->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  // GPU path: upload, compute, then (below) download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb),
    TENSOR_LIST(a, b), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a, b),
    TENSOR_LIST(c), 0);
  // CPU reference.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb),
    TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(ct);
}
2510
2511
TEST_CASE("ewdiv backward with output 1")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Runs EWDIV forward followed by EWDIV backward requesting only the first output
  // gradient (da), on both GPU and CPU, and checks that the gradients match.
  const int count = 10 * 100;
  // Device tensors.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Host tensors.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Random fill in the order a, b, g; a and b are scaled by 0.01.
  for (int k = 0; k < count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  for (int k = 0; k < count; k++)
  {
    hb->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  for (int k = 0; k < count; k++)
  {
    hg->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  // GPU path.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb, hg),
    TENSOR_LIST(a, b, g), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a, b),
    TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(g, 0, b),
    TENSOR_LIST(da), 0);
  // CPU reference.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb),
    TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hg, 0, hb),
    TENSOR_LIST(dat), 0);
  // Download the GPU gradient and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(dat);
}
2554
2555
TEST_CASE("ewdiv backward with output 2")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Runs EWDIV forward followed by EWDIV backward requesting only the second output
  // gradient (db), on both GPU and CPU, and checks that the gradients match.
  const int count = 10 * 100;
  // Device tensors.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Host tensors.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Random fill in the order a, b, g; a and b are scaled by 0.01.
  for (int k = 0; k < count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  for (int k = 0; k < count; k++)
  {
    hb->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  }
  for (int k = 0; k < count; k++)
  {
    hg->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  // GPU path; the backward pass also receives the forward output c.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb, hg),
    TENSOR_LIST(a, b, g), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a, b),
    TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(g, 0, b, c),
    TENSOR_LIST(0, db), 0);
  // CPU reference.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb),
    TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hg, 0, hb, hc),
    TENSOR_LIST(0, dbt), 0);
  // Download the GPU gradient and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(db), TENSOR_LIST(hdb), 0);
  REQUIRE_TENSOR_EQ(dbt, hdb, "GPU computed output should be the same as CPU computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hdb);
  ccv_nnc_tensor_free(dbt);
}
2598
2599
TEST_CASE("exp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Runs EWEXP forward on the same random input on both GPU and CPU and checks
  // that the two results match.
  const int count = 10 * 100;
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Random input: each draw is scaled by 10 and shifted by -1.
  for (int k = 0; k < count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  }
  // GPU path: upload, compute, then (below) download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a),
    TENSOR_LIST(b), 0);
  // CPU reference.
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha),
    TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
2623
2624
TEST_CASE("ewexp backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Runs EWEXP forward followed by EWEXP backward on both GPU and CPU and checks
  // that the resulting gradients match.
  const int count = 10 * 100;
  // Device tensors.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* const da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Host tensors.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* const dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Random fill: a (scaled by 10) is drawn before g.
  for (int k = 0; k < count; k++)
  {
    ha->data.f32[k] = dsfmt_genrand_open_close(&dsfmt) * 10;
  }
  for (int k = 0; k < count; k++)
  {
    hg->data.f32[k] = dsfmt_genrand_open_close(&dsfmt);
  }
  // GPU path; the backward pass receives the forward output b as its third input.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hg),
    TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a),
    TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_BACKWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(g, 0, b),
    TENSOR_LIST(da), 0);
  // CPU reference.
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha),
    TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_BACKWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hg, 0, hb),
    TENSOR_LIST(dat), 0);
  // Download the GPU gradient and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  // Cleanup.
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(dat);
}
2661
2662
TEST_CASE("ewpow forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWPOW_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), result copied back from the device (hc),
  // and the result computed directly on the host input (ct).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (c).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0 so every run sees the same data.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 2 + 0.1;
  }
  // Reference: raise the host input to the 3rd power.
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(ct), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(ct);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(a);
}
2686
2687
TEST_CASE("ewpow backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWPOW_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWPOW_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), incoming gradient (hg), host forward output (hc),
  // host gradient (dat), and the device gradient copied back (hda).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a), forward output (c), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Generator seeded with 0; the input is drawn first, then the gradient,
  // so the two arrays consume the random stream in that order.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  float* const hgp = hg->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 2 + 0.1;
  }
  for (i = 0; i < 10 * 100; i++)
  {
    hgp[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  }
  // Reference: forward then backward on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hc), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_BACKWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha), TENSOR_LIST(dat), 0);
  // Device path: upload both inputs, forward, backward, download the gradient.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_BACKWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a), TENSOR_LIST(da), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed dA should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(a);
}
2724
2725
TEST_CASE("ewsin forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSIN_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0 so every run sees the same data.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;
  }
  // Reference: sine of the host input.
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  // Compared element by element with an explicit 1e-3 tolerance rather than REQUIRE_TENSOR_EQ.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2749
2750
TEST_CASE("ewsin backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSIN_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWSIN_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), incoming gradient (hg), host forward output (hb),
  // host gradient (dat), and the device gradient copied back (hda).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Generator seeded with 0; the input is drawn first, then the gradient.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  float* const hgp = hg->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;
  }
  for (i = 0; i < 10 * 100; i++)
  {
    hgp[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  }
  // Reference: forward then backward on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha), TENSOR_LIST(dat), 0);
  // Device path: upload both inputs, forward, backward, download the gradient.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a), TENSOR_LIST(da), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  // Compared element by element with an explicit 1e-3 tolerance.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dat->data.f32, hda->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2787
2788
TEST_CASE("ewcos forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWCOS_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0 so every run sees the same data.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;
  }
  // Reference: cosine of the host input.
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  // Compared element by element with an explicit 1e-3 tolerance rather than REQUIRE_TENSOR_EQ.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2812
2813
TEST_CASE("ewcos backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWCOS_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWCOS_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), incoming gradient (hg), host forward output (hb),
  // host gradient (dat), and the device gradient copied back (hda).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Generator seeded with 0; the input is drawn first, then the gradient.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  float* const hgp = hg->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;
  }
  for (i = 0; i < 10 * 100; i++)
  {
    hgp[i] = dsfmt_genrand_open_close(&dsfmt) * 2 - 1;
  }
  // Reference: forward then backward on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha), TENSOR_LIST(dat), 0);
  // Device path: upload both inputs, forward, backward, download the gradient.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a), TENSOR_LIST(da), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  // Compared element by element with an explicit 1e-3 tolerance.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dat->data.f32, hda->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2850
2851
TEST_CASE("ewlog forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0; the 0.0001 offset is added to every sample.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
  }
  // Reference: logarithm of the host input.
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2875
2876
TEST_CASE("ewlog backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), incoming gradient (hg), host forward output (hb),
  // host gradient (dat), and the device gradient copied back (hda).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Generator seeded with 0; the input is drawn first, then the gradient.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  float* const hgp = hg->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
  }
  for (i = 0; i < 10 * 100; i++)
  {
    hgp[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  // Reference: forward then backward on the host tensors (backward reads the input, not the output).
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha), TENSOR_LIST(dat), 0);
  // Device path: upload both inputs, forward, backward, download the gradient.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a), TENSOR_LIST(da), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2913
2914
TEST_CASE("ewsqrt forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0; the 0.0001 offset is added to every sample.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
  }
  // Reference: square root of the host input.
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2938
2939
TEST_CASE("ewsqrt backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), incoming gradient (hg), host forward output (hb),
  // host gradient (dat), and the device gradient copied back (hda).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Generator seeded with 0; the input is drawn first, then the gradient.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  float* const hgp = hg->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
  }
  for (i = 0; i < 10 * 100; i++)
  {
    hgp[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  // Reference: forward then backward on the host tensors. The backward call is
  // given the forward output in the third slot and nothing in the second.
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
  // Device path: upload both inputs, forward, backward, download the gradient.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
2976
2977
TEST_CASE("ewabs forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0; samples are scaled by 10,
  // shifted down by 5 and nudged by 0.0001.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5 + 0.0001;
  }
  // Reference: absolute value of the host input.
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3001
3002
TEST_CASE("ewabs backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_EWABS_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Host tensors: input (ha), device gradient copied back (hda), host forward output (hb),
  // incoming gradient (hg), and the host-computed gradient (dat).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  // Use the same distribution as the "ewabs forward" case so the input contains
  // both negative and positive values. The previous `rand * 10` input never went
  // negative (assuming dsfmt_genrand_open_close yields non-negative samples), so
  // the sign-dependent part of the gradient was never compared between backends.
  // The 0.0001 nudge matches the forward case.
  for (i = 0; i < 1000; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5 + 0.0001;
  for (i = 0; i < 1000; i++)
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Device path: upload both inputs, forward, backward.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a), TENSOR_LIST(da), 0);
  // Reference: the same two commands on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha), TENSOR_LIST(dat), 0);
  // Download the device gradient and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(dat);
}
3039
3040
TEST_CASE("clamp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0; samples are scaled by 10 and shifted down by 1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  }
  // Reference: clamp the host input with bounds 0 and 6.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3064
3065
TEST_CASE("clamp backward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Host tensors: input (ha), device gradient copied back (hda), host forward output (hb),
  // incoming gradient (hg), and the host-computed gradient (dat).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  // Shift the input down by 1, as the "clamp forward" case does, so that some
  // elements fall below the lower bound 0 as well as above the upper bound 5.
  // The previous `rand * 10` input (assuming dsfmt_genrand_open_close yields
  // non-negative samples) only ever crossed the upper bound, leaving the
  // lower-bound side of the backward pass uncompared.
  for (i = 0; i < 1000; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  for (i = 0; i < 1000; i++)
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Device path: upload both inputs, forward, backward. The backward call is
  // given the forward output in the third slot and nothing in the second.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
  // Reference: the same two commands on the host tensors.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
  // Download the device gradient and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(dat);
}
3102
3103
TEST_CASE("clamp forward with only max")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), device result copied back (hb), host-computed result (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fill the input from a generator seeded with 0; samples are scaled by 10 and shifted down by 1.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  }
  // Reference: clamp with NAN passed as the lower bound and 6 as the upper bound.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  // Device path: upload, run the same command, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3127
3128
TEST_CASE("clamp backward with only max")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // Host tensors: input (ha), incoming gradient (hg), host forward output (hb),
  // host gradient (dat), and the device gradient copied back (hda).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: input (a), forward output (b), incoming gradient (g), gradient w.r.t. a (da).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Generator seeded with 0; the input is drawn first, then the gradient.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const hap = ha->data.f32;
  float* const hgp = hg->data.f32;
  int i;
  for (i = 0; i < 10 * 100; i++)
  {
    hap[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
  }
  for (i = 0; i < 10 * 100; i++)
  {
    hgp[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  // Reference: forward then backward on the host tensors, with NAN passed as the
  // lower bound and 5 as the upper bound. The backward call is given the forward
  // output in the third slot and nothing in the second.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
  // Device path: upload both inputs, forward, backward, download the gradient.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
  // Release everything allocated above.
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3165
3166
TEST_CASE("clamp forward with only min")
3167
1
{
3168
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3169
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
3170
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
3171
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3172
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3173
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3174
1
  dsfmt_t dsfmt;
3175
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3176
1
  int i;
3177
1.00k
  for (i = 0; i < 1000; 
i++1.00k
)
3178
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
3179
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
3180
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
3181
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
3182
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
3183
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
3184
1
  ccv_nnc_tensor_free(a);
3185
1
  ccv_nnc_tensor_free(b);
3186
1
  ccv_nnc_tensor_free(ha);
3187
1
  ccv_nnc_tensor_free(hb);
3188
1
  ccv_nnc_tensor_free(bt);
3189
1
}
3190
3191
TEST_CASE("clamp backward with only min")
3192
1
{
3193
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
3194
1
    ccv_nnc_cmd_ok(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
3195
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
3196
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
3197
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
3198
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
3199
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3200
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3201
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3202
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3203
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
3204
1
  dsfmt_t dsfmt;
3205
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3206
1
  int i;
3207
1.00k
  for (i = 0; i < 1000; 
i++1.00k
)
3208
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
3209
1.00k
  for (i = 0; i < 1000; 
i++1.00k
)
3210
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3211
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
3212
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
3213
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
3214
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
3215
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
3216
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
3217
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
3218
1
  ccv_nnc_tensor_free(a);
3219
1
  ccv_nnc_tensor_free(b);
3220
1
  ccv_nnc_tensor_free(g);
3221
1
  ccv_nnc_tensor_free(da);
3222
1
  ccv_nnc_tensor_free(ha);
3223
1
  ccv_nnc_tensor_free(hb);
3224
1
  ccv_nnc_tensor_free(hg);
3225
1
  ccv_nnc_tensor_free(hda);
3226
1
  ccv_nnc_tensor_free(dat);
3227
1
}
3228
3229
TEST_CASE("scaled dot product attention with flash_attn")
3230
1
{
3231
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3232
  // Bypass error: variable-sized object may not be initialized
3233
0
#define num_long_trials 4
3234
0
#define num_short_trials 2
3235
0
#define num_trials (num_long_trials + num_short_trials)
3236
3237
0
  for (int trial = 0; trial < num_trials; ++trial) {
3238
0
    int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1 };
3239
0
    int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5 };
3240
0
    int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5 };
3241
0
    int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32 };
3242
0
    int Hk_candidates[num_trials] = {   8,  8, 8, 8, 2, 8 };
3243
0
    int D_candidates[num_trials] = {  64, 40, 160, 224, 224, 128 };
3244
0
    int is_causal_candidates[num_trials] = {  1, 0, 1, 1, 0, 1 };
3245
3246
0
    int B = B_candidates[trial];
3247
0
    int R = R_candidates[trial];
3248
0
    int C = C_candidates[trial];
3249
0
    int Hq = Hq_candidates[trial];
3250
0
    int Hk = Hk_candidates[trial];
3251
0
    int D = D_candidates[trial];
3252
0
    int is_causal = is_causal_candidates[trial];
3253
0
    float scale = 1.0 / sqrt((float)D);
3254
3255
0
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3256
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3257
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
3258
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
3259
3260
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
3261
0
      q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
3262
0
    }
3263
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
3264
0
      k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
3265
0
    }
3266
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
3267
0
      v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
3268
0
    }
3269
3270
0
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3271
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, NULL, NULL, NULL), TENSOR_LIST(o_tensor, NULL), 0);
3272
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
3273
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
3274
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
3275
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0);
3276
3277
    // Why it there 000 in the beginning of the argument list for GPU_TENSOR_NHWC?
3278
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
3279
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
3280
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
3281
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
3282
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
3283
3284
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, NULL), 0);
3285
3286
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
3287
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0);
3288
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3289
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
3290
3291
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 3e-3, "GPU computed output should be the same as CPU computed ones");
3292
3293
0
    ccv_nnc_tensor_free(o_tensor);
3294
0
    ccv_nnc_tensor_free(gpu_o_tensor);
3295
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
3296
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor_f16);
3297
0
    ccv_nnc_tensor_free(q_tensor);
3298
0
    ccv_nnc_tensor_free(k_tensor);
3299
0
    ccv_nnc_tensor_free(v_tensor);
3300
0
    ccv_nnc_tensor_free(q_tensor_f16);
3301
0
    ccv_nnc_tensor_free(k_tensor_f16);
3302
0
    ccv_nnc_tensor_free(v_tensor_f16);
3303
0
    ccv_nnc_tensor_free(gpu_q_tensor);
3304
0
    ccv_nnc_tensor_free(gpu_k_tensor);
3305
0
    ccv_nnc_tensor_free(gpu_v_tensor);
3306
0
  }
3307
0
#undef num_long_trials
3308
0
#undef num_short_trials
3309
0
#undef num_trials
3310
0
}
3311
3312
TEST_CASE("scaled dot product attention with flash_attn in bfloat")
3313
1
{
3314
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3315
  // Bypass error: variable-sized object may not be initialized
3316
0
#define num_long_trials 4
3317
0
#define num_short_trials 2
3318
0
#define num_trials (num_long_trials + num_short_trials)
3319
3320
0
  for (int trial = 0; trial < num_trials; ++trial) {
3321
0
    int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1 };
3322
0
    int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5 };
3323
0
    int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5 };
3324
0
    int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32 };
3325
0
    int Hk_candidates[num_trials] = {   8,  8, 8, 8, 2, 8 };
3326
0
    int D_candidates[num_trials] = {  64, 40, 160, 224, 224, 128 };
3327
0
    int is_causal_candidates[num_trials] = {  1, 0, 1, 1, 0, 1 };
3328
3329
0
    int B = B_candidates[trial];
3330
0
    int R = R_candidates[trial];
3331
0
    int C = C_candidates[trial];
3332
0
    int Hq = Hq_candidates[trial];
3333
0
    int Hk = Hk_candidates[trial];
3334
0
    int D = D_candidates[trial];
3335
0
    int is_causal = is_causal_candidates[trial];
3336
0
    float scale = 1.0 / sqrt((float)D);
3337
3338
0
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3339
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3340
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
3341
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
3342
3343
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
3344
0
      q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
3345
0
    }
3346
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
3347
0
      k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
3348
0
    }
3349
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
3350
0
      v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
3351
0
    }
3352
3353
0
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3354
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, NULL, NULL, NULL), TENSOR_LIST(o_tensor, NULL), 0);
3355
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
3356
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
3357
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
3358
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0);
3359
3360
    // Why it there 000 in the beginning of the argument list for GPU_TENSOR_NHWC?
3361
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
3362
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
3363
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
3364
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
3365
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
3366
3367
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, NULL), 0);
3368
3369
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
3370
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0);
3371
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3372
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
3373
3374
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 1e-2, "GPU computed output should be the same as CPU computed ones");
3375
3376
0
    ccv_nnc_tensor_free(o_tensor);
3377
0
    ccv_nnc_tensor_free(gpu_o_tensor);
3378
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
3379
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor_f16);
3380
0
    ccv_nnc_tensor_free(q_tensor);
3381
0
    ccv_nnc_tensor_free(k_tensor);
3382
0
    ccv_nnc_tensor_free(v_tensor);
3383
0
    ccv_nnc_tensor_free(q_tensor_f16);
3384
0
    ccv_nnc_tensor_free(k_tensor_f16);
3385
0
    ccv_nnc_tensor_free(v_tensor_f16);
3386
0
    ccv_nnc_tensor_free(gpu_q_tensor);
3387
0
    ccv_nnc_tensor_free(gpu_k_tensor);
3388
0
    ccv_nnc_tensor_free(gpu_v_tensor);
3389
0
  }
3390
0
#undef num_long_trials
3391
0
#undef num_short_trials
3392
0
#undef num_trials
3393
0
}
3394
3395
TEST_CASE("scaled dot product attention + unify head with flash_attn")
3396
1
{
3397
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3398
0
  ccv_nnc_symbolic_graph_t* const sdp_symbolic_graph = ccv_nnc_symbolic_graph_new();
3399
0
  ccv_nnc_tensor_symbol_t q = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
3400
0
  ccv_nnc_tensor_symbol_t k = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
3401
0
  ccv_nnc_tensor_symbol_t v = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
3402
0
  ccv_nnc_tensor_symbol_t w = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
3403
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
3404
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
3405
0
  ccv_nnc_tensor_symbol_t r = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
3406
0
  ccv_nnc_graph_exec_symbol_new(sdp_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q, k, v, NO_TENSOR_SYMBOL, w, bias), TENSOR_SYMBOL_LIST(r, NO_TENSOR_SYMBOL, c), "scaled_dot_product_attention");
3407
0
  ccv_nnc_graph_exec_symbol_autogen(sdp_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3408
0
  ccv_nnc_graph_t* sdp_graph = 0;
3409
0
  ccv_nnc_tensor_arena_t* sdp_tensor_arena = 0;
3410
0
  ccv_nnc_graph_exec_arena_t* sdp_graph_exec_arena = 0;
3411
0
  ccv_nnc_symbolic_graph_compile(sdp_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(sdp_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(sdp_symbolic_graph), &sdp_graph, &sdp_tensor_arena, &sdp_graph_exec_arena);
3412
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, q);
3413
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, k);
3414
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, v);
3415
0
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, w);
3416
0
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, bias);
3417
0
  dsfmt_t dsfmt;
3418
0
  int i;
3419
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3420
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
3421
0
    q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3422
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
3423
0
    k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3424
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
3425
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3426
0
  for (i = 0; i < 512 * 512; i++)
3427
0
    w_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / sqrtf(512);
3428
0
  for (i = 0; i < 512; i++)
3429
0
    bias_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3430
0
  ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
3431
0
  ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
3432
0
  ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
3433
0
  ccv_nnc_tensor_t* const w_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 512, 512), 0);
3434
0
  ccv_nnc_tensor_t* const bias_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 512), 0);
3435
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, w_tensor, bias_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, w_tensor_f16, bias_tensor_f16), 0);
3436
0
  ccv_nnc_symbolic_graph_t* const g_symbolic_graph = ccv_nnc_symbolic_graph_new();
3437
0
  ccv_nnc_tensor_symbol_t gq = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "q");
3438
0
  ccv_nnc_tensor_symbol_t gk = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "k");
3439
0
  ccv_nnc_tensor_symbol_t gv = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "v");
3440
0
  ccv_nnc_tensor_symbol_t gw = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 512, 512), "w");
3441
0
  ccv_nnc_tensor_symbol_t gbias = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 512), "bias");
3442
0
  ccv_nnc_tensor_symbol_t gc = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "c");
3443
0
  ccv_nnc_tensor_symbol_t gr = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 512), "r");
3444
0
  ccv_nnc_graph_exec_symbol_new(g_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq, gk, gv, NO_TENSOR_SYMBOL, gw, gbias), TENSOR_SYMBOL_LIST(gr, NO_TENSOR_SYMBOL, gc), "scaled_dot_product_attention");
3445
0
  ccv_nnc_graph_exec_symbol_autogen(g_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3446
0
  ccv_nnc_graph_t* g_graph = 0;
3447
0
  ccv_nnc_tensor_arena_t* g_tensor_arena = 0;
3448
0
  ccv_nnc_graph_exec_arena_t* g_graph_exec_arena = 0;
3449
0
  ccv_nnc_symbolic_graph_compile(g_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(g_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(g_symbolic_graph), &g_graph, &g_tensor_arena, &g_graph_exec_arena);
3450
0
  ccv_nnc_tensor_t* const gq_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gq);
3451
0
  ccv_nnc_tensor_t* const gk_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gk);
3452
0
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gv);
3453
0
  ccv_nnc_tensor_t* const gw_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gw);
3454
0
  ccv_nnc_tensor_t* const gbias_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gbias);
3455
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, w_tensor_f16, bias_tensor_f16), TENSOR_LIST(gq_tensor, gk_tensor, gv_tensor, gw_tensor, gbias_tensor), 0);
3456
0
  ccv_nnc_graph_run(sdp_graph, 0, TRAVERSE_FULL, 0, 0);
3457
0
  ccv_nnc_graph_run(g_graph, 0, TRAVERSE_FULL, 0, 0);
3458
0
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, r);
3459
0
  ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, c);
3460
0
  ccv_nnc_tensor_t* const gc_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gc);
3461
0
  ccv_nnc_tensor_t* const gr_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gr);
3462
0
  ccv_nnc_tensor_t* const ho_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
3463
0
  ccv_nnc_tensor_t* const hr_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 512), 0);
3464
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc_tensor, gr_tensor), TENSOR_LIST(ho_f16, hr_f16), 0);
3465
0
  ccv_nnc_tensor_t* const ho = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), 0);
3466
0
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
3467
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ho_f16, hr_f16), TENSOR_LIST(ho, hr), 0);
3468
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, o_tensor->data.f32, ho->data.f32, 32 * 128 * 8 * 64, 3e-3, "graph computed result should match scaled dot product attention op result");
3469
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, r_tensor->data.f32, hr->data.f32, 32 * 128 * 512, 3e-2, "graph computed result should match scaled dot product attention op result");
3470
0
  ccv_nnc_symbolic_graph_free(sdp_symbolic_graph);
3471
0
  ccv_nnc_tensor_arena_free(sdp_tensor_arena);
3472
0
  ccv_nnc_graph_exec_arena_free(sdp_graph_exec_arena);
3473
0
  ccv_nnc_graph_free(sdp_graph);
3474
0
  ccv_nnc_symbolic_graph_free(g_symbolic_graph);
3475
0
  ccv_nnc_tensor_arena_free(g_tensor_arena);
3476
0
  ccv_nnc_graph_exec_arena_free(g_graph_exec_arena);
3477
0
  ccv_nnc_graph_free(g_graph);
3478
0
  ccv_nnc_tensor_free(ho);
3479
0
  ccv_nnc_tensor_free(hr);
3480
0
  ccv_nnc_tensor_free(ho_f16);
3481
0
  ccv_nnc_tensor_free(hr_f16);
3482
0
  ccv_nnc_tensor_free(q_tensor_f16);
3483
0
  ccv_nnc_tensor_free(k_tensor_f16);
3484
0
  ccv_nnc_tensor_free(v_tensor_f16);
3485
0
  ccv_nnc_tensor_free(w_tensor_f16);
3486
0
  ccv_nnc_tensor_free(bias_tensor_f16);
3487
0
}
3488
3489
TEST_CASE("scaled dot product attention + unify head with flash_attn in bfloat")
3490
1
{
3491
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
3492
0
  ccv_nnc_symbolic_graph_t* const sdp_symbolic_graph = ccv_nnc_symbolic_graph_new();
3493
0
  ccv_nnc_tensor_symbol_t q = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
3494
0
  ccv_nnc_tensor_symbol_t k = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
3495
0
  ccv_nnc_tensor_symbol_t v = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
3496
0
  ccv_nnc_tensor_symbol_t w = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
3497
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
3498
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
3499
0
  ccv_nnc_tensor_symbol_t r = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
3500
0
  ccv_nnc_graph_exec_symbol_new(sdp_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q, k, v, NO_TENSOR_SYMBOL, w, bias), TENSOR_SYMBOL_LIST(r, NO_TENSOR_SYMBOL, c), "scaled_dot_product_attention");
3501
0
  ccv_nnc_graph_exec_symbol_autogen(sdp_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3502
0
  ccv_nnc_graph_t* sdp_graph = 0;
3503
0
  ccv_nnc_tensor_arena_t* sdp_tensor_arena = 0;
3504
0
  ccv_nnc_graph_exec_arena_t* sdp_graph_exec_arena = 0;
3505
0
  ccv_nnc_symbolic_graph_compile(sdp_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(sdp_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(sdp_symbolic_graph), &sdp_graph, &sdp_tensor_arena, &sdp_graph_exec_arena);
3506
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, q);
3507
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, k);
3508
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, v);
3509
0
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, w);
3510
0
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, bias);
3511
0
  dsfmt_t dsfmt;
3512
0
  int i;
3513
0
  dsfmt_init_gen_rand(&dsfmt, 1);
3514
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
3515
0
    q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3516
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
3517
0
    k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3518
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
3519
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3520
0
  for (i = 0; i < 512 * 512; i++)
3521
0
    w_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / sqrtf(512);
3522
0
  for (i = 0; i < 512; i++)
3523
0
    bias_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3524
0
  ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 32, 128, 8, 64), 0);
3525
0
  ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 32, 128, 8, 64), 0);
3526
0
  ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 32, 128, 8, 64), 0);
3527
0
  ccv_nnc_tensor_t* const w_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 512, 512), 0);
3528
0
  ccv_nnc_tensor_t* const bias_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 512), 0);
3529
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, w_tensor, bias_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, w_tensor_f16, bias_tensor_f16), 0);
3530
0
  ccv_nnc_symbolic_graph_t* const g_symbolic_graph = ccv_nnc_symbolic_graph_new();
3531
0
  ccv_nnc_tensor_symbol_t gq = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 32, 128, 8, 64), "q");
3532
0
  ccv_nnc_tensor_symbol_t gk = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 32, 128, 8, 64), "k");
3533
0
  ccv_nnc_tensor_symbol_t gv = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 32, 128, 8, 64), "v");
3534
0
  ccv_nnc_tensor_symbol_t gw = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 512, 512), "w");
3535
0
  ccv_nnc_tensor_symbol_t gbias = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 512), "bias");
3536
0
  ccv_nnc_tensor_symbol_t gc = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 32, 128, 8, 64), "c");
3537
0
  ccv_nnc_tensor_symbol_t gr = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16BF, 32, 128, 512), "r");
3538
0
  ccv_nnc_graph_exec_symbol_new(g_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq, gk, gv, NO_TENSOR_SYMBOL, gw, gbias), TENSOR_SYMBOL_LIST(gr, NO_TENSOR_SYMBOL, gc), "scaled_dot_product_attention");
3539
0
  ccv_nnc_graph_exec_symbol_autogen(g_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3540
0
  ccv_nnc_graph_t* g_graph = 0;
3541
0
  ccv_nnc_tensor_arena_t* g_tensor_arena = 0;
3542
0
  ccv_nnc_graph_exec_arena_t* g_graph_exec_arena = 0;
3543
0
  ccv_nnc_symbolic_graph_compile(g_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(g_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(g_symbolic_graph), &g_graph, &g_tensor_arena, &g_graph_exec_arena);
3544
0
  ccv_nnc_tensor_t* const gq_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gq);
3545
0
  ccv_nnc_tensor_t* const gk_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gk);
3546
0
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gv);
3547
0
  ccv_nnc_tensor_t* const gw_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gw);
3548
0
  ccv_nnc_tensor_t* const gbias_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gbias);
3549
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, w_tensor_f16, bias_tensor_f16), TENSOR_LIST(gq_tensor, gk_tensor, gv_tensor, gw_tensor, gbias_tensor), 0);
3550
0
  ccv_nnc_graph_run(sdp_graph, 0, TRAVERSE_FULL, 0, 0);
3551
0
  ccv_nnc_graph_run(g_graph, 0, TRAVERSE_FULL, 0, 0);
3552
0
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, r);
3553
0
  ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, c);
3554
0
  ccv_nnc_tensor_t* const gc_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gc);
3555
0
  ccv_nnc_tensor_t* const gr_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gr);
3556
0
  ccv_nnc_tensor_t* const ho_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 32, 128, 8, 64), 0);
3557
0
  ccv_nnc_tensor_t* const hr_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 32, 128, 512), 0);
3558
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc_tensor, gr_tensor), TENSOR_LIST(ho_f16, hr_f16), 0);
3559
0
  ccv_nnc_tensor_t* const ho = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), 0);
3560
0
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
3561
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ho_f16, hr_f16), TENSOR_LIST(ho, hr), 0);
3562
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, o_tensor->data.f32, ho->data.f32, 32 * 128 * 8 * 64, 1e-2, "graph computed result should match scaled dot product attention op result");
3563
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, r_tensor->data.f32, hr->data.f32, 32 * 128 * 512, 1e-1, "graph computed result should match scaled dot product attention op result");
3564
0
  ccv_nnc_symbolic_graph_free(sdp_symbolic_graph);
3565
0
  ccv_nnc_tensor_arena_free(sdp_tensor_arena);
3566
0
  ccv_nnc_graph_exec_arena_free(sdp_graph_exec_arena);
3567
0
  ccv_nnc_graph_free(sdp_graph);
3568
0
  ccv_nnc_symbolic_graph_free(g_symbolic_graph);
3569
0
  ccv_nnc_tensor_arena_free(g_tensor_arena);
3570
0
  ccv_nnc_graph_exec_arena_free(g_graph_exec_arena);
3571
0
  ccv_nnc_graph_free(g_graph);
3572
0
  ccv_nnc_tensor_free(ho);
3573
0
  ccv_nnc_tensor_free(hr);
3574
0
  ccv_nnc_tensor_free(ho_f16);
3575
0
  ccv_nnc_tensor_free(hr_f16);
3576
0
  ccv_nnc_tensor_free(q_tensor_f16);
3577
0
  ccv_nnc_tensor_free(k_tensor_f16);
3578
0
  ccv_nnc_tensor_free(v_tensor_f16);
3579
0
  ccv_nnc_tensor_free(w_tensor_f16);
3580
0
  ccv_nnc_tensor_free(bias_tensor_f16);
3581
0
}
3582
3583
TEST_CASE("scaled dot product attention gradient with flash_attn")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
  // One row per trial. q / do / dq / o are shaped (B, R, Hq, D); k / v / dk / dv are shaped
  // (B, C, Hk, D). Rows 6-11 repeat rows 0-5 with the deterministic flag switched on.
  static const struct {
    int B, R, C, Hq, Hk, D, is_causal, deterministic;
  } trials[] = {
    { 32, 160, 128,  8, 8,  64, 1, 0 },
    { 12, 256, 128,  8, 8,  40, 0, 0 },
    { 16, 128, 128,  8, 8, 160, 1, 0 },
    {  1,  77, 128,  8, 8, 192, 1, 0 },
    {  2,  77, 128,  8, 2, 256, 0, 0 },
    {  1,   5,   5, 32, 8, 128, 1, 0 },
    { 32, 160, 128,  8, 8,  64, 1, 1 },
    { 12, 256, 128,  8, 8,  40, 0, 1 },
    { 16, 128, 128,  8, 8, 160, 1, 1 },
    {  1,  77, 128,  8, 8, 192, 1, 1 },
    {  2,  77, 128,  8, 2, 256, 0, 1 },
    {  1,   5,   5, 32, 8, 128, 1, 1 },
  };
  const int trial_count = (int)(sizeof(trials) / sizeof(trials[0]));

  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 10);
  int trial;
  for (trial = 0; trial < trial_count; trial++) {
    const int B = trials[trial].B;
    const int R = trials[trial].R;
    const int C = trials[trial].C;
    const int Hq = trials[trial].Hq;
    const int Hk = trials[trial].Hk;
    const int D = trials[trial].D;
    const int is_causal = trials[trial].is_causal;
    const int deterministic = trials[trial].deterministic;
    const float scale = 1.0 / sqrt((float)D);
    const int qo_count = B * R * Hq * D; // elements in a (B, R, Hq, D) tensor
    const int kv_count = B * C * Hk * D; // elements in a (B, C, Hk, D) tensor
    int i;

    // 32F host inputs and the host-side gradient outputs used as the reference.
    ccv_nnc_tensor_t* const host_q = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const host_k = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const host_v = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const host_do = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);

    // Random fill; the draw order (q, k, v, do) fixes which values each tensor receives.
    for (i = 0; i < qo_count; i++)
      host_q->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      host_k->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      host_v->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < qo_count; i++)
      host_do->data.f32[i] = dsfmt_genrand_open_close(&rng);

    // Reference gradients from the backward command run on the 32F host tensors.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(host_do, 0, 0, host_q, host_k, host_v), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);

    // Half-precision copies of the inputs, staged on the host before upload.
    ccv_nnc_tensor_t* const half_q = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const half_k = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const half_v = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const half_do = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_q, host_k, host_v, host_do), TENSOR_LIST(half_q, half_k, half_v, half_do), 0);

    // 16F device tensors: inputs, forward output and the three gradients.
    ccv_nnc_tensor_t* const dev_q = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dev_k = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dev_v = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dev_o = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dev_do = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dev_dq = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dev_dk = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dev_dv = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_q, half_k, half_v, half_do), TENSOR_LIST(dev_q, dev_k, dev_v, dev_do), 0);

    // The forward pass yields o and the 32F (B, Hq, R) softmax_lse tensor that backward consumes.
    ccv_nnc_tensor_t* const dev_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_q, dev_k, dev_v, NULL, NULL, NULL), TENSOR_LIST(dev_o, dev_softmax_lse), 0);

    // Device backward pass, with the per-trial deterministic flag applied to the command.
    ccv_nnc_cmd_t backward = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
    backward.info.scaled_dot_product_attention.deterministic = deterministic;
    ccv_nnc_cmd_exec(backward, ccv_nnc_no_hint, 0, TENSOR_LIST(dev_do, 0, 0, dev_q, dev_k, dev_v, 0, 0, 0, dev_o, dev_softmax_lse), TENSOR_LIST(dev_dq, dev_dk, dev_dv), 0);

    // Download the 16F gradients, then widen them to 32F for comparison.
    ccv_nnc_tensor_t* const back16_dq = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const back16_dk = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const back16_dv = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_dq, dev_dk, dev_dv), TENSOR_LIST(back16_dq, back16_dk, back16_dv), 0);

    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(back16_dq, back16_dk, back16_dv), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);

    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 3e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 6e-3, "GPU computed output should be the same as CPU computed ones");

    // Host 32F tensors.
    ccv_nnc_tensor_free(host_q);
    ccv_nnc_tensor_free(host_k);
    ccv_nnc_tensor_free(host_v);
    ccv_nnc_tensor_free(host_do);
    ccv_nnc_tensor_free(dq_tensor);
    ccv_nnc_tensor_free(dk_tensor);
    ccv_nnc_tensor_free(dv_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
    // Host 16F staging tensors.
    ccv_nnc_tensor_free(half_q);
    ccv_nnc_tensor_free(half_k);
    ccv_nnc_tensor_free(half_v);
    ccv_nnc_tensor_free(half_do);
    ccv_nnc_tensor_free(back16_dq);
    ccv_nnc_tensor_free(back16_dk);
    ccv_nnc_tensor_free(back16_dv);
    // Device tensors.
    ccv_nnc_tensor_free(dev_q);
    ccv_nnc_tensor_free(dev_k);
    ccv_nnc_tensor_free(dev_v);
    ccv_nnc_tensor_free(dev_o);
    ccv_nnc_tensor_free(dev_do);
    ccv_nnc_tensor_free(dev_dq);
    ccv_nnc_tensor_free(dev_dk);
    ccv_nnc_tensor_free(dev_dv);
    ccv_nnc_tensor_free(dev_softmax_lse);
  }
}
3703
3704
// Runs CMD_CMUL_FORWARD through a compiled graph on 20x10 32F GPU tensors and checks the
// result against the same command executed directly on the host tensors.
TEST_CASE("cmul in float")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Single-node symbolic graph: inputs a and b, output c.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "c");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host inputs filled from a fixed-seed generator so every run sees the same data.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 20 * 10; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 20 * 10; i++)
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the inputs into the arena tensors bound to a and b, then run the graph.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the graph output into z_tensor.
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
  // Reference: the same command executed directly on the host tensors.
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
  // The message used to read "gelu from cudnn", copied from another test.
  REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match from CPU");
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(z_tensor);
  ccv_nnc_tensor_free(tz);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
3748
3749
// Runs CMD_CMUL_FORWARD through a compiled graph on 20x10 16F GPU tensors and checks the
// result, within 2e-3, against the same command executed on the 32F host tensors.
TEST_CASE("cmul in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Single-node symbolic graph: inputs a and b, output c, all 16F.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "c");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // 32F host inputs filled from a fixed-seed generator.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 20 * 10; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 20 * 10; i++)
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Each input is narrowed to 16F on the host first, then uploaded into its arena tensor.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(y16_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(b_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the 16F output and widen it to 32F for comparison.
  ccv_nnc_tensor_t* const z16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z16_tensor), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z16_tensor), TENSOR_LIST(z_tensor), 0);
  // Reference: the same command executed directly on the 32F host tensors.
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
  // The message used to read "gelu from cudnn", copied from another test.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tz->data.f32, z_tensor->data.f32, 20 * 10, 2e-3, "cmul from GPU should match from CPU");
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(x16_tensor);
  ccv_nnc_tensor_free(y16_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(z16_tensor);
  ccv_nnc_tensor_free(z_tensor);
  ccv_nnc_tensor_free(tz);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
3802
3803
// Runs CMD_CMUL_FORWARD through a compiled graph with a (1, 5, 8, 128) input and a
// (1, 5, 1, 128) input whose third dimension is 1, and checks the result against the
// same command executed directly on the host tensors.
TEST_CASE("cmul in float, broadcast semantics")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Single-node symbolic graph: inputs a and b, output c shaped like a.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 5, 8, 128), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 5, 1, 128), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 5, 8, 128), "c");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host inputs filled from a fixed-seed generator.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 8, 128), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 1, 128), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 1 * 5 * 8 * 128; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 1 * 5 * 1 * 128; i++)
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the inputs into the arena tensors bound to a and b, then run the graph.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the graph output into z_tensor.
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 8, 128), 0);
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
  // Reference: the same command executed directly on the host tensors.
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 8, 128), 0);
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
  // The message used to read "gelu from cudnn", copied from another test.
  REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match from CPU");
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(z_tensor);
  ccv_nnc_tensor_free(tz);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
3847
3848
// Same shape pattern as the broadcast test, but with 70000 in the second dimension:
// a / c are (1, 70000, 8, 16) and b is (1, 70000, 1, 16).
// NOTE(review): 70000 > 65535 presumably targets a GPU launch-dimension limit; confirm
// against the kernel implementation.
TEST_CASE("cmul in float, broadcast semantics with longer than 65535 sequence")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Single-node symbolic graph: inputs a and b, output c shaped like a.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 70000, 8, 16), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 70000, 1, 16), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 70000, 8, 16), "c");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host inputs filled from a fixed-seed generator.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 8, 16), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 1, 16), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 1 * 70000 * 8 * 16; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 1 * 70000 * 1 * 16; i++)
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the inputs into the arena tensors bound to a and b, then run the graph.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the graph output into z_tensor.
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 8, 16), 0);
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
  // Reference: the same command executed directly on the host tensors.
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 8, 16), 0);
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
  // The message used to read "gelu from cudnn", copied from another test.
  REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match from CPU");
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(z_tensor);
  ccv_nnc_tensor_free(tz);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
3892
3893
// Broadcast variant where the second input is size 1 in both the first and third
// dimensions: a / c are (2, 40000, 8, 16) and b is (1, 40000, 1, 16).
// NOTE(review): per the test name, 2 * 40000 is meant to exceed 65535; confirm which
// GPU limit this targets against the kernel implementation.
TEST_CASE("cmul in float, broadcast semantics with longer than 65535 sequence and more than 1 batch size")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
  // Single-node symbolic graph: inputs a and b, output c shaped like a.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 2, 40000, 8, 16), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 40000, 1, 16), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 2, 40000, 8, 16), "c");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host inputs filled from a fixed-seed generator.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 40000, 8, 16), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 40000, 1, 16), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 2 * 40000 * 8 * 16; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 1 * 40000 * 1 * 16; i++)
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the inputs into the arena tensors bound to a and b, then run the graph.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download the graph output into z_tensor.
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 40000, 8, 16), 0);
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
  // Reference: the same command executed directly on the host tensors.
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 40000, 8, 16), 0);
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
  // The message used to read "gelu from cudnn", copied from another test.
  REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match from CPU");
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(z_tensor);
  ccv_nnc_tensor_free(tz);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
3937
3938
// Runs CMD_CMUL_BACKWARD through a compiled graph on 20x10 32F GPU tensors (inputs a, b, c;
// outputs d, e) and checks both outputs against the same command executed directly on the
// host tensors.
TEST_CASE("cmul gradient in float")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Single-node symbolic graph: three inputs, two outputs.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "c");
  ccv_nnc_tensor_symbol_t d = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "d");
  ccv_nnc_tensor_symbol_t e = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "e");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_BACKWARD(), TENSOR_SYMBOL_LIST(a, b, c), TENSOR_SYMBOL_LIST(d, e), "cmul");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
  // Host inputs filled from a fixed-seed generator.
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  int i;
  for (i = 0; i < 20 * 10; i++)
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 20 * 10; i++)
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 20 * 10; i++)
    z_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Upload the inputs into the arena tensors bound to a, b and c, then run the graph.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z_tensor), TENSOR_LIST(c_tensor), 0);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Download both graph outputs.
  ccv_nnc_tensor_t* const od_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const d_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, d);
  ccv_nnc_tensor_t* const oe_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const e_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, e);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d_tensor, e_tensor), TENSOR_LIST(od_tensor, oe_tensor), 0);
  // Reference: the same command executed directly on the host tensors.
  ccv_nnc_tensor_t* const td = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_tensor_t* const te = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
  ccv_nnc_cmd_exec(CMD_CMUL_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor, z_tensor), TENSOR_LIST(td, te), 0);
  // Distinct messages so a failure identifies which of the two outputs mismatched.
  REQUIRE_TENSOR_EQ(td, od_tensor, "first cmul gradient output (d) from GPU should match from CPU");
  REQUIRE_TENSOR_EQ(te, oe_tensor, "second cmul gradient output (e) from GPU should match from CPU");
  ccv_nnc_tensor_free(x_tensor);
  ccv_nnc_tensor_free(y_tensor);
  ccv_nnc_tensor_free(z_tensor);
  ccv_nnc_tensor_free(od_tensor);
  ccv_nnc_tensor_free(oe_tensor);
  ccv_nnc_tensor_free(td);
  ccv_nnc_tensor_free(te);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}
3996
3997
TEST_CASE("cmul gradient in half precision")
3998
1
{
3999
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS));
4000
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
4001
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
4002
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
4003
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "c");
4004
1
  ccv_nnc_tensor_symbol_t d = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "c");
4005
1
  ccv_nnc_tensor_symbol_t e = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "c");
4006
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_BACKWARD(), TENSOR_SYMBOL_LIST(a, b, c), TENSOR_SYMBOL_LIST(d, e), "cmul");
4007
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4008
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
4009
1
  ccv_nnc_graph_t* graph = 0;
4010
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
4011
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
4012
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
4013
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
4014
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4015
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4016
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4017
1
  dsfmt_t dsfmt;
4018
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4019
1
  int i;
4020
201
  for (i = 0; i < 20 * 10; 
i++200
)
4021
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4022
201
  for (i = 0; i < 20 * 10; 
i++200
)
4023
200
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4024
201
  for (i = 0; i < 20 * 10; 
i++200
)
4025
200
    z_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4026
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
4027
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4028
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
4029
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
4030
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
4031
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4032
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(y16_tensor), 0);
4033
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(b_tensor), 0);
4034
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
4035
1
  ccv_nnc_tensor_t* const z16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4036
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z_tensor), TENSOR_LIST(z16_tensor), 0);
4037
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z16_tensor), TENSOR_LIST(c_tensor), 0);
4038
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
4039
1
  ccv_nnc_tensor_t* const od16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4040
1
  ccv_nnc_tensor_t* const od_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4041
1
  ccv_nnc_tensor_t* const d_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, d);
4042
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d_tensor), TENSOR_LIST(od16_tensor), 0);
4043
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(od16_tensor), TENSOR_LIST(od_tensor), 0);
4044
1
  ccv_nnc_tensor_t* const oe16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
4045
1
  ccv_nnc_tensor_t* const oe_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4046
1
  ccv_nnc_tensor_t* const e_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, e);
4047
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(e_tensor), TENSOR_LIST(oe16_tensor), 0);
4048
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(oe16_tensor), TENSOR_LIST(oe_tensor), 0);
4049
1
  ccv_nnc_tensor_t* const td = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4050
1
  ccv_nnc_tensor_t* const te = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
4051
1
  ccv_nnc_cmd_exec(CMD_CMUL_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor, z_tensor), TENSOR_LIST(td, te), 0);
4052
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, td->data.f32, od_tensor->data.f32, 20 * 10, 2e-3, "gelu from cudnn should match from CPU");
4053
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, te->data.f32, oe_tensor->data.f32, 20 * 10, 2e-3, "gelu from cudnn should match from CPU");
4054
1
  ccv_nnc_tensor_free(x_tensor);
4055
1
  ccv_nnc_tensor_free(x16_tensor);
4056
1
  ccv_nnc_tensor_free(y_tensor);
4057
1
  ccv_nnc_tensor_free(y16_tensor);
4058
1
  ccv_nnc_tensor_free(z_tensor);
4059
1
  ccv_nnc_tensor_free(z16_tensor);
4060
1
  ccv_nnc_tensor_free(od_tensor);
4061
1
  ccv_nnc_tensor_free(od16_tensor);
4062
1
  ccv_nnc_tensor_free(td);
4063
1
  ccv_nnc_tensor_free(oe_tensor);
4064
1
  ccv_nnc_tensor_free(oe16_tensor);
4065
1
  ccv_nnc_tensor_free(te);
4066
1
  ccv_nnc_graph_free(graph);
4067
1
  ccv_nnc_tensor_arena_free(tensor_arena);
4068
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
4069
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
4070
1
}
4071
4072
TEST_CASE("segmented gemm")
4073
1
{
4074
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) || ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
4075
1
  dsfmt_t dsfmt;
4076
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4077
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 128), 0);
4078
1
  ccv_nnc_tensor_t* hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4079
1
  hindices->data.i32[0] = 0;
4080
1
  hindices->data.i32[1] = 2;
4081
1
  hindices->data.i32[2] = 1;
4082
1
  ccv_nnc_tensor_t* hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4083
1
  hcounts->data.i32[0] = 20;
4084
1
  hcounts->data.i32[1] = 25;
4085
1
  hcounts->data.i32[2] = 35;
4086
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64, 128), 0);
4087
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4088
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4089
1
  int i;
4090
24.5k
  for (i = 0; i < 3 * 64 * 128; 
i++24.5k
)
4091
24.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
4092
10.2k
  for (i = 0; i < 80 * 128; 
i++10.2k
)
4093
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4094
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 128), 0);
4095
1
  ccv_nnc_tensor_t* indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4096
1
  ccv_nnc_tensor_t* counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4097
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 64, 128), 0);
4098
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 64), 0);
4099
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(a, indices, counts, w), 0);
4100
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
4101
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
4102
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
4103
1
  REQUIRE_TENSOR_EQ(hb, bt, "should match from CPU");
4104
1
  ccv_nnc_tensor_free(a);
4105
1
  ccv_nnc_tensor_free(indices);
4106
1
  ccv_nnc_tensor_free(counts);
4107
1
  ccv_nnc_tensor_free(w);
4108
1
  ccv_nnc_tensor_free(b);
4109
1
  ccv_nnc_tensor_free(ha);
4110
1
  ccv_nnc_tensor_free(hindices);
4111
1
  ccv_nnc_tensor_free(hcounts);
4112
1
  ccv_nnc_tensor_free(hw);
4113
1
  ccv_nnc_tensor_free(hb);
4114
1
  ccv_nnc_tensor_free(bt);
4115
1
}
4116
4117
TEST_CASE("segmented gemm with bias")
4118
1
{
4119
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) || ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
4120
1
  dsfmt_t dsfmt;
4121
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4122
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 128), 0);
4123
1
  ccv_nnc_tensor_t* hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4124
1
  hindices->data.i32[0] = 0;
4125
1
  hindices->data.i32[1] = 1;
4126
1
  hindices->data.i32[2] = 2;
4127
1
  ccv_nnc_tensor_t* hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4128
1
  hcounts->data.i32[0] = 20;
4129
1
  hcounts->data.i32[1] = 25;
4130
1
  hcounts->data.i32[2] = 35;
4131
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64, 128), 0);
4132
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64), 0);
4133
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4134
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4135
1
  int i;
4136
24.5k
  for (i = 0; i < 3 * 64 * 128; 
i++24.5k
)
4137
24.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
4138
193
  for (i = 0; i < 3 * 64; 
i++192
)
4139
192
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
4140
10.2k
  for (i = 0; i < 80 * 128; 
i++10.2k
)
4141
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4142
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 128), 0);
4143
1
  ccv_nnc_tensor_t* indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4144
1
  ccv_nnc_tensor_t* counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4145
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 64, 128), 0);
4146
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 64), 0);
4147
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 64), 0);
4148
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw, hbias), TENSOR_LIST(a, indices, counts, w, bias), 0);
4149
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
4150
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
4151
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw, hbias), TENSOR_LIST(bt), 0);
4152
1
  REQUIRE_TENSOR_EQ(hb, bt, "should match from CPU");
4153
1
  ccv_nnc_tensor_free(a);
4154
1
  ccv_nnc_tensor_free(indices);
4155
1
  ccv_nnc_tensor_free(counts);
4156
1
  ccv_nnc_tensor_free(w);
4157
1
  ccv_nnc_tensor_free(bias);
4158
1
  ccv_nnc_tensor_free(b);
4159
1
  ccv_nnc_tensor_free(ha);
4160
1
  ccv_nnc_tensor_free(hindices);
4161
1
  ccv_nnc_tensor_free(hcounts);
4162
1
  ccv_nnc_tensor_free(hw);
4163
1
  ccv_nnc_tensor_free(hbias);
4164
1
  ccv_nnc_tensor_free(hb);
4165
1
  ccv_nnc_tensor_free(bt);
4166
1
}
4167
4168
TEST_CASE("segmented gemm in half precision")
4169
1
{
4170
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) || ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
4171
1
  dsfmt_t dsfmt;
4172
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4173
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 128), 0);
4174
1
  ccv_nnc_tensor_t* hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4175
1
  hindices->data.i32[0] = 0;
4176
1
  hindices->data.i32[1] = 2;
4177
1
  hindices->data.i32[2] = 1;
4178
1
  ccv_nnc_tensor_t* hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4179
1
  hcounts->data.i32[0] = 20;
4180
1
  hcounts->data.i32[1] = 25;
4181
1
  hcounts->data.i32[2] = 35;
4182
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64, 128), 0);
4183
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4184
1
  ccv_nnc_tensor_t* hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 80, 64), 0);
4185
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4186
1
  int i;
4187
24.5k
  for (i = 0; i < 3 * 64 * 128; 
i++24.5k
)
4188
24.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
4189
10.2k
  for (i = 0; i < 80 * 128; 
i++10.2k
)
4190
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4191
1
  ccv_nnc_tensor_t* ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 80, 128), 0);
4192
1
  ccv_nnc_tensor_t* hw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 64, 128), 0);
4193
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(ha16, hw16), 0);
4194
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 80, 128), 0);
4195
1
  ccv_nnc_tensor_t* indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4196
1
  ccv_nnc_tensor_t* counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4197
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 3, 64, 128), 0);
4198
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 80, 64), 0);
4199
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hindices, hcounts, hw16), TENSOR_LIST(a, indices, counts, w), 0);
4200
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
4201
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb16), 0);
4202
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hb16), TENSOR_LIST(hb), 0);
4203
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
4204
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 80 * 64, 1e-3, "should match from CPU");
4205
1
  ccv_nnc_tensor_free(a);
4206
1
  ccv_nnc_tensor_free(indices);
4207
1
  ccv_nnc_tensor_free(counts);
4208
1
  ccv_nnc_tensor_free(w);
4209
1
  ccv_nnc_tensor_free(b);
4210
1
  ccv_nnc_tensor_free(ha);
4211
1
  ccv_nnc_tensor_free(hindices);
4212
1
  ccv_nnc_tensor_free(hcounts);
4213
1
  ccv_nnc_tensor_free(hw);
4214
1
  ccv_nnc_tensor_free(hb);
4215
1
  ccv_nnc_tensor_free(ha16);
4216
1
  ccv_nnc_tensor_free(hw16);
4217
1
  ccv_nnc_tensor_free(hb16);
4218
1
  ccv_nnc_tensor_free(bt);
4219
1
}
4220
4221
TEST_CASE("segmented gemm with bias in half precision")
4222
1
{
4223
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) || ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
4224
1
  dsfmt_t dsfmt;
4225
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4226
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 128), 0);
4227
1
  ccv_nnc_tensor_t* hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4228
1
  hindices->data.i32[0] = 0;
4229
1
  hindices->data.i32[1] = 1;
4230
1
  hindices->data.i32[2] = 2;
4231
1
  ccv_nnc_tensor_t* hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4232
1
  hcounts->data.i32[0] = 20;
4233
1
  hcounts->data.i32[1] = 25;
4234
1
  hcounts->data.i32[2] = 35;
4235
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64, 128), 0);
4236
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64), 0);
4237
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4238
1
  ccv_nnc_tensor_t* hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 80, 64), 0);
4239
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4240
1
  int i;
4241
24.5k
  for (i = 0; i < 3 * 64 * 128; 
i++24.5k
)
4242
24.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
4243
193
  for (i = 0; i < 3 * 64; 
i++192
)
4244
192
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
4245
10.2k
  for (i = 0; i < 80 * 128; 
i++10.2k
)
4246
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4247
1
  ccv_nnc_tensor_t* ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 80, 128), 0);
4248
1
  ccv_nnc_tensor_t* hw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 64, 128), 0);
4249
1
  ccv_nnc_tensor_t* hbias16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 3, 64), 0);
4250
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(ha16, hw16, hbias16), 0);
4251
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 80, 128), 0);
4252
1
  ccv_nnc_tensor_t* indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4253
1
  ccv_nnc_tensor_t* counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4254
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 3, 64, 128), 0);
4255
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 3, 64), 0);
4256
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 80, 64), 0);
4257
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hindices, hcounts, hw16, hbias16), TENSOR_LIST(a, indices, counts, w, bias), 0);
4258
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
4259
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb16), 0);
4260
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hb16), TENSOR_LIST(hb), 0);
4261
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw, hbias), TENSOR_LIST(bt), 0);
4262
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 80 * 64, 1e-3, "should match from CPU");
4263
1
  ccv_nnc_tensor_free(a);
4264
1
  ccv_nnc_tensor_free(indices);
4265
1
  ccv_nnc_tensor_free(counts);
4266
1
  ccv_nnc_tensor_free(w);
4267
1
  ccv_nnc_tensor_free(bias);
4268
1
  ccv_nnc_tensor_free(b);
4269
1
  ccv_nnc_tensor_free(ha);
4270
1
  ccv_nnc_tensor_free(hindices);
4271
1
  ccv_nnc_tensor_free(hcounts);
4272
1
  ccv_nnc_tensor_free(hw);
4273
1
  ccv_nnc_tensor_free(hbias);
4274
1
  ccv_nnc_tensor_free(hb);
4275
1
  ccv_nnc_tensor_free(ha16);
4276
1
  ccv_nnc_tensor_free(hw16);
4277
1
  ccv_nnc_tensor_free(hbias16);
4278
1
  ccv_nnc_tensor_free(hb16);
4279
1
  ccv_nnc_tensor_free(bt);
4280
1
}
4281
4282
TEST_CASE("segmented gemm, reuse")
4283
1
{
4284
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) || ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
4285
1
  dsfmt_t dsfmt;
4286
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4287
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 128), 0);
4288
1
  ccv_nnc_tensor_t* hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4289
1
  hindices->data.i32[0] = 0;
4290
1
  hindices->data.i32[1] = 1;
4291
1
  hindices->data.i32[2] = 2;
4292
1
  ccv_nnc_tensor_t* hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4293
1
  hcounts->data.i32[0] = 20;
4294
1
  hcounts->data.i32[1] = 30;
4295
1
  hcounts->data.i32[2] = 30;
4296
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 64, 128), 0);
4297
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4298
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 64), 0);
4299
1
  int i;
4300
24.5k
  for (i = 0; i < 3 * 64 * 128; 
i++24.5k
)
4301
24.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
4302
10.2k
  for (i = 0; i < 80 * 128; 
i++10.2k
)
4303
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4304
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 128), 0);
4305
1
  ccv_nnc_tensor_t* indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4306
1
  ccv_nnc_tensor_t* counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4307
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 64, 128), 0);
4308
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 64), 0);
4309
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(a, indices, counts, w), 0);
4310
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
4311
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
4312
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
4313
1
  REQUIRE_TENSOR_EQ(hb, bt, "should match from CPU");
4314
1
  ccv_nnc_tensor_free(a);
4315
1
  ccv_nnc_tensor_free(indices);
4316
1
  ccv_nnc_tensor_free(counts);
4317
1
  ccv_nnc_tensor_free(w);
4318
1
  ccv_nnc_tensor_free(b);
4319
1
  ccv_nnc_tensor_free(ha);
4320
1
  ccv_nnc_tensor_free(hindices);
4321
1
  ccv_nnc_tensor_free(hcounts);
4322
1
  ccv_nnc_tensor_free(hw);
4323
1
  ccv_nnc_tensor_free(hb);
4324
1
  ccv_nnc_tensor_free(bt);
4325
1
}
4326
4327
TEST_CASE("segmented gemm, large k")
4328
1
{
4329
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) || ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
4330
1
  dsfmt_t dsfmt;
4331
1
  dsfmt_init_gen_rand(&dsfmt, 0);
4332
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 5120), 0);
4333
1
  ccv_nnc_tensor_t* hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4334
1
  hindices->data.i32[0] = 0;
4335
1
  hindices->data.i32[1] = 1;
4336
1
  hindices->data.i32[2] = 2;
4337
1
  ccv_nnc_tensor_t* hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
4338
1
  hcounts->data.i32[0] = 20;
4339
1
  hcounts->data.i32[1] = 30;
4340
1
  hcounts->data.i32[2] = 30;
4341
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2560, 5120), 0);
4342
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 2560), 0);
4343
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 80, 2560), 0);
4344
1
  int i;
4345
39.3M
  for (i = 0; i < 3 * 2560 * 5120; 
i++39.3M
)
4346
39.3M
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 5120;
4347
409k
  for (i = 0; i < 80 * 5120; 
i++409k
)
4348
409k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4349
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 5120), 0);
4350
1
  ccv_nnc_tensor_t* indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4351
1
  ccv_nnc_tensor_t* counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
4352
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2560, 5120), 0);
4353
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 80, 2560), 0);
4354
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(a, indices, counts, w), 0);
4355
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
4356
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
4357
1
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
4358
1
  REQUIRE_TENSOR_EQ(hb, bt, "should match from CPU");
4359
1
  ccv_nnc_tensor_free(a);
4360
1
  ccv_nnc_tensor_free(indices);
4361
1
  ccv_nnc_tensor_free(counts);
4362
1
  ccv_nnc_tensor_free(w);
4363
1
  ccv_nnc_tensor_free(b);
4364
1
  ccv_nnc_tensor_free(ha);
4365
1
  ccv_nnc_tensor_free(hindices);
4366
1
  ccv_nnc_tensor_free(hcounts);
4367
1
  ccv_nnc_tensor_free(hw);
4368
1
  ccv_nnc_tensor_free(hb);
4369
1
  ccv_nnc_tensor_free(bt);
4370
1
}
4371
4372
#include "case_main.h"