Coverage Report

Created: 2024-12-10 23:11

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/cublas.tests.c
Line | Count | Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <3rdparty/dsfmt/dSFMT.h>
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
TEST_CASE("gemm no transpose")
15
1
{
16
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
17
1
  float ap[] = {
18
1
    1, 2,
19
1
    3, 4,
20
1
    5, 6,
21
1
    7, 8,
22
1
  };
23
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
24
1
  float bp[] = {
25
1
    7, 8, 9,
26
1
    10, 11, 12,
27
1
  };
28
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
29
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
30
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
31
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
32
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
33
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
34
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
35
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
36
1
  float ctp[] = {
37
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
38
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
39
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
40
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
41
1
  };
42
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
43
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
44
1
  ccv_nnc_tensor_free(a);
45
1
  ccv_nnc_tensor_free(b);
46
1
  ccv_nnc_tensor_free(c);
47
1
  ccv_nnc_tensor_free(ga);
48
1
  ccv_nnc_tensor_free(gb);
49
1
  ccv_nnc_tensor_free(gc);
50
1
}
51
52
TEST_CASE("gemm transpose a")
53
1
{
54
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
55
1
  float ap[] = {
56
1
    1, 3, 5, 7,
57
1
    2, 4, 6, 8,
58
1
  };
59
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
60
1
  float bp[] = {
61
1
    7, 8, 9,
62
1
    10, 11, 12,
63
1
  };
64
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
65
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
66
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
67
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
68
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
69
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
70
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
71
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
72
1
  float ctp[] = {
73
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
74
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
75
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
76
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
77
1
  };
78
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
79
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
80
1
  ccv_nnc_tensor_free(a);
81
1
  ccv_nnc_tensor_free(b);
82
1
  ccv_nnc_tensor_free(c);
83
1
  ccv_nnc_tensor_free(ga);
84
1
  ccv_nnc_tensor_free(gb);
85
1
  ccv_nnc_tensor_free(gc);
86
1
}
87
88
TEST_CASE("gemm transpose b")
89
1
{
90
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
91
1
  float ap[] = {
92
1
    1, 2,
93
1
    3, 4,
94
1
    5, 6,
95
1
    7, 8,
96
1
  };
97
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
98
1
  float bp[] = {
99
1
    7, 10,
100
1
    8, 11,
101
1
    9, 12,
102
1
  };
103
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
104
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
105
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
106
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
107
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
108
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
109
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
110
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
111
1
  float ctp[] = {
112
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
113
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
114
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
115
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
116
1
  };
117
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
118
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
119
1
  ccv_nnc_tensor_free(a);
120
1
  ccv_nnc_tensor_free(b);
121
1
  ccv_nnc_tensor_free(c);
122
1
  ccv_nnc_tensor_free(ga);
123
1
  ccv_nnc_tensor_free(gb);
124
1
  ccv_nnc_tensor_free(gc);
125
1
}
126
127
TEST_CASE("gemm transpose a and b")
128
1
{
129
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
130
1
  float ap[] = {
131
1
    1, 3, 5, 7,
132
1
    2, 4, 6, 8,
133
1
  };
134
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
135
1
  float bp[] = {
136
1
    7, 10,
137
1
    8, 11,
138
1
    9, 12,
139
1
  };
140
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
141
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
142
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
143
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
144
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
145
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
146
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
147
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
148
1
  float ctp[] = {
149
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
150
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
151
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
152
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
153
1
  };
154
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
155
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
156
1
  ccv_nnc_tensor_free(a);
157
1
  ccv_nnc_tensor_free(b);
158
1
  ccv_nnc_tensor_free(c);
159
1
  ccv_nnc_tensor_free(ga);
160
1
  ccv_nnc_tensor_free(gb);
161
1
  ccv_nnc_tensor_free(gc);
162
1
}
163
164
TEST_CASE("gemm no transpose with bias")
165
1
{
166
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
167
1
  float ap[] = {
168
1
    1, 2,
169
1
    3, 4,
170
1
    5, 6,
171
1
    7, 8,
172
1
  };
173
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
174
1
  float bp[] = {
175
1
    7, 8, 9,
176
1
    10, 11, 12,
177
1
  };
178
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
179
1
  float dp[] = {
180
1
    1, -1, 1,
181
1
    1, -1, 1,
182
1
    1, -1, 1,
183
1
    1, -1, 1,
184
1
  };
185
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
186
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
187
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
188
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
189
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
190
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
191
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
192
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
193
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
194
1
  float ctp[] = {
195
1
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
196
1
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
197
1
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
198
1
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
199
1
  };
200
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
201
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
202
1
  ccv_nnc_tensor_free(a);
203
1
  ccv_nnc_tensor_free(b);
204
1
  ccv_nnc_tensor_free(c);
205
1
  ccv_nnc_tensor_free(d);
206
1
  ccv_nnc_tensor_free(ga);
207
1
  ccv_nnc_tensor_free(gb);
208
1
  ccv_nnc_tensor_free(gc);
209
1
  ccv_nnc_tensor_free(gd);
210
1
}
211
212
TEST_CASE("gemm no transpose with bias and palettize weights")
213
1
{
214
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
215
1
  float ap[] = {
216
1
    1, 2,
217
1
    3, 4,
218
1
    5, 6,
219
1
    7, 8,
220
1
  };
221
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
222
1
  float bp[] = {
223
1
    7, 8, 9,
224
1
    10, 11, 12,
225
1
  };
226
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
227
1
  float dp[] = {
228
1
    1, -1, 1,
229
1
    1, -1, 1,
230
1
    1, -1, 1,
231
1
    1, -1, 1,
232
1
  };
233
1
  ccv_nnc_tensor_t* const pb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NHWC(32F, 2, 3), 4, 128), 0);
234
1
  (void)ccv_nnc_palettize(b->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, 6, 4, 128, pb->data.u8, ccv_nnc_tensor_data_size_without_padding(pb->info));
235
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
236
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
237
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
238
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NHWC(000, 32F, 2, 3), 4, 128), 0);
239
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
240
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
241
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, pb, d), TENSOR_LIST(ga, gb, gd), 0);
242
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
243
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
244
1
  float ctp[] = {
245
1
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
246
1
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
247
1
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
248
1
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
249
1
  };
250
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
251
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
252
1
  ccv_nnc_tensor_free(a);
253
1
  ccv_nnc_tensor_free(b);
254
1
  ccv_nnc_tensor_free(pb);
255
1
  ccv_nnc_tensor_free(c);
256
1
  ccv_nnc_tensor_free(d);
257
1
  ccv_nnc_tensor_free(ga);
258
1
  ccv_nnc_tensor_free(gb);
259
1
  ccv_nnc_tensor_free(gc);
260
1
  ccv_nnc_tensor_free(gd);
261
1
}
262
263
TEST_CASE("backward gemm with no transpose")
264
1
{
265
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
266
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
267
1
  float gp[] = {
268
1
    1, 2, 3,
269
1
    4, 5, 6,
270
1
    7, 8, 9,
271
1
    10, 11, 12,
272
1
  };
273
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
274
1
  float ap[] = {
275
1
    13, 14,
276
1
    15, 16,
277
1
    17, 18,
278
1
    19, 20,
279
1
  };
280
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
281
1
  float bp[] = {
282
1
    21, 22, 23,
283
1
    24, 25, 26,
284
1
  };
285
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
286
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
287
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
288
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
289
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
290
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
291
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
292
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
293
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
294
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
295
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
296
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
297
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
298
1
  float dbiastp[] = {
299
1
    22, 26, 30,
300
1
  };
301
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
302
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
303
1
  float htp[] = {
304
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
305
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
306
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
307
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
308
1
  };
309
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
310
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
311
1
  float dbtp[] = {
312
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
313
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
314
1
  };
315
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
316
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
317
1
  ccv_nnc_tensor_free(g);
318
1
  ccv_nnc_tensor_free(a);
319
1
  ccv_nnc_tensor_free(b);
320
1
  ccv_nnc_tensor_free(h);
321
1
  ccv_nnc_tensor_free(db);
322
1
  ccv_nnc_tensor_free(dbias);
323
1
  ccv_nnc_tensor_free(gg);
324
1
  ccv_nnc_tensor_free(ga);
325
1
  ccv_nnc_tensor_free(gb);
326
1
  ccv_nnc_tensor_free(gh);
327
1
  ccv_nnc_tensor_free(gdb);
328
1
  ccv_nnc_tensor_free(gdbias);
329
1
}
330
331
TEST_CASE("backward gemm with no transpose and palettize weights")
332
1
{
333
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
334
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
335
1
  float gp[] = {
336
1
    1, 2, 3,
337
1
    4, 5, 6,
338
1
    7, 8, 9,
339
1
    10, 11, 12,
340
1
  };
341
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
342
1
  float ap[] = {
343
1
    13, 14,
344
1
    15, 16,
345
1
    17, 18,
346
1
    19, 20,
347
1
  };
348
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
349
1
  float bp[] = {
350
1
    21, 22, 23,
351
1
    24, 25, 26,
352
1
  };
353
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
354
1
  ccv_nnc_tensor_t* const pb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NHWC(32F, 2, 3), 4, 128), 0);
355
1
  (void)ccv_nnc_palettize(b->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, 6, 4, 128, pb->data.u8, ccv_nnc_tensor_data_size_without_padding(pb->info));
356
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
357
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
358
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
359
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
360
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
361
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NHWC(000, 32F, 2, 3), 4, 128), 0);
362
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
363
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
364
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
365
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, pb), TENSOR_LIST(gg, ga, gb), 0);
366
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
367
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
368
1
  float dbiastp[] = {
369
1
    22, 26, 30,
370
1
  };
371
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
372
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
373
1
  float htp[] = {
374
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
375
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
376
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
377
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
378
1
  };
379
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
380
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
381
1
  float dbtp[] = {
382
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
383
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
384
1
  };
385
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
386
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
387
1
  ccv_nnc_tensor_free(g);
388
1
  ccv_nnc_tensor_free(a);
389
1
  ccv_nnc_tensor_free(b);
390
1
  ccv_nnc_tensor_free(pb);
391
1
  ccv_nnc_tensor_free(h);
392
1
  ccv_nnc_tensor_free(db);
393
1
  ccv_nnc_tensor_free(dbias);
394
1
  ccv_nnc_tensor_free(gg);
395
1
  ccv_nnc_tensor_free(ga);
396
1
  ccv_nnc_tensor_free(gb);
397
1
  ccv_nnc_tensor_free(gh);
398
1
  ccv_nnc_tensor_free(gdb);
399
1
  ccv_nnc_tensor_free(gdbias);
400
1
}
401
402
TEST_CASE("backward gemm with transpose a")
403
1
{
404
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
405
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
406
1
  float gp[] = {
407
1
    1, 2, 3,
408
1
    4, 5, 6,
409
1
    7, 8, 9,
410
1
    10, 11, 12,
411
1
  };
412
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
413
1
  float ap[] = {
414
1
    13, 15, 17, 19,
415
1
    14, 16, 18, 20,
416
1
  };
417
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
418
1
  float bp[] = {
419
1
    21, 22, 23,
420
1
    24, 25, 26,
421
1
  };
422
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
423
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
424
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
425
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
426
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
427
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
428
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
429
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
430
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
431
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
432
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
433
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
434
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
435
1
  float dbiastp[] = {
436
1
    22, 26, 30,
437
1
  };
438
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
439
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
440
1
  float htp[] = {
441
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
442
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
443
1
  };
444
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
445
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
446
1
  float dbtp[] = {
447
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
448
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
449
1
  };
450
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
451
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
452
1
  ccv_nnc_tensor_free(g);
453
1
  ccv_nnc_tensor_free(a);
454
1
  ccv_nnc_tensor_free(b);
455
1
  ccv_nnc_tensor_free(h);
456
1
  ccv_nnc_tensor_free(db);
457
1
  ccv_nnc_tensor_free(dbias);
458
1
  ccv_nnc_tensor_free(gg);
459
1
  ccv_nnc_tensor_free(ga);
460
1
  ccv_nnc_tensor_free(gb);
461
1
  ccv_nnc_tensor_free(gh);
462
1
  ccv_nnc_tensor_free(gdb);
463
1
  ccv_nnc_tensor_free(gdbias);
464
1
}
465
466
TEST_CASE("backward gemm with transpose b")
467
1
{
468
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
469
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
470
1
  float gp[] = {
471
1
    1, 2, 3,
472
1
    4, 5, 6,
473
1
    7, 8, 9,
474
1
    10, 11, 12,
475
1
  };
476
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
477
1
  float ap[] = {
478
1
    13, 14,
479
1
    15, 16,
480
1
    17, 18,
481
1
    19, 20,
482
1
  };
483
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
484
1
  float bp[] = {
485
1
    21, 24,
486
1
    22, 25,
487
1
    23, 26,
488
1
  };
489
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
490
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
491
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
492
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
493
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
494
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
495
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
496
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
497
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
498
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
499
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
500
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
501
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
502
1
  float dbiastp[] = {
503
1
    22, 26, 30,
504
1
  };
505
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
506
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
507
1
  float htp[] = {
508
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
509
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
510
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
511
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
512
1
  };
513
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
514
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
515
1
  float dbtp[] = {
516
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
517
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
518
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
519
1
  };
520
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
521
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
522
1
  ccv_nnc_tensor_free(g);
523
1
  ccv_nnc_tensor_free(a);
524
1
  ccv_nnc_tensor_free(b);
525
1
  ccv_nnc_tensor_free(h);
526
1
  ccv_nnc_tensor_free(db);
527
1
  ccv_nnc_tensor_free(dbias);
528
1
  ccv_nnc_tensor_free(gg);
529
1
  ccv_nnc_tensor_free(ga);
530
1
  ccv_nnc_tensor_free(gb);
531
1
  ccv_nnc_tensor_free(gh);
532
1
  ccv_nnc_tensor_free(gdb);
533
1
  ccv_nnc_tensor_free(gdbias);
534
1
}
535
536
TEST_CASE("backward gemm with transpose a and b")
537
1
{
538
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
539
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
540
1
  float gp[] = {
541
1
    1, 2, 3,
542
1
    4, 5, 6,
543
1
    7, 8, 9,
544
1
    10, 11, 12,
545
1
  };
546
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
547
1
  float ap[] = {
548
1
    13, 15, 17, 19,
549
1
    14, 16, 18, 20,
550
1
  };
551
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
552
1
  float bp[] = {
553
1
    21, 24,
554
1
    22, 25,
555
1
    23, 26,
556
1
  };
557
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
558
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
559
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
560
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
561
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
562
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
563
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
564
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
565
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
566
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
567
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
568
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
569
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
570
1
  float dbiastp[] = {
571
1
    22, 26, 30,
572
1
  };
573
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
574
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
575
1
  float htp[] = {
576
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
577
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
578
1
  };
579
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
580
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
581
1
  float dbtp[] = {
582
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
583
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
584
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
585
1
  };
586
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
587
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
588
1
  ccv_nnc_tensor_free(g);
589
1
  ccv_nnc_tensor_free(a);
590
1
  ccv_nnc_tensor_free(b);
591
1
  ccv_nnc_tensor_free(h);
592
1
  ccv_nnc_tensor_free(db);
593
1
  ccv_nnc_tensor_free(dbias);
594
1
  ccv_nnc_tensor_free(gg);
595
1
  ccv_nnc_tensor_free(ga);
596
1
  ccv_nnc_tensor_free(gb);
597
1
  ccv_nnc_tensor_free(gh);
598
1
  ccv_nnc_tensor_free(gdb);
599
1
  ccv_nnc_tensor_free(gdbias);
600
1
}
601
602
TEST_CASE("gemm no transpose batch 2")
603
1
{
604
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
605
1
  float ap[] = {
606
1
    1, 2,
607
1
    3, 4,
608
1
    5, 6,
609
1
    7, 8,
610
1
    2, 3,
611
1
    4, 5,
612
1
    6, 7,
613
1
    8, 9
614
1
  };
615
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
616
1
  float bp[] = {
617
1
    7, 8, 9,
618
1
    10, 11, 12,
619
1
  };
620
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
621
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
622
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
623
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
624
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
625
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
626
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
627
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
628
1
  float ctp[] = {
629
1
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
630
1
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
631
1
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
632
1
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
633
1
    2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
634
1
    4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
635
1
    6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
636
1
    8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
637
1
  };
638
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
639
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
640
1
  ccv_nnc_tensor_free(a);
641
1
  ccv_nnc_tensor_free(b);
642
1
  ccv_nnc_tensor_free(c);
643
1
  ccv_nnc_tensor_free(ga);
644
1
  ccv_nnc_tensor_free(gb);
645
1
  ccv_nnc_tensor_free(gc);
646
1
}
647
648
TEST_CASE("gemm transpose a batch 2")
649
1
{
650
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
651
1
  float ap[] = {
652
1
    1, 3, 5, 7,
653
1
    2, 4, 6, 8,
654
1
    2, 4, 6, 8,
655
1
    3, 5, 7, 9,
656
1
  };
657
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
658
1
  float bp[] = {
659
1
    7, 8, 9,
660
1
    10, 11, 12,
661
1
  };
662
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
663
1
  float dp[] = {
664
1
    -1, 0, 1,
665
1
  };
666
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 3), 0);
667
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
668
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
669
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
670
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
671
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
672
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
673
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
674
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
675
1
  float ctp[] = {
676
1
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
677
1
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
678
1
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
679
1
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
680
1
    2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
681
1
    4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
682
1
    6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
683
1
    8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
684
1
  };
685
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
686
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
687
1
  ccv_nnc_tensor_free(a);
688
1
  ccv_nnc_tensor_free(b);
689
1
  ccv_nnc_tensor_free(c);
690
1
  ccv_nnc_tensor_free(d);
691
1
  ccv_nnc_tensor_free(ga);
692
1
  ccv_nnc_tensor_free(gb);
693
1
  ccv_nnc_tensor_free(gc);
694
1
  ccv_nnc_tensor_free(gd);
695
1
}
696
697
TEST_CASE("gemm transpose b batch 2")
698
1
{
699
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
700
1
  float ap[] = {
701
1
    1, 2,
702
1
    3, 4,
703
1
    5, 6,
704
1
    7, 8,
705
1
    2, 3,
706
1
    4, 5,
707
1
    6, 7,
708
1
    8, 9
709
1
  };
710
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
711
1
  float bp[] = {
712
1
    7, 10,
713
1
    8, 11,
714
1
    9, 12,
715
1
    80, 110,
716
1
    90, 120,
717
1
    10, 13,
718
1
  };
719
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
720
1
  float dp[] = {
721
1
    -1, 0, 1,
722
1
    2, 3, -4,
723
1
  };
724
1
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
725
1
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
726
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
727
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
728
1
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
729
1
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
730
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
731
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
732
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
733
1
  float ctp[] = {
734
1
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
735
1
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
736
1
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
737
1
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
738
1
    2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
739
1
    4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
740
1
    6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
741
1
    8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
742
1
  };
743
1
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
744
1
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
745
1
  ccv_nnc_tensor_free(a);
746
1
  ccv_nnc_tensor_free(b);
747
1
  ccv_nnc_tensor_free(c);
748
1
  ccv_nnc_tensor_free(d);
749
1
  ccv_nnc_tensor_free(ga);
750
1
  ccv_nnc_tensor_free(gb);
751
1
  ccv_nnc_tensor_free(gc);
752
1
  ccv_nnc_tensor_free(gd);
753
1
}
754
755
TEST_CASE("backward gemm with no transpose batch 2, same b")
756
1
{
757
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
758
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
759
1
  float gp[] = {
760
1
    1, 2, 3,
761
1
    4, 5, 6,
762
1
    7, 8, 9,
763
1
    10, 11, 12,
764
1
    10, 20, 30,
765
1
    40, 50, 60,
766
1
    70, 80, 90,
767
1
    100, 110, 120,
768
1
  };
769
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
770
1
  float ap[] = {
771
1
    13, 14,
772
1
    15, 16,
773
1
    17, 18,
774
1
    19, 20,
775
1
    131, 141,
776
1
    151, 161,
777
1
    171, 181,
778
1
    191, 201,
779
1
  };
780
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
781
1
  float bp[] = {
782
1
    21, 22, 23,
783
1
    24, 25, 26,
784
1
  };
785
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
786
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
787
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
788
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
789
1
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
790
1
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
791
1
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
792
1
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
793
1
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
794
1
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
795
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
796
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
797
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
798
1
  float dbiastp[] = {
799
1
    22 + 220, 26 + 260, 30 + 300,
800
1
  };
801
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
802
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
803
1
  float htp[] = {
804
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
805
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
806
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
807
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
808
1
    10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
809
1
    40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
810
1
    70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
811
1
    100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
812
1
  };
813
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
814
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
815
1
  float dbtp[] = {
816
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
817
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
818
1
  };
819
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
820
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
821
1
  ccv_nnc_tensor_free(g);
822
1
  ccv_nnc_tensor_free(a);
823
1
  ccv_nnc_tensor_free(b);
824
1
  ccv_nnc_tensor_free(h);
825
1
  ccv_nnc_tensor_free(db);
826
1
  ccv_nnc_tensor_free(dbias);
827
1
  ccv_nnc_tensor_free(gg);
828
1
  ccv_nnc_tensor_free(ga);
829
1
  ccv_nnc_tensor_free(gb);
830
1
  ccv_nnc_tensor_free(gh);
831
1
  ccv_nnc_tensor_free(gdb);
832
1
  ccv_nnc_tensor_free(gdbias);
833
1
}
834
835
TEST_CASE("backward gemm with no transpose batch 2, batched b")
836
1
{
837
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
838
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
839
1
  float gp[] = {
840
1
    1, 2, 3,
841
1
    4, 5, 6,
842
1
    7, 8, 9,
843
1
    10, 11, 12,
844
1
    10, 20, 30,
845
1
    40, 50, 60,
846
1
    70, 80, 90,
847
1
    100, 110, 120,
848
1
  };
849
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
850
1
  float ap[] = {
851
1
    13, 14,
852
1
    15, 16,
853
1
    17, 18,
854
1
    19, 20,
855
1
    131, 141,
856
1
    151, 161,
857
1
    171, 181,
858
1
    191, 201,
859
1
  };
860
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
861
1
  float bp[] = {
862
1
    21, 22, 23,
863
1
    24, 25, 26,
864
1
    212, 222, 232,
865
1
    242, 252, 262,
866
1
  };
867
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
868
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
869
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
870
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
871
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
872
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
873
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
874
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
875
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
876
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
877
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
878
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
879
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
880
1
  float dbiastp[] = {
881
1
    22, 26, 30,
882
1
    220, 260, 300,
883
1
  };
884
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
885
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
886
1
  float htp[] = {
887
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
888
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
889
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
890
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
891
1
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
892
1
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
893
1
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
894
1
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
895
1
  };
896
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
897
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
898
1
  float dbtp[] = {
899
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
900
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
901
1
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
902
1
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
903
1
  };
904
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
905
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
906
1
  ccv_nnc_tensor_free(g);
907
1
  ccv_nnc_tensor_free(a);
908
1
  ccv_nnc_tensor_free(b);
909
1
  ccv_nnc_tensor_free(h);
910
1
  ccv_nnc_tensor_free(db);
911
1
  ccv_nnc_tensor_free(dbias);
912
1
  ccv_nnc_tensor_free(gg);
913
1
  ccv_nnc_tensor_free(ga);
914
1
  ccv_nnc_tensor_free(gb);
915
1
  ccv_nnc_tensor_free(gh);
916
1
  ccv_nnc_tensor_free(gdb);
917
1
  ccv_nnc_tensor_free(gdbias);
918
1
}
919
920
TEST_CASE("backward gemm with transpose a batch 2, same b")
921
1
{
922
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
923
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
924
1
  float gp[] = {
925
1
    1, 2, 3,
926
1
    4, 5, 6,
927
1
    7, 8, 9,
928
1
    10, 11, 12,
929
1
    10, 20, 30,
930
1
    40, 50, 60,
931
1
    70, 80, 90,
932
1
    100, 110, 120,
933
1
  };
934
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
935
1
  float ap[] = {
936
1
    13, 15, 17, 19,
937
1
    14, 16, 18, 20,
938
1
    131, 151, 171, 191,
939
1
    141, 161, 181, 201,
940
1
  };
941
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
942
1
  float bp[] = {
943
1
    21, 22, 23,
944
1
    24, 25, 26,
945
1
  };
946
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
947
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
948
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
949
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
950
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
951
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
952
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
953
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
954
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
955
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
956
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
957
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
958
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
959
1
  float dbiastp[] = {
960
1
    22 + 220, 26 + 260, 30 + 300,
961
1
  };
962
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
963
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
964
1
  float htp[] = {
965
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
966
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
967
1
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
968
1
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
969
1
  };
970
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
971
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
972
1
  float dbtp[] = {
973
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
974
1
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
975
1
  };
976
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
977
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
978
1
  ccv_nnc_tensor_free(g);
979
1
  ccv_nnc_tensor_free(a);
980
1
  ccv_nnc_tensor_free(b);
981
1
  ccv_nnc_tensor_free(h);
982
1
  ccv_nnc_tensor_free(db);
983
1
  ccv_nnc_tensor_free(dbias);
984
1
  ccv_nnc_tensor_free(gg);
985
1
  ccv_nnc_tensor_free(ga);
986
1
  ccv_nnc_tensor_free(gb);
987
1
  ccv_nnc_tensor_free(gh);
988
1
  ccv_nnc_tensor_free(gdb);
989
1
  ccv_nnc_tensor_free(gdbias);
990
1
}
991
992
TEST_CASE("backward gemm with transpose b batch 2, batched b")
993
1
{
994
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
995
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
996
1
  float gp[] = {
997
1
    1, 2, 3,
998
1
    4, 5, 6,
999
1
    7, 8, 9,
1000
1
    10, 11, 12,
1001
1
    10, 20, 30,
1002
1
    40, 50, 60,
1003
1
    70, 80, 90,
1004
1
    100, 110, 120,
1005
1
  };
1006
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1007
1
  float ap[] = {
1008
1
    13, 14,
1009
1
    15, 16,
1010
1
    17, 18,
1011
1
    19, 20,
1012
1
    131, 141,
1013
1
    151, 161,
1014
1
    171, 181,
1015
1
    191, 201,
1016
1
  };
1017
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1018
1
  float bp[] = {
1019
1
    21, 24,
1020
1
    22, 25,
1021
1
    23, 26,
1022
1
    212, 242,
1023
1
    222, 252,
1024
1
    232, 262,
1025
1
  };
1026
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1027
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1028
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1029
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1030
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1031
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1032
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1033
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1034
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1035
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
1036
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
1037
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
1038
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
1039
1
  float dbiastp[] = {
1040
1
    22, 26, 30,
1041
1
    220, 260, 300,
1042
1
  };
1043
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1044
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
1045
1
  float htp[] = {
1046
1
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
1047
1
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
1048
1
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
1049
1
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
1050
1
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
1051
1
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
1052
1
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
1053
1
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
1054
1
  };
1055
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1056
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
1057
1
  float dbtp[] = {
1058
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
1059
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
1060
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
1061
1
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
1062
1
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
1063
1
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
1064
1
  };
1065
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1066
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
1067
1
  ccv_nnc_tensor_free(g);
1068
1
  ccv_nnc_tensor_free(a);
1069
1
  ccv_nnc_tensor_free(b);
1070
1
  ccv_nnc_tensor_free(h);
1071
1
  ccv_nnc_tensor_free(db);
1072
1
  ccv_nnc_tensor_free(dbias);
1073
1
  ccv_nnc_tensor_free(gg);
1074
1
  ccv_nnc_tensor_free(ga);
1075
1
  ccv_nnc_tensor_free(gb);
1076
1
  ccv_nnc_tensor_free(gh);
1077
1
  ccv_nnc_tensor_free(gdb);
1078
1
  ccv_nnc_tensor_free(gdbias);
1079
1
}
1080
1081
TEST_CASE("backward gemm with transpose a and b batch 2, same b")
1082
1
{
1083
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1084
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1085
1
  float gp[] = {
1086
1
    1, 2, 3,
1087
1
    4, 5, 6,
1088
1
    7, 8, 9,
1089
1
    10, 11, 12,
1090
1
    10, 20, 30,
1091
1
    40, 50, 60,
1092
1
    70, 80, 90,
1093
1
    100, 110, 120,
1094
1
  };
1095
1
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1096
1
  float ap[] = {
1097
1
    13, 15, 17, 19,
1098
1
    14, 16, 18, 20,
1099
1
    131, 151, 171, 191,
1100
1
    141, 161, 181, 201,
1101
1
  };
1102
1
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1103
1
  float bp[] = {
1104
1
    21, 24,
1105
1
    22, 25,
1106
1
    23, 26,
1107
1
  };
1108
1
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1109
1
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1110
1
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1111
1
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
1112
1
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1113
1
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1114
1
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
1115
1
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1116
1
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
1117
1
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1118
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
1119
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
1120
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
1121
1
  float dbiastp[] = {
1122
1
    22 + 220, 26 + 260, 30 + 300,
1123
1
  };
1124
1
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
1125
1
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
1126
1
  float htp[] = {
1127
1
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
1128
1
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
1129
1
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
1130
1
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
1131
1
  };
1132
1
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1133
1
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
1134
1
  float dbtp[] = {
1135
1
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
1136
1
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
1137
1
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
1138
1
  };
1139
1
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1140
1
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
1141
1
  ccv_nnc_tensor_free(g);
1142
1
  ccv_nnc_tensor_free(a);
1143
1
  ccv_nnc_tensor_free(b);
1144
1
  ccv_nnc_tensor_free(h);
1145
1
  ccv_nnc_tensor_free(db);
1146
1
  ccv_nnc_tensor_free(dbias);
1147
1
  ccv_nnc_tensor_free(gg);
1148
1
  ccv_nnc_tensor_free(ga);
1149
1
  ccv_nnc_tensor_free(gb);
1150
1
  ccv_nnc_tensor_free(gh);
1151
1
  ccv_nnc_tensor_free(gdb);
1152
1
  ccv_nnc_tensor_free(gdbias);
1153
1
}
1154
1155
TEST_CASE("cublas forward gemm")
1156
1
{
1157
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1158
1
  dsfmt_t dsfmt;
1159
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1160
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1161
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1162
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1163
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1164
1165
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1166
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1167
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1168
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1169
1
  int i;
1170
8.19k
  for (i = 0; i < 64 * 128; i++)
1171
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1172
65
  for (i = 0; i < 64; i++)
1173
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1174
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1175
1.28k
  for (i = 0; i < 10 * 128; i++)
1176
1.28k
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1177
129
  for (i = 0; i < 128; i++)
1178
128
    ha->data.f32[i] = ha1->data.f32[i];
1179
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(a, w, bias), 0);
1180
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1181
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1182
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1183
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1184
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1185
65
  for (i = 0; i < 64; i++)
1186
64
    tb1->data.f32[i] = tb->data.f32[i];
1187
1
  REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
1188
1
  ccv_nnc_tensor_free(a);
1189
1
  ccv_nnc_tensor_free(w);
1190
1
  ccv_nnc_tensor_free(bias);
1191
1
  ccv_nnc_tensor_free(tb);
1192
1
  ccv_nnc_tensor_free(b);
1193
1
  ccv_nnc_tensor_free(ha);
1194
1
  ccv_nnc_tensor_free(ha1);
1195
1
  ccv_nnc_tensor_free(tb1);
1196
1
  ccv_nnc_tensor_free(hw);
1197
1
  ccv_nnc_tensor_free(hbias);
1198
1
  ccv_nnc_tensor_free(hb);
1199
1
}
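// Illustrative sketch, not part of the covered source: the "cublas forward gemm" test
// above drives CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), i.e. the
// fully-connected-layer form b = a * w^T + bias with a of shape (batch, 128),
// w of shape (64, 128) and bias of shape (64). The plain-C reference below
// (hypothetical name, row-major float buffers assumed) spells out the arithmetic
// that both the CPU and the cuBLAS backend are expected to produce.
static void gemm_nt_reference(const float* a, const float* w, const float* bias, float* b, int m, int n, int k)
{
  int i, j, x;
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
    {
      float sum = bias ? bias[j] : 0; // bias is broadcast along the batch dimension
      for (x = 0; x < k; x++)
        sum += a[i * k + x] * w[j * k + x]; // TRANSPOSE(0, 1) on w: read it as w^T
      b[i * n + j] = sum;
    }
}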
1200
1201
TEST_CASE("cublas forward gemm in half precision")
1202
1
{
1203
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1204
1
  dsfmt_t dsfmt;
1205
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1206
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1207
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1208
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
1209
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1210
1211
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1212
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1213
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1214
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1215
1
  int i;
1216
8.19k
  for (i = 0; i < 64 * 128; i++)
1217
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1218
65
  for (i = 0; i < 64; i++)
1219
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1220
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1221
1.28k
  for (i = 0; i < 10 * 128; i++)
1222
1.28k
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1223
129
  for (i = 0; i < 128; i++)
1224
128
    ha->data.f32[i] = ha1->data.f32[i];
1225
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1226
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1227
1
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
1228
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
1229
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
1230
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1231
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1232
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1233
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1234
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1235
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1236
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1237
1
  ccv_nnc_tensor_free(a);
1238
1
  ccv_nnc_tensor_free(w);
1239
1
  ccv_nnc_tensor_free(bias);
1240
1
  ccv_nnc_tensor_free(b);
1241
1
  ccv_nnc_tensor_free(tb);
1242
1
  ccv_nnc_tensor_free(ha);
1243
1
  ccv_nnc_tensor_free(ha1);
1244
1
  ccv_nnc_tensor_free(tb1);
1245
1
  ccv_nnc_tensor_free(hw);
1246
1
  ccv_nnc_tensor_free(hbias);
1247
1
  ccv_nnc_tensor_free(hb);
1248
1
  ccv_nnc_tensor_free(ha2);
1249
1
  ccv_nnc_tensor_free(hw2);
1250
1
  ccv_nnc_tensor_free(hbias2);
1251
1
}
1252
1253
TEST_CASE("cublas forward gemv in half precision, variant 1")
1254
1
{
1255
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1256
1
  dsfmt_t dsfmt;
1257
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1258
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
1259
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1260
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
1261
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
1262
1263
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1264
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1265
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1266
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1267
1
  int i;
1268
8.19k
  for (i = 0; i < 64 * 128; i++)
1269
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1270
65
  for (i = 0; i < 64; i++)
1271
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1272
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1273
129
  for (i = 0; i < 128; i++)
1274
128
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1275
129
  for (i = 0; i < 128; i++)
1276
128
    ha->data.f32[i] = ha1->data.f32[i];
1277
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
1278
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1279
1
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
1280
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
1281
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
1282
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1283
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1284
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
1285
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1286
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1287
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1288
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1289
1
  ccv_nnc_tensor_free(a);
1290
1
  ccv_nnc_tensor_free(w);
1291
1
  ccv_nnc_tensor_free(bias);
1292
1
  ccv_nnc_tensor_free(b);
1293
1
  ccv_nnc_tensor_free(tb);
1294
1
  ccv_nnc_tensor_free(ha);
1295
1
  ccv_nnc_tensor_free(ha1);
1296
1
  ccv_nnc_tensor_free(tb1);
1297
1
  ccv_nnc_tensor_free(hw);
1298
1
  ccv_nnc_tensor_free(hbias);
1299
1
  ccv_nnc_tensor_free(hb);
1300
1
  ccv_nnc_tensor_free(ha2);
1301
1
  ccv_nnc_tensor_free(hw2);
1302
1
  ccv_nnc_tensor_free(hbias2);
1303
1
}
1304
1305
TEST_CASE("cublas forward gemm no bias")
1306
1
{
1307
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1308
1
  dsfmt_t dsfmt;
1309
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1310
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1311
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1312
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1313
1314
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1315
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1316
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1317
1
  int i;
1318
8.19k
  for (i = 0; i < 64 * 128; i++)
1319
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1320
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1321
1.28k
  for (i = 0; i < 10 * 128; i++)
1322
1.28k
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1323
129
  for (i = 0; i < 128; i++)
1324
128
    ha->data.f32[i] = ha1->data.f32[i];
1325
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(a, w), 0);
1326
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
1327
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1328
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1329
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1330
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1331
65
  for (i = 0; i < 64; i++)
1332
64
    tb1->data.f32[i] = tb->data.f32[i];
1333
1
  REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
1334
1
  ccv_nnc_tensor_free(a);
1335
1
  ccv_nnc_tensor_free(w);
1336
1
  ccv_nnc_tensor_free(b);
1337
1
  ccv_nnc_tensor_free(tb);
1338
1
  ccv_nnc_tensor_free(ha);
1339
1
  ccv_nnc_tensor_free(ha1);
1340
1
  ccv_nnc_tensor_free(tb1);
1341
1
  ccv_nnc_tensor_free(hw);
1342
1
  ccv_nnc_tensor_free(hb);
1343
1
}
1344
1345
TEST_CASE("cublas forward gemm no bias in half precision")
1346
1
{
1347
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1348
1
  dsfmt_t dsfmt;
1349
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1350
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1351
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1352
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1353
1354
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1355
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1356
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1357
1
  int i;
1358
8.19k
  for (i = 0; i < 64 * 128; i++)
1359
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1360
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1361
1.28k
  for (i = 0; i < 10 * 128; i++)
1362
1.28k
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1363
129
  for (i = 0; i < 128; i++)
1364
128
    ha->data.f32[i] = ha1->data.f32[i];
1365
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1366
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1367
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
1368
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
1369
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
1370
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1371
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1372
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1373
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1374
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1375
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1376
1
  ccv_nnc_tensor_free(a);
1377
1
  ccv_nnc_tensor_free(w);
1378
1
  ccv_nnc_tensor_free(b);
1379
1
  ccv_nnc_tensor_free(tb);
1380
1
  ccv_nnc_tensor_free(ha);
1381
1
  ccv_nnc_tensor_free(ha1);
1382
1
  ccv_nnc_tensor_free(tb1);
1383
1
  ccv_nnc_tensor_free(hw);
1384
1
  ccv_nnc_tensor_free(hb);
1385
1
  ccv_nnc_tensor_free(ha2);
1386
1
  ccv_nnc_tensor_free(hw2);
1387
1
}
1388
1389
TEST_CASE("cublas forward gemv in half precision no bias, variant 1")
1390
1
{
1391
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1392
1
  dsfmt_t dsfmt;
1393
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1394
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
1395
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1396
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
1397
1398
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1399
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1400
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1401
1
  int i;
1402
8.19k
  for (i = 0; i < 64 * 128; i++)
1403
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1404
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1405
129
  for (i = 0; i < 128; i++)
1406
128
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1407
129
  for (i = 0; i < 128; i++)
1408
128
    ha->data.f32[i] = ha1->data.f32[i];
1409
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
1410
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1411
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
1412
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
1413
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
1414
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1415
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
1416
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1417
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1418
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1419
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1420
1
  ccv_nnc_tensor_free(a);
1421
1
  ccv_nnc_tensor_free(w);
1422
1
  ccv_nnc_tensor_free(b);
1423
1
  ccv_nnc_tensor_free(tb);
1424
1
  ccv_nnc_tensor_free(ha);
1425
1
  ccv_nnc_tensor_free(ha1);
1426
1
  ccv_nnc_tensor_free(tb1);
1427
1
  ccv_nnc_tensor_free(hw);
1428
1
  ccv_nnc_tensor_free(hb);
1429
1
  ccv_nnc_tensor_free(ha2);
1430
1
  ccv_nnc_tensor_free(hw2);
1431
1
}
1432
1433
TEST_CASE("cublas forward gemv in half precision no bias, variant 2")
1434
1
{
1435
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1436
1
  dsfmt_t dsfmt;
1437
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1438
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1439
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 128, 1), 0);
1440
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 1), 0);
1441
1442
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1443
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
1444
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
1445
1
  int i;
1446
8.19k
  for (i = 0; i < 64 * 128; i++)
1447
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1448
1
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
1449
129
  for (i = 0; i < 128; i++)
1450
128
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1451
129
  for (i = 0; i < 128; i++)
1452
128
    ha->data.f32[i] = ha1->data.f32[i];
1453
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1454
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128, 1), 0);
1455
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
1456
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
1457
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(hw, ha), TENSOR_LIST(hb), 0);
1458
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(w, a), TENSOR_LIST(b), 0);
1459
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 1), 0);
1460
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1461
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
1462
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1463
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1464
1
  ccv_nnc_tensor_free(a);
1465
1
  ccv_nnc_tensor_free(w);
1466
1
  ccv_nnc_tensor_free(b);
1467
1
  ccv_nnc_tensor_free(tb);
1468
1
  ccv_nnc_tensor_free(ha);
1469
1
  ccv_nnc_tensor_free(ha1);
1470
1
  ccv_nnc_tensor_free(tb1);
1471
1
  ccv_nnc_tensor_free(hw);
1472
1
  ccv_nnc_tensor_free(hb);
1473
1
  ccv_nnc_tensor_free(ha2);
1474
1
  ccv_nnc_tensor_free(hw2);
1475
1
}
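// Illustrative sketch, not part of the covered source: in "variant 2" above the
// (64, 128) weight matrix is passed as the first operand and the (128, 1) column
// vector as the second, with no transposes, so the command reduces to a
// matrix-vector product b = w * a. A plain-C rendering (hypothetical name):
static void gemv_reference(const float* w, const float* a, float* b)
{
  int j, k;
  for (j = 0; j < 64; j++)
  {
    float sum = 0;
    for (k = 0; k < 128; k++)
      sum += w[j * 128 + k] * a[k]; // a is a 128x1 column vector
    b[j] = sum; // b is a 64x1 column vector
  }
}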
1476
1477
TEST_CASE("cublas backward gemm")
1478
1
{
1479
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1480
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1481
1
  dsfmt_t dsfmt;
1482
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1483
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1484
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1485
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1486
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1487
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1488
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1489
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1490
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1491
1492
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1493
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1494
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1495
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1496
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1497
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1498
1
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1499
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1500
1
  int i;
1501
8.19k
  for (i = 0; i < 64 * 128; i++)
1502
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1503
65
  for (i = 0; i < 64; i++)
1504
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1505
1.28k
  for (i = 0; i < 10 * 128; i++)
1506
1.28k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1507
641
  for (i = 0; i < 10 * 64; i++)
1508
640
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1509
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
1510
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1511
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
1512
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1513
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
1514
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1515
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1516
1
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1517
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1518
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
1519
1
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
1520
1
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
1521
1
  REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
1522
1
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
1523
1
  ccv_nnc_tensor_free(a);
1524
1
  ccv_nnc_tensor_free(w);
1525
1
  ccv_nnc_tensor_free(bias);
1526
1
  ccv_nnc_tensor_free(b);
1527
1
  ccv_nnc_tensor_free(g);
1528
1
  ccv_nnc_tensor_free(dw);
1529
1
  ccv_nnc_tensor_free(dbias);
1530
1
  ccv_nnc_tensor_free(h);
1531
1
  ccv_nnc_tensor_free(ha);
1532
1
  ccv_nnc_tensor_free(hw);
1533
1
  ccv_nnc_tensor_free(hbias);
1534
1
  ccv_nnc_tensor_free(hb);
1535
1
  ccv_nnc_tensor_free(hg);
1536
1
  ccv_nnc_tensor_free(hdw);
1537
1
  ccv_nnc_tensor_free(hdbias);
1538
1
  ccv_nnc_tensor_free(hh);
1539
1
  ccv_nnc_tensor_free(tb);
1540
1
  ccv_nnc_tensor_free(th);
1541
1
  ccv_nnc_tensor_free(tdw);
1542
1
  ccv_nnc_tensor_free(tdbias);
1543
1
}
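// Illustrative sketch, not part of the covered source: for the forward form
// b = a * w^T + bias exercised in "cublas backward gemm" above (a: 10x128,
// w: 64x128, g: 10x64), the backward command is expected to return h = g * w,
// dw = g^T * a and dbias[j] = the sum of g[i][j] over the batch. A plain-C
// rendering of those formulas (hypothetical name, row-major float buffers assumed):
static void gemm_nt_backward_reference(const float* g, const float* a, const float* w, float* h, float* dw, float* dbias, int m, int n, int k)
{
  int i, j, x;
  for (i = 0; i < m; i++)
    for (x = 0; x < k; x++)
    {
      float sum = 0;
      for (j = 0; j < n; j++)
        sum += g[i * n + j] * w[j * k + x]; // h = g * w (gradient w.r.t. the input a)
      h[i * k + x] = sum;
    }
  for (j = 0; j < n; j++)
    for (x = 0; x < k; x++)
    {
      float sum = 0;
      for (i = 0; i < m; i++)
        sum += g[i * n + j] * a[i * k + x]; // dw = g^T * a
      dw[j * k + x] = sum;
    }
  for (j = 0; j < n; j++)
  {
    float sum = 0;
    for (i = 0; i < m; i++)
      sum += g[i * n + j]; // dbias is the column sum of g
    dbias[j] = sum;
  }
}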
1544
1545
TEST_CASE("cublas backward gemm in half precision")
1546
1
{
1547
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1548
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1549
1
  dsfmt_t dsfmt;
1550
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1551
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1552
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1553
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
1554
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1555
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1556
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1557
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
1558
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1559
1560
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1561
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1562
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1563
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1564
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1565
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1566
1
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1567
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1568
1
  int i;
1569
8.19k
  for (i = 0; i < 64 * 128; i++)
1570
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1571
65
  for (i = 0; i < 64; i++)
1572
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1573
1.28k
  for (i = 0; i < 10 * 128; i++)
1574
1.28k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1575
641
  for (i = 0; i < 10 * 64; i++)
1576
640
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1577
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1578
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1579
1
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
1580
1
  ccv_nnc_tensor_t* hg2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1581
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(ha2, hw2, hbias2, hg2), 0);
1582
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2, hg2), TENSOR_LIST(a, w, bias, g), 0);
1583
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1584
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
1585
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1586
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
1587
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1588
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1589
1
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
1590
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1591
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
1592
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1593
1
  ccv_nnc_tensor_t* tdw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1594
1
  ccv_nnc_tensor_t* tdbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1595
1
  ccv_nnc_tensor_t* th1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1596
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb, tdw, tdbias, th), TENSOR_LIST(tb1, tdw1, tdbias1, th1), 0);
1597
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 10 * 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1598
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw1->data.f32, hdw->data.f32, 64 * 128, 1e-2, "GPU computed output should be the same as CPU computed ones");
1599
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias1->data.f32, hdbias->data.f32, 64, 1e-2, "GPU computed output should be the same as CPU computed ones");
1600
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th1->data.f32, hh->data.f32, 10 * 128, 1e-3, "GPU computed output should be the same as CPU computed ones");
1601
1
  ccv_nnc_tensor_free(a);
1602
1
  ccv_nnc_tensor_free(w);
1603
1
  ccv_nnc_tensor_free(bias);
1604
1
  ccv_nnc_tensor_free(b);
1605
1
  ccv_nnc_tensor_free(g);
1606
1
  ccv_nnc_tensor_free(dw);
1607
1
  ccv_nnc_tensor_free(dbias);
1608
1
  ccv_nnc_tensor_free(h);
1609
1
  ccv_nnc_tensor_free(ha);
1610
1
  ccv_nnc_tensor_free(hw);
1611
1
  ccv_nnc_tensor_free(hbias);
1612
1
  ccv_nnc_tensor_free(hb);
1613
1
  ccv_nnc_tensor_free(hg);
1614
1
  ccv_nnc_tensor_free(hdw);
1615
1
  ccv_nnc_tensor_free(hdbias);
1616
1
  ccv_nnc_tensor_free(hh);
1617
1
  ccv_nnc_tensor_free(tb);
1618
1
  ccv_nnc_tensor_free(th);
1619
1
  ccv_nnc_tensor_free(tdw);
1620
1
  ccv_nnc_tensor_free(tdbias);
1621
1
  ccv_nnc_tensor_free(ha2);
1622
1
  ccv_nnc_tensor_free(hw2);
1623
1
  ccv_nnc_tensor_free(hbias2);
1624
1
  ccv_nnc_tensor_free(hg2);
1625
1
  ccv_nnc_tensor_free(tb1);
1626
1
  ccv_nnc_tensor_free(tdw1);
1627
1
  ccv_nnc_tensor_free(tdbias1);
1628
1
  ccv_nnc_tensor_free(th1);
1629
1
}
1630
1631
TEST_CASE("cublas backward gemm no bias")
1632
1
{
1633
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1634
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1635
1
  dsfmt_t dsfmt;
1636
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1637
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1638
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1639
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1640
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1641
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1642
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1643
1644
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1645
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1646
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1647
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1648
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1649
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1650
1
  int i;
1651
8.19k
  for (i = 0; i < 64 * 128; i++)
1652
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1653
1.28k
  for (i = 0; i < 10 * 128; i++)
1654
1.28k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1655
641
  for (i = 0; i < 10 * 64; i++)
1656
640
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1657
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);
1658
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
1659
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
1660
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1661
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
1662
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1663
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1664
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1665
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
1666
1
  REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
1667
1
  REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
1668
1
  REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
1669
1
  ccv_nnc_tensor_free(a);
1670
1
  ccv_nnc_tensor_free(w);
1671
1
  ccv_nnc_tensor_free(b);
1672
1
  ccv_nnc_tensor_free(g);
1673
1
  ccv_nnc_tensor_free(dw);
1674
1
  ccv_nnc_tensor_free(h);
1675
1
  ccv_nnc_tensor_free(ha);
1676
1
  ccv_nnc_tensor_free(hw);
1677
1
  ccv_nnc_tensor_free(hb);
1678
1
  ccv_nnc_tensor_free(hg);
1679
1
  ccv_nnc_tensor_free(hdw);
1680
1
  ccv_nnc_tensor_free(hh);
1681
1
  ccv_nnc_tensor_free(tb);
1682
1
  ccv_nnc_tensor_free(th);
1683
1
  ccv_nnc_tensor_free(tdw);
1684
1
}
1685
1686
TEST_CASE("cublas backward gemm no bias in half precision")
1687
1
{
1688
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
1689
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1690
1
  dsfmt_t dsfmt;
1691
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1692
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1693
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1694
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1695
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1696
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1697
1
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1698
1699
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1700
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1701
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1702
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1703
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1704
1
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1705
1
  int i;
1706
8.19k
  for (i = 0; i < 64 * 128; i++)
1707
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1708
1.28k
  for (i = 0; i < 10 * 128; i++)
1709
1.28k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1710
641
  for (i = 0; i < 10 * 64; i++)
1711
640
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1712
1
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1713
1
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1714
1
  ccv_nnc_tensor_t* hg2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1715
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(ha2, hw2, hg2), 0);
1716
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hg2), TENSOR_LIST(a, w, g), 0);
1717
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
1718
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
1719
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1720
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
1721
1
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1722
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1723
1
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1724
1
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1725
1
  ccv_nnc_tensor_t* tdw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1726
1
  ccv_nnc_tensor_t* th1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1727
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
1728
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb, tdw, th), TENSOR_LIST(tb1, tdw1, th1), 0);
1729
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 10 * 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1730
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw1->data.f32, hdw->data.f32, 64 * 128, 1e-2, "GPU computed output should be the same as CPU computed ones");
1731
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th1->data.f32, hh->data.f32, 10 * 128, 1e-3, "GPU computed output should be the same as CPU computed ones");
1732
1
  ccv_nnc_tensor_free(a);
1733
1
  ccv_nnc_tensor_free(w);
1734
1
  ccv_nnc_tensor_free(b);
1735
1
  ccv_nnc_tensor_free(g);
1736
1
  ccv_nnc_tensor_free(dw);
1737
1
  ccv_nnc_tensor_free(h);
1738
1
  ccv_nnc_tensor_free(ha);
1739
1
  ccv_nnc_tensor_free(hw);
1740
1
  ccv_nnc_tensor_free(hb);
1741
1
  ccv_nnc_tensor_free(hg);
1742
1
  ccv_nnc_tensor_free(hdw);
1743
1
  ccv_nnc_tensor_free(hh);
1744
1
  ccv_nnc_tensor_free(tb);
1745
1
  ccv_nnc_tensor_free(th);
1746
1
  ccv_nnc_tensor_free(tdw);
1747
1
  ccv_nnc_tensor_free(ha2);
1748
1
  ccv_nnc_tensor_free(hw2);
1749
1
  ccv_nnc_tensor_free(hg2);
1750
1
  ccv_nnc_tensor_free(tb1);
1751
1
  ccv_nnc_tensor_free(tdw1);
1752
1
  ccv_nnc_tensor_free(th1);
1753
1
}
1754
1755
TEST_CASE("cublas handle permute")
1756
1
{
1757
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1758
1
  dsfmt_t dsfmt;
1759
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1760
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 2, 128), 0);
1761
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 2, 128), 0);
1762
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 2, 128), 0);
1763
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 2, 128), 0);
1764
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
1765
1766
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), 0);
1767
1
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), 0);
1768
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
1769
1
  int i;
1770
16.3k
  for (i = 0; i < 2 * 64 * 128; i++)
1771
16.3k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1772
2.56k
  for (i = 0; i < 2 * 10 * 128; i++)
1773
2.56k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1774
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
1775
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(at), 0);
1776
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wt), 0);
1777
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
1778
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
1779
1
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
1780
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
1781
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
1782
1
  ccv_nnc_tensor_t* hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
1783
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, bt), TENSOR_LIST(hb, hbt), 0);
1784
1
  REQUIRE_TENSOR_EQ(hb, hbt, "permute computed output should be the same as non-permute computed ones");
1785
1
  ccv_nnc_tensor_free(ha);
1786
1
  ccv_nnc_tensor_free(hw);
1787
1
  ccv_nnc_tensor_free(a);
1788
1
  ccv_nnc_tensor_free(w);
1789
1
  ccv_nnc_tensor_free(b);
1790
1
  ccv_nnc_tensor_view_free(av);
1791
1
  ccv_nnc_tensor_view_free(wv);
1792
1
  ccv_nnc_tensor_free(at);
1793
1
  ccv_nnc_tensor_free(wt);
1794
1
  ccv_nnc_tensor_free(bt);
1795
1
  ccv_nnc_tensor_free(hb);
1796
1
  ccv_nnc_tensor_free(hbt);
1797
1
}
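// Illustrative sketch, not part of the covered source: the tensor views in
// "cublas handle permute" above reinterpret the dense (10, 2, 128) and (64, 2, 128)
// tensors as (2, 10, 128) and (2, 64, 128) purely through the strides
// DIM_ALLOC(128, 2 * 128, 1); no data is moved. Written out for the activation
// tensor, the index mapping that makes this a transpose of the first two axes is:
static float view_at(const float* data, int c, int i, int k)
{
  // view shape (2, 10, 128) with strides (128, 2 * 128, 1)
  return data[c * 128 + i * (2 * 128) + k];
}
static float dense_at(const float* data, int i, int c, int k)
{
  // dense shape (10, 2, 128) with row-major strides (2 * 128, 128, 1)
  return data[i * (2 * 128) + c * 128 + k];
}
// view_at(data, c, i, k) == dense_at(data, i, c, k) for every valid (c, i, k), which
// is why the strided view feeds the GEMM the same values as the explicit transpose at.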
1798
1799
TEST_CASE("generalized batched gemm with batch (2, 4) compare cublas")
1800
1
{
1801
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1802
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension (see the sketch after this test case).
1803
1
  dsfmt_t dsfmt;
1804
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1805
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1806
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1807
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1808
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1809
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1810
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1811
1812
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1813
1
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1814
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1815
1
  int i;
1816
65.5k
  for (i = 0; i < 8 * 64 * 128; i++)
1817
65.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1818
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
1819
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1820
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1821
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
1822
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
1823
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1824
1
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
1825
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
1826
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
1827
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1828
1
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
1829
1
  ccv_nnc_tensor_free(ha);
1830
1
  ccv_nnc_tensor_free(hw);
1831
1
  ccv_nnc_tensor_free(hb);
1832
1
  ccv_nnc_tensor_free(a);
1833
1
  ccv_nnc_tensor_free(w);
1834
1
  ccv_nnc_tensor_free(b);
1835
1
  ccv_nnc_tensor_view_free(av);
1836
1
  ccv_nnc_tensor_view_free(wv);
1837
1
  ccv_nnc_tensor_free(at);
1838
1
  ccv_nnc_tensor_free(wt);
1839
1
  ccv_nnc_tensor_free(bt);
1840
1
}
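// Illustrative sketch, not part of the covered source: as the in-test comment says,
// every dimension except the last two acts as a batch dimension, so the
// (2, 4, 10, 128) activation view and the (2, 4, 64, 128) weight view above run
// 2 * 4 = 8 independent 10x128 by 128x64 products. Using the dense layout of the
// transposed copies at/wt, the expected arithmetic is (hypothetical name):
static void batched_gemm_nt_reference(const float* at, const float* wt, float* bt)
{
  int n, c, i, j, k;
  for (n = 0; n < 2; n++)
    for (c = 0; c < 4; c++) // each (n, c) pair is one independent GEMM
      for (i = 0; i < 10; i++)
        for (j = 0; j < 64; j++)
        {
          float sum = 0;
          for (k = 0; k < 128; k++)
            sum += at[((n * 4 + c) * 10 + i) * 128 + k] * wt[((n * 4 + c) * 64 + j) * 128 + k];
          bt[((n * 4 + c) * 10 + i) * 64 + j] = sum;
        }
}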
1841
1842
TEST_CASE("generalized batched gemm with batch (2, 4) and broadcast compare cublas")
1843
1
{
1844
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1845
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
1846
1
  dsfmt_t dsfmt;
1847
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1848
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1849
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1850
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1851
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1852
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1853
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1854
1855
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1856
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1857
1
  int i;
1858
8.19k
  for (i = 0; i < 64 * 128; i++)
1859
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1860
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
1861
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1862
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1863
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
1864
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1865
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w), TENSOR_LIST(b), 0);
1866
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw), TENSOR_LIST(bt), 0);
1867
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1868
1
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
1869
1
  ccv_nnc_tensor_free(ha);
1870
1
  ccv_nnc_tensor_free(hw);
1871
1
  ccv_nnc_tensor_free(hb);
1872
1
  ccv_nnc_tensor_free(a);
1873
1
  ccv_nnc_tensor_free(w);
1874
1
  ccv_nnc_tensor_free(b);
1875
1
  ccv_nnc_tensor_view_free(av);
1876
1
  ccv_nnc_tensor_free(at);
1877
1
  ccv_nnc_tensor_free(bt);
1878
1
}
1879
1880
TEST_CASE("generalized batched gemm with batch (2, 4) with bias compare cublas")
1881
1
{
1882
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1883
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
1884
1
  dsfmt_t dsfmt;
1885
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1886
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1887
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1888
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1889
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1890
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1891
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1892
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1893
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1894
1895
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1896
1
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1897
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1898
1
  int i;
1899
65.5k
  for (i = 0; i < 8 * 64 * 128; i++)
1900
65.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1901
65
  for (i = 0; i < 64; i++)
1902
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
1903
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
1904
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1905
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1906
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
1907
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
1908
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1909
1
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
1910
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv, bias), TENSOR_LIST(b), 0);
1911
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt, hbias), TENSOR_LIST(bt), 0);
1912
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1913
1
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
1914
1
  ccv_nnc_tensor_free(ha);
1915
1
  ccv_nnc_tensor_free(hw);
1916
1
  ccv_nnc_tensor_free(hbias);
1917
1
  ccv_nnc_tensor_free(hb);
1918
1
  ccv_nnc_tensor_free(a);
1919
1
  ccv_nnc_tensor_free(w);
1920
1
  ccv_nnc_tensor_free(bias);
1921
1
  ccv_nnc_tensor_free(b);
1922
1
  ccv_nnc_tensor_view_free(av);
1923
1
  ccv_nnc_tensor_view_free(wv);
1924
1
  ccv_nnc_tensor_free(at);
1925
1
  ccv_nnc_tensor_free(wt);
1926
1
  ccv_nnc_tensor_free(bt);
1927
1
}
1928
1929
TEST_CASE("generalized batched gemm with batch (2, 4) with bias and broadcast compare cublas")
1930
1
{
1931
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1932
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
1933
1
  dsfmt_t dsfmt;
1934
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1935
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1936
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1937
1
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1938
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1939
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1940
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1941
1
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1942
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1943
1944
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1945
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1946
1
  int i;
1947
8.19k
  for (i = 0; i < 64 * 128; i++)
1948
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1949
65
  for (i = 0; i < 64; i++)
1950
64
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
1951
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
1952
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1953
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
1954
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
1955
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
1956
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w, bias), TENSOR_LIST(b), 0);
1957
1
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw, hbias), TENSOR_LIST(bt), 0);
1958
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1959
1
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
1960
1
  ccv_nnc_tensor_free(ha);
1961
1
  ccv_nnc_tensor_free(hw);
1962
1
  ccv_nnc_tensor_free(hbias);
1963
1
  ccv_nnc_tensor_free(hb);
1964
1
  ccv_nnc_tensor_free(a);
1965
1
  ccv_nnc_tensor_free(w);
1966
1
  ccv_nnc_tensor_free(bias);
1967
1
  ccv_nnc_tensor_free(b);
1968
1
  ccv_nnc_tensor_view_free(av);
1969
1
  ccv_nnc_tensor_free(at);
1970
1
  ccv_nnc_tensor_free(bt);
1971
1
}
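A note on the idiom the batched tests above rely on: instead of materializing a transposed copy, they describe the permuted layout with a stride-only view, so the GEMM reads the original buffer in permuted order. The sketch below is an illustrative fragment, not an additional test; the shapes, the device id 000 and every call in it are taken from the tests above. It shows how a (2, 10, 4, 128) tensor is presented as (2, 4, 10, 128) simply by listing the original memory strides in the permuted order.

  // Illustrative sketch only, reusing calls that appear in the tests above.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  // The buffer's row-major strides are (10 * 4 * 128, 4 * 128, 128, 1). Listing them as
  // (10 * 4 * 128, 128, 4 * 128, 1) under the shape (2, 4, 10, 128) swaps the two middle
  // axes without moving any data: view element (n, h, s, d) aliases a[n, s, h, d].
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // (ccv_nnc_tensor_t*)av can then be fed to CMD_GEMM_FORWARD exactly as the tests do.
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(a);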
1972
1973
TEST_CASE("generalized batched backward gemm with batch (2, 4) compare cublas")
1974
1
{
1975
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
1976
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
1977
1
  dsfmt_t dsfmt;
1978
1
  dsfmt_init_gen_rand(&dsfmt, 0);
1979
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1980
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1981
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1982
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1983
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
1984
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1985
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1986
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
1987
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
1988
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
1989
1990
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1991
1
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1992
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
1993
1
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
1994
1
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
1995
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
1996
1
  int i;
1997
65.5k
  for (i = 0; i < 8 * 64 * 128; i++)
1998
65.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1999
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
2000
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2001
5.12k
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
2002
5.12k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2003
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2004
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
2005
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
2006
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2007
1
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2008
1
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2009
1
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2010
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv), 0);
2011
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt), 0);
2012
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
2013
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
2014
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
2015
1
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
2016
1
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
2017
1
  ccv_nnc_tensor_free(ha);
2018
1
  ccv_nnc_tensor_free(hw);
2019
1
  ccv_nnc_tensor_free(hda);
2020
1
  ccv_nnc_tensor_free(hdw);
2021
1
  ccv_nnc_tensor_free(hb);
2022
1
  ccv_nnc_tensor_free(a);
2023
1
  ccv_nnc_tensor_free(w);
2024
1
  ccv_nnc_tensor_free(da);
2025
1
  ccv_nnc_tensor_free(dw);
2026
1
  ccv_nnc_tensor_free(b);
2027
1
  ccv_nnc_tensor_view_free(av);
2028
1
  ccv_nnc_tensor_view_free(wv);
2029
1
  ccv_nnc_tensor_view_free(dav);
2030
1
  ccv_nnc_tensor_view_free(dwv);
2031
1
  ccv_nnc_tensor_free(at);
2032
1
  ccv_nnc_tensor_free(wt);
2033
1
  ccv_nnc_tensor_free(dat);
2034
1
  ccv_nnc_tensor_free(tda);
2035
1
  ccv_nnc_tensor_free(dwt);
2036
1
  ccv_nnc_tensor_free(tdw);
2037
1
}
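For orientation, the backward tests above follow the input/output convention visible in the calls themselves: CMD_GEMM_BACKWARD takes the output gradient plus the forward inputs, and writes the input gradients (plus dbias where requested), with the same transpose flags as the forward command describing how w was laid out. Because the gradients are written through the same stride views (dav, dwv), the permuted results land directly in the original (2, 10, 4, 128) and (2, 64, 4, 128) buffers. This reading is inferred from the calls above rather than from separate documentation.

  // Schematic of the convention, as used above:
  //   CMD_GEMM_BACKWARD(...transposes...), inputs  TENSOR_LIST(g, a, w),
  //                                        outputs TENSOR_LIST(da, dw /*, dbias */)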
2038
2039
TEST_CASE("generalized batched backward gemm with batch (2, 4) and broadcast compare cublas")
2040
1
{
2041
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
2042
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2043
1
  dsfmt_t dsfmt;
2044
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2045
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2046
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2047
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2048
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2049
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2050
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2051
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2052
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2053
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2054
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2055
2056
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2057
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2058
1
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2059
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2060
1
  int i;
2061
8.19k
  for (i = 0; i < 64 * 128; i++)
2062
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2063
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
2064
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2065
5.12k
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
2066
5.12k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2067
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2068
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
2069
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2070
1
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2071
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw), 0);
2072
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw), TENSOR_LIST(dat, tdw), 0);
2073
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
2074
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
2075
1
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
2076
1
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
2077
1
  ccv_nnc_tensor_free(ha);
2078
1
  ccv_nnc_tensor_free(hw);
2079
1
  ccv_nnc_tensor_free(hda);
2080
1
  ccv_nnc_tensor_free(hdw);
2081
1
  ccv_nnc_tensor_free(hb);
2082
1
  ccv_nnc_tensor_free(a);
2083
1
  ccv_nnc_tensor_free(w);
2084
1
  ccv_nnc_tensor_free(da);
2085
1
  ccv_nnc_tensor_free(dw);
2086
1
  ccv_nnc_tensor_free(b);
2087
1
  ccv_nnc_tensor_view_free(av);
2088
1
  ccv_nnc_tensor_view_free(dav);
2089
1
  ccv_nnc_tensor_free(at);
2090
1
  ccv_nnc_tensor_free(dat);
2091
1
  ccv_nnc_tensor_free(tda);
2092
1
  ccv_nnc_tensor_free(tdw);
2093
1
}
2094
2095
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias compare cublas")
2096
1
{
2097
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
2098
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2099
1
  dsfmt_t dsfmt;
2100
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2101
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2102
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2103
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2104
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2105
1
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2106
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2107
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2108
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2109
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2110
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2111
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2112
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2113
2114
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2115
1
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2116
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2117
1
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2118
1
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2119
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2120
1
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2121
1
  int i;
2122
65.5k
  for (i = 0; i < 8 * 64 * 128; i++)
2123
65.5k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2124
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
2125
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2126
5.12k
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
2127
5.12k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2128
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2129
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
2130
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
2131
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2132
1
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2133
1
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2134
1
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2135
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv, dbias), 0);
2136
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt, tdbias), 0);
2137
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
2138
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
2139
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
2140
1
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
2141
1
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
2142
1
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
2143
1
  ccv_nnc_tensor_free(ha);
2144
1
  ccv_nnc_tensor_free(hw);
2145
1
  ccv_nnc_tensor_free(hda);
2146
1
  ccv_nnc_tensor_free(hdw);
2147
1
  ccv_nnc_tensor_free(hdbias);
2148
1
  ccv_nnc_tensor_free(hb);
2149
1
  ccv_nnc_tensor_free(a);
2150
1
  ccv_nnc_tensor_free(w);
2151
1
  ccv_nnc_tensor_free(da);
2152
1
  ccv_nnc_tensor_free(dw);
2153
1
  ccv_nnc_tensor_free(dbias);
2154
1
  ccv_nnc_tensor_free(b);
2155
1
  ccv_nnc_tensor_view_free(av);
2156
1
  ccv_nnc_tensor_view_free(wv);
2157
1
  ccv_nnc_tensor_view_free(dav);
2158
1
  ccv_nnc_tensor_view_free(dwv);
2159
1
  ccv_nnc_tensor_free(at);
2160
1
  ccv_nnc_tensor_free(wt);
2161
1
  ccv_nnc_tensor_free(dat);
2162
1
  ccv_nnc_tensor_free(dwt);
2163
1
  ccv_nnc_tensor_free(tda);
2164
1
  ccv_nnc_tensor_free(tdw);
2165
1
  ccv_nnc_tensor_free(tdbias);
2166
1
}
2167
2168
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias and broadcast compare cublas")
2169
1
{
2170
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
2171
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2172
1
  dsfmt_t dsfmt;
2173
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2174
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2175
1
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2176
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2177
1
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2178
1
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2179
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2180
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2181
1
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2182
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2183
1
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2184
1
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2185
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2186
2187
1
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2188
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2189
1
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2190
1
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2191
1
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2192
1
  int i;
2193
8.19k
  for (i = 0; i < 64 * 128; i++)
2194
8.19k
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2195
10.2k
  for (i = 0; i < 8 * 10 * 128; i++)
2196
10.2k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2197
5.12k
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
2198
5.12k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2199
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2200
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
2201
1
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2202
1
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2203
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w, dbias), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw, dbias), 0);
2204
1
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw, hdbias), TENSOR_LIST(dat, tdw, tdbias), 0);
2205
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
2206
1
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
2207
1
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
2208
1
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
2209
1
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
2210
1
  ccv_nnc_tensor_free(ha);
2211
1
  ccv_nnc_tensor_free(hw);
2212
1
  ccv_nnc_tensor_free(hda);
2213
1
  ccv_nnc_tensor_free(hdw);
2214
1
  ccv_nnc_tensor_free(hdbias);
2215
1
  ccv_nnc_tensor_free(hb);
2216
1
  ccv_nnc_tensor_free(a);
2217
1
  ccv_nnc_tensor_free(w);
2218
1
  ccv_nnc_tensor_free(da);
2219
1
  ccv_nnc_tensor_free(dw);
2220
1
  ccv_nnc_tensor_free(dbias);
2221
1
  ccv_nnc_tensor_free(b);
2222
1
  ccv_nnc_tensor_view_free(av);
2223
1
  ccv_nnc_tensor_view_free(dav);
2224
1
  ccv_nnc_tensor_free(at);
2225
1
  ccv_nnc_tensor_free(dat);
2226
1
  ccv_nnc_tensor_free(tdw);
2227
1
  ccv_nnc_tensor_free(tdbias);
2228
1
}
2229
2230
TEST_CASE("ewdiv forward with reciprocal")
2231
1
{
2232
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2233
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2234
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2235
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2236
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2237
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2238
1
  dsfmt_t dsfmt;
2239
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2240
1
  int i;
2241
1.00k
  for (i = 0; i < 1000; i++)
2242
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2243
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2244
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a), TENSOR_LIST(b), 0);
2245
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ha), TENSOR_LIST(bt), 0);
2246
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2247
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2248
1
  ccv_nnc_tensor_free(a);
2249
1
  ccv_nnc_tensor_free(b);
2250
1
  ccv_nnc_tensor_free(ha);
2251
1
  ccv_nnc_tensor_free(hb);
2252
1
  ccv_nnc_tensor_free(bt);
2253
1
}
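One detail worth calling out from the test above: EWDIV accepts a null first operand, which turns the division into a reciprocal (b = 1 / a). A minimal CPU-side sketch of that call shape, using the same 10x100 float32 layout as the test (illustrative only):

  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // ... fill ha with non-zero values ...
  // A null numerator (the leading 0 in TENSOR_LIST) makes EWDIV compute hb[i] = 1 / ha[i].
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ha), TENSOR_LIST(hb), 0);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hb);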
2254
2255
TEST_CASE("ewdiv forward")
2256
1
{
2257
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2258
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2259
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2260
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2261
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2262
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2263
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2264
1
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2265
1
  dsfmt_t dsfmt;
2266
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2267
1
  int i;
2268
1.00k
  for (i = 0; i < 1000; i++)
2269
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2270
1.00k
  for (i = 0; i < 1000; i++)
2271
1.00k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2272
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
2273
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2274
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(ct), 0);
2275
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
2276
1
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");
2277
1
  ccv_nnc_tensor_free(a);
2278
1
  ccv_nnc_tensor_free(b);
2279
1
  ccv_nnc_tensor_free(c);
2280
1
  ccv_nnc_tensor_free(ha);
2281
1
  ccv_nnc_tensor_free(hb);
2282
1
  ccv_nnc_tensor_free(hc);
2283
1
  ccv_nnc_tensor_free(ct);
2284
1
}
2285
2286
TEST_CASE("ewdiv backward with output 1")
2287
1
{
2288
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2289
1
    ccv_nnc_cmd_ok(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2290
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2291
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2292
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2293
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2294
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2295
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2296
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2297
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2298
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2299
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2300
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2301
1
  dsfmt_t dsfmt;
2302
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2303
1
  int i;
2304
1.00k
  for (i = 0; i < 1000; i++)
2305
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2306
1.00k
  for (i = 0; i < 1000; i++)
2307
1.00k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2308
1.00k
  for (i = 0; i < 1000; i++)
2309
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2310
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2311
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2312
1
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
2313
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2314
1
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
2315
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2316
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2317
1
  ccv_nnc_tensor_free(a);
2318
1
  ccv_nnc_tensor_free(b);
2319
1
  ccv_nnc_tensor_free(c);
2320
1
  ccv_nnc_tensor_free(g);
2321
1
  ccv_nnc_tensor_free(da);
2322
1
  ccv_nnc_tensor_free(ha);
2323
1
  ccv_nnc_tensor_free(hb);
2324
1
  ccv_nnc_tensor_free(hc);
2325
1
  ccv_nnc_tensor_free(hg);
2326
1
  ccv_nnc_tensor_free(hda);
2327
1
  ccv_nnc_tensor_free(dat);
2328
1
}
2329
2330
TEST_CASE("ewdiv backward with output 2")
2331
1
{
2332
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2333
1
    ccv_nnc_cmd_ok(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2334
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2335
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2336
1
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2337
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2338
1
  ccv_nnc_tensor_t* db = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2339
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2340
1
  ccv_nnc_tensor_t* hdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2341
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2342
1
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2343
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2344
1
  ccv_nnc_tensor_t* dbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2345
1
  dsfmt_t dsfmt;
2346
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2347
1
  int i;
2348
1.00k
  for (i = 0; i < 1000; i++)
2349
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2350
1.00k
  for (i = 0; i < 1000; i++)
2351
1.00k
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
2352
1.00k
  for (i = 0; i < 1000; i++)
2353
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2354
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
2355
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
2356
1
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b, c), TENSOR_LIST(0, db), 0);
2357
1
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
2358
1
  ccv_nnc_cmd_exec(CMD_EWDIV_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb, hc), TENSOR_LIST(0, dbt), 0);
2359
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(db), TENSOR_LIST(hdb), 0);
2360
1
  REQUIRE_TENSOR_EQ(dbt, hdb, "GPU computed output should be the same as CPU computed ones");
2361
1
  ccv_nnc_tensor_free(a);
2362
1
  ccv_nnc_tensor_free(b);
2363
1
  ccv_nnc_tensor_free(c);
2364
1
  ccv_nnc_tensor_free(g);
2365
1
  ccv_nnc_tensor_free(db);
2366
1
  ccv_nnc_tensor_free(ha);
2367
1
  ccv_nnc_tensor_free(hb);
2368
1
  ccv_nnc_tensor_free(hc);
2369
1
  ccv_nnc_tensor_free(hg);
2370
1
  ccv_nnc_tensor_free(hdb);
2371
1
  ccv_nnc_tensor_free(dbt);
2372
1
}
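The two backward tests split the gradients of c = a / b across the two output slots, and the input lists mirror the standard derivatives (the slot convention here is read off the calls above):

  dc/da = 1 / b              so with inputs (g, 0, b)    the kernel returns da = g / b
  dc/db = -a / b^2 = -c / b  so with inputs (g, 0, b, c) the kernel returns db = -g * c / b

Passing 0 in an output slot skips the gradient that is not needed, which is why "output 1" and "output 2" are exercised as separate tests.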
2373
2374
TEST_CASE("exp forward")
2375
1
{
2376
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2377
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2378
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2379
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2380
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2381
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2382
1
  dsfmt_t dsfmt;
2383
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2384
1
  int i;
2385
1.00k
  for (i = 0; i < 1000; i++)
2386
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
2387
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2388
1
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2389
1
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
2390
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2391
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2392
1
  ccv_nnc_tensor_free(a);
2393
1
  ccv_nnc_tensor_free(b);
2394
1
  ccv_nnc_tensor_free(ha);
2395
1
  ccv_nnc_tensor_free(hb);
2396
1
  ccv_nnc_tensor_free(bt);
2397
1
}
2398
2399
TEST_CASE("ewexp backward")
2400
1
{
2401
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2402
1
    ccv_nnc_cmd_ok(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2403
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2404
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2405
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2406
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2407
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2408
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2409
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2410
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2411
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2412
1
  dsfmt_t dsfmt;
2413
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2414
1
  int i;
2415
1.00k
  for (i = 0; i < 1000; i++)
2416
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
2417
1.00k
  for (i = 0; i < 1000; i++)
2418
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2419
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
2420
1
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2421
1
  ccv_nnc_cmd_exec(CMD_EWEXP_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
2422
1
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
2423
1
  ccv_nnc_cmd_exec(CMD_EWEXP_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
2424
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2425
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2426
1
  ccv_nnc_tensor_free(a);
2427
1
  ccv_nnc_tensor_free(b);
2428
1
  ccv_nnc_tensor_free(g);
2429
1
  ccv_nnc_tensor_free(da);
2430
1
  ccv_nnc_tensor_free(ha);
2431
1
  ccv_nnc_tensor_free(hb);
2432
1
  ccv_nnc_tensor_free(hg);
2433
1
  ccv_nnc_tensor_free(hda);
2434
1
  ccv_nnc_tensor_free(dat);
2435
1
}
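The exp backward test reuses the forward output instead of the input, which follows directly from the derivative:

  b = exp(a),   da = g * d(exp(a))/da = g * exp(a) = g * b

so TENSOR_LIST(g, 0, b) is all the backward kernel needs and a itself never enters the backward pass.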
2436
2437
TEST_CASE("ewlog forward")
2438
1
{
2439
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2440
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2441
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2442
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2443
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2444
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2445
1
  dsfmt_t dsfmt;
2446
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2447
1
  int i;
2448
1.00k
  for (i = 0; i < 1000; i++)
2449
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
2450
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2451
1
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2452
1
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
2453
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2454
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2455
1
  ccv_nnc_tensor_free(a);
2456
1
  ccv_nnc_tensor_free(b);
2457
1
  ccv_nnc_tensor_free(ha);
2458
1
  ccv_nnc_tensor_free(hb);
2459
1
  ccv_nnc_tensor_free(bt);
2460
1
}
2461
2462
TEST_CASE("ewlog backward")
2463
1
{
2464
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2465
1
    ccv_nnc_cmd_ok(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2466
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2467
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2468
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2469
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2470
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2471
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2472
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2473
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2474
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2475
1
  dsfmt_t dsfmt;
2476
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2477
1
  int i;
2478
1.00k
  for (i = 0; i < 1000; i++)
2479
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
2480
1.00k
  for (i = 0; i < 1000; i++)
2481
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2482
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
2483
1
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2484
1
  ccv_nnc_cmd_exec(CMD_EWLOG_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a), TENSOR_LIST(da), 0);
2485
1
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
2486
1
  ccv_nnc_cmd_exec(CMD_EWLOG_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha), TENSOR_LIST(dat), 0);
2487
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2488
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2489
1
  ccv_nnc_tensor_free(a);
2490
1
  ccv_nnc_tensor_free(b);
2491
1
  ccv_nnc_tensor_free(g);
2492
1
  ccv_nnc_tensor_free(da);
2493
1
  ccv_nnc_tensor_free(ha);
2494
1
  ccv_nnc_tensor_free(hb);
2495
1
  ccv_nnc_tensor_free(hg);
2496
1
  ccv_nnc_tensor_free(hda);
2497
1
  ccv_nnc_tensor_free(dat);
2498
1
}
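Log backward is the mirror case: it needs the original input rather than the output, since

  b = log(a),   da = g * d(log(a))/da = g / a

hence TENSOR_LIST(g, a) in both the GPU and CPU calls above.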
2499
2500
TEST_CASE("ewsqrt forward")
2501
1
{
2502
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2503
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2504
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2505
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2506
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2507
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2508
1
  dsfmt_t dsfmt;
2509
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2510
1
  int i;
2511
1.00k
  for (i = 0; i < 1000; 
i++1.00k
)
2512
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
2513
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2514
1
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2515
1
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
2516
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2517
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2518
1
  ccv_nnc_tensor_free(a);
2519
1
  ccv_nnc_tensor_free(b);
2520
1
  ccv_nnc_tensor_free(ha);
2521
1
  ccv_nnc_tensor_free(hb);
2522
1
  ccv_nnc_tensor_free(bt);
2523
1
}
2524
2525
TEST_CASE("ewsqrt backward")
2526
1
{
2527
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2528
1
    ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2529
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2530
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2531
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2532
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2533
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2534
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2535
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2536
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2537
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2538
1
  dsfmt_t dsfmt;
2539
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2540
1
  int i;
2541
1.00k
  for (i = 0; i < 1000; i++)
2542
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
2543
1.00k
  for (i = 0; i < 1000; i++)
2544
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2545
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
2546
1
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2547
1
  ccv_nnc_cmd_exec(CMD_EWSQRT_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
2548
1
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
2549
1
  ccv_nnc_cmd_exec(CMD_EWSQRT_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
2550
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2551
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2552
1
  ccv_nnc_tensor_free(a);
2553
1
  ccv_nnc_tensor_free(b);
2554
1
  ccv_nnc_tensor_free(g);
2555
1
  ccv_nnc_tensor_free(da);
2556
1
  ccv_nnc_tensor_free(ha);
2557
1
  ccv_nnc_tensor_free(hb);
2558
1
  ccv_nnc_tensor_free(hg);
2559
1
  ccv_nnc_tensor_free(hda);
2560
1
  ccv_nnc_tensor_free(dat);
2561
1
}
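Sqrt backward again favors the forward output, because the derivative can be written in terms of b alone:

  b = sqrt(a),   da = g * d(sqrt(a))/da = g / (2 * sqrt(a)) = g / (2 * b)

which is why TENSOR_LIST(g, 0, b) carries b and not a.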
2562
2563
TEST_CASE("clamp forward")
2564
1
{
2565
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2566
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2567
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2568
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2569
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2570
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2571
1
  dsfmt_t dsfmt;
2572
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2573
1
  int i;
2574
1.00k
  for (i = 0; i < 1000; i++)
2575
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
2576
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2577
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2578
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
2579
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2580
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2581
1
  ccv_nnc_tensor_free(a);
2582
1
  ccv_nnc_tensor_free(b);
2583
1
  ccv_nnc_tensor_free(ha);
2584
1
  ccv_nnc_tensor_free(hb);
2585
1
  ccv_nnc_tensor_free(bt);
2586
1
}
2587
2588
TEST_CASE("clamp backward")
2589
1
{
2590
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2591
1
    ccv_nnc_cmd_ok(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2592
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2593
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2594
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2595
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2596
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2597
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2598
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2599
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2600
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2601
1
  dsfmt_t dsfmt;
2602
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2603
1
  int i;
2604
1.00k
  for (i = 0; i < 1000; i++)
2605
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
2606
1.00k
  for (i = 0; i < 1000; i++)
2607
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2608
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
2609
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2610
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
2611
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
2612
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
2613
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2614
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2615
1
  ccv_nnc_tensor_free(a);
2616
1
  ccv_nnc_tensor_free(b);
2617
1
  ccv_nnc_tensor_free(g);
2618
1
  ccv_nnc_tensor_free(da);
2619
1
  ccv_nnc_tensor_free(ha);
2620
1
  ccv_nnc_tensor_free(hb);
2621
1
  ccv_nnc_tensor_free(hg);
2622
1
  ccv_nnc_tensor_free(hda);
2623
1
  ccv_nnc_tensor_free(dat);
2624
1
}
2625
2626
TEST_CASE("clamp forward with only max")
2627
1
{
2628
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2629
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2630
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2631
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2632
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2633
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2634
1
  dsfmt_t dsfmt;
2635
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2636
1
  int i;
2637
1.00k
  for (i = 0; i < 1000; i++)
2638
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
2639
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2640
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2641
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
2642
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2643
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2644
1
  ccv_nnc_tensor_free(a);
2645
1
  ccv_nnc_tensor_free(b);
2646
1
  ccv_nnc_tensor_free(ha);
2647
1
  ccv_nnc_tensor_free(hb);
2648
1
  ccv_nnc_tensor_free(bt);
2649
1
}
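As the test names suggest, a NAN bound tells clamp to leave that side open: CMD_CLAMP_FORWARD(NAN, 6) applies only the upper bound, and CMD_CLAMP_FORWARD(0, NAN) in the "only min" tests further down applies only the lower one. In effect:

  clamp(x, lo, hi)  = min(max(x, lo), hi)
  clamp(x, NAN, hi) = min(x, hi)
  clamp(x, lo, NAN) = max(x, lo)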
2650
2651
TEST_CASE("clamp backward with only max")
2652
1
{
2653
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2654
1
    ccv_nnc_cmd_ok(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2655
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2656
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2657
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2658
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2659
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2660
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2661
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2662
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2663
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2664
1
  dsfmt_t dsfmt;
2665
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2666
1
  int i;
2667
1.00k
  for (i = 0; i < 1000; i++)
2668
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
2669
1.00k
  for (i = 0; i < 1000; i++)
2670
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2671
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
2672
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2673
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
2674
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
2675
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(NAN, 5), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
2676
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2677
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2678
1
  ccv_nnc_tensor_free(a);
2679
1
  ccv_nnc_tensor_free(b);
2680
1
  ccv_nnc_tensor_free(g);
2681
1
  ccv_nnc_tensor_free(da);
2682
1
  ccv_nnc_tensor_free(ha);
2683
1
  ccv_nnc_tensor_free(hb);
2684
1
  ccv_nnc_tensor_free(hg);
2685
1
  ccv_nnc_tensor_free(hda);
2686
1
  ccv_nnc_tensor_free(dat);
2687
1
}
2688
2689
TEST_CASE("clamp forward with only min")
2690
1
{
2691
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2692
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2693
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2694
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2695
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2696
1
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2697
1
  dsfmt_t dsfmt;
2698
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2699
1
  int i;
2700
1.00k
  for (i = 0; i < 1000; i++)
2701
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
2702
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
2703
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2704
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
2705
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2706
1
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
2707
1
  ccv_nnc_tensor_free(a);
2708
1
  ccv_nnc_tensor_free(b);
2709
1
  ccv_nnc_tensor_free(ha);
2710
1
  ccv_nnc_tensor_free(hb);
2711
1
  ccv_nnc_tensor_free(bt);
2712
1
}
2713
2714
TEST_CASE("clamp backward with only min")
2715
1
{
2716
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2717
1
    ccv_nnc_cmd_ok(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2718
1
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2719
1
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2720
1
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2721
1
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
2722
1
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2723
1
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2724
1
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2725
1
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2726
1
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
2727
1
  dsfmt_t dsfmt;
2728
1
  dsfmt_init_gen_rand(&dsfmt, 0);
2729
1
  int i;
2730
1.00k
  for (i = 0; i < 1000; i++)
2731
1.00k
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) * 10;
2732
1.00k
  for (i = 0; i < 1000; i++)
2733
1.00k
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2734
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hg), TENSOR_LIST(a, g), 0);
2735
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
2736
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(g, 0, b), TENSOR_LIST(da), 0);
2737
1
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
2738
1
  ccv_nnc_cmd_exec(CMD_CLAMP_BACKWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, 0, hb), TENSOR_LIST(dat), 0);
2739
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da), TENSOR_LIST(hda), 0);
2740
1
  REQUIRE_TENSOR_EQ(dat, hda, "GPU computed output should be the same as CPU computed ones");
2741
1
  ccv_nnc_tensor_free(a);
2742
1
  ccv_nnc_tensor_free(b);
2743
1
  ccv_nnc_tensor_free(g);
2744
1
  ccv_nnc_tensor_free(da);
2745
1
  ccv_nnc_tensor_free(ha);
2746
1
  ccv_nnc_tensor_free(hb);
2747
1
  ccv_nnc_tensor_free(hg);
2748
1
  ccv_nnc_tensor_free(hda);
2749
1
  ccv_nnc_tensor_free(dat);
2750
1
}
2751
2752
TEST_CASE("scaled dot product attention with flash_attn")
2753
1
{
2754
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2755
  // Bypass error: variable-sized object may not be initialized
2756
0
#define num_long_trials 4
2757
0
#define num_short_trials 2
2758
0
#define num_trials (num_long_trials + num_short_trials)
2759
2760
0
  for (int trial = 0; trial < num_trials; ++trial) {
2761
0
    int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1 };
2762
0
    int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5 };
2763
0
    int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5 };
2764
0
    int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32 };
2765
0
    int Hk_candidates[num_trials] = {   8,  8, 8, 8, 2, 8 };
2766
0
    int D_candidates[num_trials] = {  64, 40, 160, 224, 224, 128 };
2767
0
    int is_causal_candidates[num_trials] = {  1, 0, 1, 1, 0, 1 };
2768
2769
0
    int B = B_candidates[trial];
2770
0
    int R = R_candidates[trial];
2771
0
    int C = C_candidates[trial];
2772
0
    int Hq = Hq_candidates[trial];
2773
0
    int Hk = Hk_candidates[trial];
2774
0
    int D = D_candidates[trial];
2775
0
    int is_causal = is_causal_candidates[trial];
2776
0
    float scale = 1.0 / sqrt((float)D);
2777
2778
0
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2779
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
2780
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
2781
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
2782
2783
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
2784
0
      q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
2785
0
    }
2786
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
2787
0
      k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
2788
0
    }
2789
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
2790
0
      v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
2791
0
    }
2792
2793
0
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
2794
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, NULL, NULL, NULL), TENSOR_LIST(o_tensor, NULL), 0);
2795
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
2796
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
2797
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
2798
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0);
2799
2800
    // Why is there 000 at the beginning of the argument list for GPU_TENSOR_NHWC?
2801
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
2802
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
2803
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
2804
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
2805
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
2806
2807
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, NULL), 0);
2808
2809
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
2810
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0);
2811
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
2812
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
2813
2814
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 3e-3, "GPU computed output should be the same as CPU computed ones");
2815
2816
0
    ccv_nnc_tensor_free(o_tensor);
2817
0
    ccv_nnc_tensor_free(gpu_o_tensor);
2818
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
2819
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor_f16);
2820
0
    ccv_nnc_tensor_free(q_tensor);
2821
0
    ccv_nnc_tensor_free(k_tensor);
2822
0
    ccv_nnc_tensor_free(v_tensor);
2823
0
    ccv_nnc_tensor_free(q_tensor_f16);
2824
0
    ccv_nnc_tensor_free(k_tensor_f16);
2825
0
    ccv_nnc_tensor_free(v_tensor_f16);
2826
0
    ccv_nnc_tensor_free(gpu_q_tensor);
2827
0
    ccv_nnc_tensor_free(gpu_k_tensor);
2828
0
    ccv_nnc_tensor_free(gpu_v_tensor);
2829
0
  }
2830
0
#undef num_long_trials
2831
0
#undef num_short_trials
2832
0
#undef num_trials
2833
0
}
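For reference, the CPU result that the GPU fp16 output is checked against above is plain scaled dot product attention: for each batch b, query position i and query head h, the scaled scores against the keys are soft-maxed and used to weight the values. The sketch below spells that out under stated assumptions: it uses the B x R x Hq x D and B x C x Hk x D layouts from the test, assumes query head h reads key/value head h / (Hq / Hk) when Hq > Hk, and assumes the causal mask lets query i see keys j <= i; the function name naive_sdpa_forward is invented for illustration and is not part of the ccv API.

#include <math.h> /* for expf */

/* Minimal reference sketch (assumed semantics), not the ccv implementation:
 * o[b,i,h,:] = sum_j softmax_j(scale * q[b,i,h,:] . k[b,j,hk,:]) * v[b,j,hk,:] */
static void naive_sdpa_forward(const float* q, const float* k, const float* v, float* o,
  int B, int R, int C, int Hq, int Hk, int D, float scale, int is_causal)
{
  const int group = Hq / Hk; /* query heads per key/value head (grouped-query attention) */
  for (int b = 0; b < B; b++)
    for (int i = 0; i < R; i++)
      for (int h = 0; h < Hq; h++)
      {
        const int hk = h / group;
        const float* const qp = q + ((b * R + i) * Hq + h) * D;
        float* const op = o + ((b * R + i) * Hq + h) * D;
        const int jmax = is_causal ? (i + 1 < C ? i + 1 : C) : C; /* assumed causal convention */
        float maxs = -1e30f;
        for (int j = 0; j < jmax; j++)
        {
          const float* const kp = k + ((b * C + j) * Hk + hk) * D;
          float s = 0;
          for (int d = 0; d < D; d++)
            s += qp[d] * kp[d];
          if (s * scale > maxs)
            maxs = s * scale;
        }
        float sum = 0;
        for (int d = 0; d < D; d++)
          op[d] = 0;
        for (int j = 0; j < jmax; j++)
        {
          const float* const kp = k + ((b * C + j) * Hk + hk) * D;
          const float* const vp = v + ((b * C + j) * Hk + hk) * D;
          float s = 0;
          for (int d = 0; d < D; d++)
            s += qp[d] * kp[d];
          const float w = expf(s * scale - maxs); /* numerically stable softmax weight */
          sum += w;
          for (int d = 0; d < D; d++)
            op[d] += w * vp[d];
        }
        for (int d = 0; d < D; d++)
          op[d] /= sum;
      }
}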
2834
2835
TEST_CASE("scaled dot product attention + unify head with flash_attn")
2836
1
{
2837
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
2838
0
  ccv_nnc_symbolic_graph_t* const sdp_symbolic_graph = ccv_nnc_symbolic_graph_new();
2839
0
  ccv_nnc_tensor_symbol_t q = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
2840
0
  ccv_nnc_tensor_symbol_t k = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
2841
0
  ccv_nnc_tensor_symbol_t v = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
2842
0
  ccv_nnc_tensor_symbol_t w = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
2843
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
2844
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
2845
0
  ccv_nnc_tensor_symbol_t r = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
2846
0
  ccv_nnc_graph_exec_symbol_new(sdp_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q, k, v, NO_TENSOR_SYMBOL, w, bias), TENSOR_SYMBOL_LIST(r, NO_TENSOR_SYMBOL, c), "scaled_dot_product_attention");
2847
0
  ccv_nnc_graph_exec_symbol_autogen(sdp_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2848
0
  ccv_nnc_graph_t* sdp_graph = 0;
2849
0
  ccv_nnc_tensor_arena_t* sdp_tensor_arena = 0;
2850
0
  ccv_nnc_graph_exec_arena_t* sdp_graph_exec_arena = 0;
2851
0
  ccv_nnc_symbolic_graph_compile(sdp_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(sdp_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(sdp_symbolic_graph), &sdp_graph, &sdp_tensor_arena, &sdp_graph_exec_arena);
2852
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, q);
2853
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, k);
2854
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, v);
2855
0
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, w);
2856
0
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, bias);
2857
0
  dsfmt_t dsfmt;
2858
0
  int i;
2859
0
  dsfmt_init_gen_rand(&dsfmt, 1);
2860
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
2861
0
    q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2862
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
2863
0
    k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2864
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
2865
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2866
0
  for (i = 0; i < 512 * 512; i++)
2867
0
    w_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / sqrtf(512);
2868
0
  for (i = 0; i < 512; i++)
2869
0
    bias_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2870
0
  ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
2871
0
  ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
2872
0
  ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
2873
0
  ccv_nnc_tensor_t* const w_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 512, 512), 0);
2874
0
  ccv_nnc_tensor_t* const bias_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 512), 0);
2875
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, w_tensor, bias_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, w_tensor_f16, bias_tensor_f16), 0);
2876
0
  ccv_nnc_symbolic_graph_t* const g_symbolic_graph = ccv_nnc_symbolic_graph_new();
2877
0
  ccv_nnc_tensor_symbol_t gq = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "q");
2878
0
  ccv_nnc_tensor_symbol_t gk = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "k");
2879
0
  ccv_nnc_tensor_symbol_t gv = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "v");
2880
0
  ccv_nnc_tensor_symbol_t gw = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 512, 512), "w");
2881
0
  ccv_nnc_tensor_symbol_t gbias = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 512), "bias");
2882
0
  ccv_nnc_tensor_symbol_t gc = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 8, 64), "c");
2883
0
  ccv_nnc_tensor_symbol_t gr = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 32, 128, 512), "r");
2884
0
  ccv_nnc_graph_exec_symbol_new(g_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq, gk, gv, NO_TENSOR_SYMBOL, gw, gbias), TENSOR_SYMBOL_LIST(gr, NO_TENSOR_SYMBOL, gc), "scaled_dot_product_attention");
2885
0
  ccv_nnc_graph_exec_symbol_autogen(g_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
2886
0
  ccv_nnc_graph_t* g_graph = 0;
2887
0
  ccv_nnc_tensor_arena_t* g_tensor_arena = 0;
2888
0
  ccv_nnc_graph_exec_arena_t* g_graph_exec_arena = 0;
2889
0
  ccv_nnc_symbolic_graph_compile(g_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(g_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(g_symbolic_graph), &g_graph, &g_tensor_arena, &g_graph_exec_arena);
2890
0
  ccv_nnc_tensor_t* const gq_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gq);
2891
0
  ccv_nnc_tensor_t* const gk_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gk);
2892
0
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gv);
2893
0
  ccv_nnc_tensor_t* const gw_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gw);
2894
0
  ccv_nnc_tensor_t* const gbias_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gbias);
2895
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, w_tensor_f16, bias_tensor_f16), TENSOR_LIST(gq_tensor, gk_tensor, gv_tensor, gw_tensor, gbias_tensor), 0);
2896
0
  ccv_nnc_graph_run(sdp_graph, 0, TRAVERSE_FULL, 0, 0);
2897
0
  ccv_nnc_graph_run(g_graph, 0, TRAVERSE_FULL, 0, 0);
2898
0
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, r);
2899
0
  ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, c);
2900
0
  ccv_nnc_tensor_t* const gc_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gc);
2901
0
  ccv_nnc_tensor_t* const gr_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gr);
2902
0
  ccv_nnc_tensor_t* const ho_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 8, 64), 0);
2903
0
  ccv_nnc_tensor_t* const hr_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 32, 128, 512), 0);
2904
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc_tensor, gr_tensor), TENSOR_LIST(ho_f16, hr_f16), 0);
2905
0
  ccv_nnc_tensor_t* const ho = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), 0);
2906
0
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
2907
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ho_f16, hr_f16), TENSOR_LIST(ho, hr), 0);
2908
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, o_tensor->data.f32, ho->data.f32, 32 * 128 * 8 * 64, 3e-3, "graph computed result should match scaled dot product attention op result");
2909
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, r_tensor->data.f32, hr->data.f32, 32 * 128 * 512, 3e-2, "graph computed result should match scaled dot product attention op result");
2910
0
  ccv_nnc_symbolic_graph_free(sdp_symbolic_graph);
2911
0
  ccv_nnc_tensor_arena_free(sdp_tensor_arena);
2912
0
  ccv_nnc_graph_exec_arena_free(sdp_graph_exec_arena);
2913
0
  ccv_nnc_graph_free(sdp_graph);
2914
0
  ccv_nnc_symbolic_graph_free(g_symbolic_graph);
2915
0
  ccv_nnc_tensor_arena_free(g_tensor_arena);
2916
0
  ccv_nnc_graph_exec_arena_free(g_graph_exec_arena);
2917
0
  ccv_nnc_graph_free(g_graph);
2918
0
  ccv_nnc_tensor_free(ho);
2919
0
  ccv_nnc_tensor_free(hr);
2920
0
  ccv_nnc_tensor_free(ho_f16);
2921
0
  ccv_nnc_tensor_free(hr_f16);
2922
0
  ccv_nnc_tensor_free(q_tensor_f16);
2923
0
  ccv_nnc_tensor_free(k_tensor_f16);
2924
0
  ccv_nnc_tensor_free(v_tensor_f16);
2925
0
  ccv_nnc_tensor_free(w_tensor_f16);
2926
0
  ccv_nnc_tensor_free(bias_tensor_f16);
2927
0
}
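The test above also hands the attention command a weight w (512 x 512) and a bias (512), and asks for two outputs: the per-head attention output c (32 x 128 x 8 x 64) and a unified output r (32 x 128 x 512). From the shapes, the intent is that the 8 heads of 64 values per token are flattened into 512 features and sent through a linear projection; the exact orientation of w in that projection is not pinned down by the test, so read the following only as a sketch of the relationship being checked:

  r_{b,t} = \mathrm{vec}(c_{b,t,\cdot,\cdot}) \, W + \mathrm{bias}, \quad \mathrm{vec} : 8 \times 64 \to 512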
2928
2929
TEST_CASE("scaled dot product attention gradient with flash_attn")
2930
1
{
2931
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
2932
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
2933
0
#define num_long_trials 8
2934
0
#define num_short_trials 4
2935
0
#define num_trials (num_long_trials + num_short_trials)
2936
2937
0
  dsfmt_t dsfmt;
2938
0
  dsfmt_init_gen_rand(&dsfmt, 10);
2939
0
  for (int trial = 0; trial < num_trials; ++trial) {
2940
0
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
2941
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
2942
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
2943
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
2944
0
    const int Hk_candidates[num_trials] = {   8,  8, 8, 8, 2, 8, 8,  8, 8, 8, 2, 8 };
2945
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 64, 40, 160, 192, 256, 128 };
2946
0
    const int is_causal_candidates[num_trials] = {  1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1 };
2947
0
    const int deterministic_candidates[num_trials] = {  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1 };
2948
2949
0
    const int B = B_candidates[trial];
2950
0
    const int R = R_candidates[trial];
2951
0
    const int C = C_candidates[trial];
2952
0
    const int Hq = Hq_candidates[trial];
2953
0
    const int Hk = Hk_candidates[trial];
2954
0
    const int D = D_candidates[trial];
2955
0
    const int is_causal = is_causal_candidates[trial];
2956
0
    const int deterministic = deterministic_candidates[trial];
2957
0
    const float scale = 1.0 / sqrt((float)D);
2958
2959
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
2960
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
2961
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
2962
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
2963
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
2964
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
2965
2966
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
2967
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2968
0
    }
2969
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
2970
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2971
0
    }
2972
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
2973
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2974
0
    }
2975
2976
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
2977
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
2978
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2979
0
    }
2980
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
2981
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
2982
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
2983
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
2984
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
2985
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);
2986
2987
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
2988
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
2989
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
2990
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
2991
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
2992
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
2993
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
2994
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
2995
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
2996
2997
0
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
2998
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
2999
3000
0
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
3001
0
    cmd.info.scaled_dot_product_attention.deterministic = deterministic;
3002
0
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
3003
3004
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
3005
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
3006
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
3007
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
3008
3009
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
3010
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
3011
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
3012
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
3013
3014
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");
3015
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 3e-3, "scaled dot product attention result should be the same");
3016
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 6e-3, "GPU computed output should be the same as CPU computed ones");
3017
3018
0
    ccv_nnc_tensor_free(do_tensor);
3019
0
    ccv_nnc_tensor_free(gpu_do_tensor);
3020
0
    ccv_nnc_tensor_free(gpu_o_tensor);
3021
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
3022
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
3023
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
3024
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
3025
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
3026
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
3027
0
    ccv_nnc_tensor_free(q_tensor);
3028
0
    ccv_nnc_tensor_free(k_tensor);
3029
0
    ccv_nnc_tensor_free(v_tensor);
3030
0
    ccv_nnc_tensor_free(q_tensor_f16);
3031
0
    ccv_nnc_tensor_free(k_tensor_f16);
3032
0
    ccv_nnc_tensor_free(v_tensor_f16);
3033
0
    ccv_nnc_tensor_free(do_tensor_f16);
3034
0
    ccv_nnc_tensor_free(gpu_q_tensor);
3035
0
    ccv_nnc_tensor_free(gpu_k_tensor);
3036
0
    ccv_nnc_tensor_free(gpu_v_tensor);
3037
0
    ccv_nnc_tensor_free(dq_tensor);
3038
0
    ccv_nnc_tensor_free(dk_tensor);
3039
0
    ccv_nnc_tensor_free(dv_tensor);
3040
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
3041
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
3042
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
3043
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
3044
0
  }
3045
0
#undef num_long_trials
3046
0
#undef num_short_trials
3047
0
#undef num_trials
3048
0
}
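The backward test above follows the usual flash-attention recipe: the forward command is asked for a second output, gpu_softmax_lse of shape (B, Hq, R), and the backward command consumes it together with dO, q, k, v and o, so the attention probabilities can be recomputed tile by tile instead of materializing the full R x C matrix. In the standard formulation (assumed here, not verified against the kernel), the saved statistic is the per-row log-sum-exp of the scaled scores,

  L_{b,h,i} = \log \sum_j \exp(\mathrm{scale} \cdot q_{b,i,h} \cdot k_{b,j,h_k}),

from which p_{i,j} = \exp(\mathrm{scale} \cdot q_i \cdot k_j - L_i) is reconstructed on the fly. The deterministic flag toggled in the second half of the trials presumably only forces a deterministic accumulation order in the backward kernel; the same tolerances are used either way.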
3049
3050
TEST_CASE("cmul in float")
3051
1
{
3052
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
3053
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3054
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
3055
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
3056
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "c");
3057
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
3058
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3059
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3060
1
  ccv_nnc_graph_t* graph = 0;
3061
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3062
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3063
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3064
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3065
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3066
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3067
1
  dsfmt_t dsfmt;
3068
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3069
1
  int i;
3070
201
  for (i = 0; i < 20 * 10; i++)
3071
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3072
201
  for (i = 0; i < 20 * 10; i++)
3073
200
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3074
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3075
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
3076
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3077
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
3078
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3079
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3080
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
3081
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
3082
1
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3083
1
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
3084
1
  REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match the CPU result");
3085
1
  ccv_nnc_tensor_free(x_tensor);
3086
1
  ccv_nnc_tensor_free(y_tensor);
3087
1
  ccv_nnc_tensor_free(z_tensor);
3088
1
  ccv_nnc_tensor_free(tz);
3089
1
  ccv_nnc_graph_free(graph);
3090
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3091
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3092
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3093
1
}
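CMUL, as far as its name and the CPU reference comparison above suggest, is elementwise complex multiplication: consecutive floats are paired as (real, imaginary) and the pairs are multiplied as complex numbers. The sketch below spells out that interpretation; the pairing of adjacent floats and the helper name cmul_ref are assumptions for illustration, not the ccv implementation.

/* Assumed semantics sketch: (c[2i], c[2i+1]) = (a[2i], a[2i+1]) * (b[2i], b[2i+1]) as complex numbers. */
static void cmul_ref(const float* a, const float* b, float* c, int count /* total floats, must be even */)
{
  for (int i = 0; i < count; i += 2)
  {
    const float ar = a[i], ai = a[i + 1];
    const float br = b[i], bi = b[i + 1];
    c[i] = ar * br - ai * bi;     /* real part */
    c[i + 1] = ar * bi + ai * br; /* imaginary part */
  }
}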
3094
3095
TEST_CASE("cmul in half precision")
3096
1
{
3097
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
3098
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3099
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
3100
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
3101
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "c");
3102
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
3103
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3104
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3105
1
  ccv_nnc_graph_t* graph = 0;
3106
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3107
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3108
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3109
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3110
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3111
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3112
1
  dsfmt_t dsfmt;
3113
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3114
1
  int i;
3115
201
  for (i = 0; i < 20 * 10; i++)
3116
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3117
201
  for (i = 0; i < 20 * 10; i++)
3118
200
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3119
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3120
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3121
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3122
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
3123
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3124
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3125
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(y16_tensor), 0);
3126
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(b_tensor), 0);
3127
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3128
1
  ccv_nnc_tensor_t* const z16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3129
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3130
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
3131
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z16_tensor), 0);
3132
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z16_tensor), TENSOR_LIST(z_tensor), 0);
3133
1
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3134
1
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
3135
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tz->data.f32, z_tensor->data.f32, 20 * 10, 2e-3, "cmul from GPU should match the CPU result");
3136
1
  ccv_nnc_tensor_free(x_tensor);
3137
1
  ccv_nnc_tensor_free(x16_tensor);
3138
1
  ccv_nnc_tensor_free(y16_tensor);
3139
1
  ccv_nnc_tensor_free(y_tensor);
3140
1
  ccv_nnc_tensor_free(z16_tensor);
3141
1
  ccv_nnc_tensor_free(z_tensor);
3142
1
  ccv_nnc_tensor_free(tz);
3143
1
  ccv_nnc_graph_free(graph);
3144
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3145
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3146
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3147
1
}
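The half-precision variant above uses the same round trip as the other fp16 tests: generate fp32 inputs, convert to fp16, copy to the GPU, run, copy the result back, convert to fp32, and compare against the fp32 CPU result with a loose tolerance. The 2e-3 bound is consistent with fp16 arithmetic: a 10-bit mantissa gives a unit roundoff of 2^-11, about 4.9e-4, and one multiply plus two conversions on inputs drawn from (0, 1] stays well under 2e-3; that reading of the tolerance is an inference from the numbers, not something the test states.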
3148
3149
TEST_CASE("cmul in float, broadcast semantics")
3150
1
{
3151
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
3152
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3153
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 5, 8, 128), "a");
3154
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 5, 1, 128), "b");
3155
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 5, 8, 128), "c");
3156
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
3157
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3158
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3159
1
  ccv_nnc_graph_t* graph = 0;
3160
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3161
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3162
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3163
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3164
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 8, 128), 0);
3165
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 1, 128), 0);
3166
1
  dsfmt_t dsfmt;
3167
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3168
1
  int i;
3169
5.12k
  for (i = 0; i < 1 * 5 * 8 * 128; i++)
3170
5.12k
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3171
641
  for (i = 0; i < 1 * 5 * 1 * 128; i++)
3172
640
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3173
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3174
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
3175
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3176
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
3177
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3178
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 8, 128), 0);
3179
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
3180
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
3181
1
  ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 5, 8, 128), 0);
3182
1
  ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
3183
1
  REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match the CPU result");
3184
1
  ccv_nnc_tensor_free(x_tensor);
3185
1
  ccv_nnc_tensor_free(y_tensor);
3186
1
  ccv_nnc_tensor_free(z_tensor);
3187
1
  ccv_nnc_tensor_free(tz);
3188
1
  ccv_nnc_graph_free(graph);
3189
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3190
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3191
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3192
1
}
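In the broadcast variant above, b has shape (1, 5, 1, 128) against a's (1, 5, 8, 128), so the singleton third axis of b is broadcast in the usual numpy-style way (assumed here): every one of the 8 rows in a given (n, g) slice is complex-multiplied by the same 128-float, i.e. 64-complex, vector:

  c[n][g][h][:] = cmul(a[n][g][h][:], b[n][g][0][:])   for h = 0..7,

with cmul the complex product sketched after the float test above.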
3193
3194
TEST_CASE("cmul gradient in float")
3195
1
{
3196
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3197
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3198
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a");
3199
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b");
3200
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "c");
3201
1
  ccv_nnc_tensor_symbol_t d = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "d");
3202
1
  ccv_nnc_tensor_symbol_t e = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "e");
3203
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_BACKWARD(), TENSOR_SYMBOL_LIST(a, b, c), TENSOR_SYMBOL_LIST(d, e), "cmul");
3204
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3205
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3206
1
  ccv_nnc_graph_t* graph = 0;
3207
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3208
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3209
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3210
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3211
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3212
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3213
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3214
1
  dsfmt_t dsfmt;
3215
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3216
1
  int i;
3217
201
  for (i = 0; i < 20 * 10; i++)
3218
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3219
201
  for (i = 0; i < 20 * 10; i++)
3220
200
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3221
201
  for (i = 0; i < 20 * 10; i++)
3222
200
    z_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3223
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3224
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
3225
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3226
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
3227
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
3228
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z_tensor), TENSOR_LIST(c_tensor), 0);
3229
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3230
1
  ccv_nnc_tensor_t* const od_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3231
1
  ccv_nnc_tensor_t* const d_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, d);
3232
1
  ccv_nnc_tensor_t* const oe_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3233
1
  ccv_nnc_tensor_t* const e_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, e);
3234
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d_tensor, e_tensor), TENSOR_LIST(od_tensor, oe_tensor), 0);
3235
1
  ccv_nnc_tensor_t* const td = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3236
1
  ccv_nnc_tensor_t* const te = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3237
1
  ccv_nnc_cmd_exec(CMD_CMUL_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor, z_tensor), TENSOR_LIST(td, te), 0);
3238
1
  REQUIRE_TENSOR_EQ(td, od_tensor, "cmul gradient from GPU should match the CPU result");
3239
1
  REQUIRE_TENSOR_EQ(te, oe_tensor, "cmul gradient from GPU should match the CPU result");
3240
1
  ccv_nnc_tensor_free(x_tensor);
3241
1
  ccv_nnc_tensor_free(y_tensor);
3242
1
  ccv_nnc_tensor_free(z_tensor);
3243
1
  ccv_nnc_tensor_free(od_tensor);
3244
1
  ccv_nnc_tensor_free(oe_tensor);
3245
1
  ccv_nnc_tensor_free(td);
3246
1
  ccv_nnc_tensor_free(te);
3247
1
  ccv_nnc_graph_free(graph);
3248
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3249
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3250
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3251
1
}
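For the gradient checked above, write one complex pair as c = a * b with c_r = a_r b_r - a_i b_i and c_i = a_r b_i + a_i b_r. For an upstream gradient g, the chain rule gives

  \partial L / \partial a_r = g_r b_r + g_i b_i, \quad \partial L / \partial a_i = g_i b_r - g_r b_i,

i.e. dL/da = g \otimes \bar{b}, and symmetrically dL/db = g \otimes \bar{a} (complex product with the conjugate of the other operand). Which of the three inputs passed to CMD_CMUL_BACKWARD carries the upstream gradient is left exactly as the test has it; the identity is only stated to show what the CPU and GPU paths are both expected to compute.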
3252
3253
TEST_CASE("cmul gradient in half precision")
3254
1
{
3255
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS));
3256
1
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
3257
1
  ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a");
3258
1
  ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");
3259
1
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "c");
3260
1
  ccv_nnc_tensor_symbol_t d = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "d");
3261
1
  ccv_nnc_tensor_symbol_t e = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "e");
3262
1
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_BACKWARD(), TENSOR_SYMBOL_LIST(a, b, c), TENSOR_SYMBOL_LIST(d, e), "cmul");
3263
1
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
3264
1
  SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
3265
1
  ccv_nnc_graph_t* graph = 0;
3266
1
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
3267
1
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
3268
1
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
3269
1
  GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
3270
1
  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3271
1
  ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3272
1
  ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3273
1
  dsfmt_t dsfmt;
3274
1
  dsfmt_init_gen_rand(&dsfmt, 0);
3275
1
  int i;
3276
201
  for (i = 0; i < 20 * 10; i++)
3277
200
    x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3278
201
  for (i = 0; i < 20 * 10; i++)
3279
200
    y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3280
201
  for (i = 0; i < 20 * 10; i++)
3281
200
    z_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3282
1
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
3283
1
  ccv_nnc_tensor_t* const x16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3284
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16_tensor), 0);
3285
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16_tensor), TENSOR_LIST(a_tensor), 0);
3286
1
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
3287
1
  ccv_nnc_tensor_t* const y16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3288
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(y16_tensor), 0);
3289
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y16_tensor), TENSOR_LIST(b_tensor), 0);
3290
1
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
3291
1
  ccv_nnc_tensor_t* const z16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3292
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z_tensor), TENSOR_LIST(z16_tensor), 0);
3293
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(z16_tensor), TENSOR_LIST(c_tensor), 0);
3294
1
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
3295
1
  ccv_nnc_tensor_t* const od16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3296
1
  ccv_nnc_tensor_t* const od_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3297
1
  ccv_nnc_tensor_t* const d_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, d);
3298
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d_tensor), TENSOR_LIST(od16_tensor), 0);
3299
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(od16_tensor), TENSOR_LIST(od_tensor), 0);
3300
1
  ccv_nnc_tensor_t* const oe16_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 20, 10), 0);
3301
1
  ccv_nnc_tensor_t* const oe_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3302
1
  ccv_nnc_tensor_t* const e_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, e);
3303
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(e_tensor), TENSOR_LIST(oe16_tensor), 0);
3304
1
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(oe16_tensor), TENSOR_LIST(oe_tensor), 0);
3305
1
  ccv_nnc_tensor_t* const td = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3306
1
  ccv_nnc_tensor_t* const te = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 20, 10), 0);
3307
1
  ccv_nnc_cmd_exec(CMD_CMUL_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor, z_tensor), TENSOR_LIST(td, te), 0);
3308
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, td->data.f32, od_tensor->data.f32, 20 * 10, 2e-3, "cmul gradient from GPU should match the CPU result");
3309
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, te->data.f32, oe_tensor->data.f32, 20 * 10, 2e-3, "cmul gradient from GPU should match the CPU result");
3310
1
  ccv_nnc_tensor_free(x_tensor);
3311
1
  ccv_nnc_tensor_free(x16_tensor);
3312
1
  ccv_nnc_tensor_free(y_tensor);
3313
1
  ccv_nnc_tensor_free(y16_tensor);
3314
1
  ccv_nnc_tensor_free(z_tensor);
3315
1
  ccv_nnc_tensor_free(z16_tensor);
3316
1
  ccv_nnc_tensor_free(od_tensor);
3317
1
  ccv_nnc_tensor_free(od16_tensor);
3318
1
  ccv_nnc_tensor_free(td);
3319
1
  ccv_nnc_tensor_free(oe_tensor);
3320
1
  ccv_nnc_tensor_free(oe16_tensor);
3321
1
  ccv_nnc_tensor_free(te);
3322
1
  ccv_nnc_graph_free(graph);
3323
1
  ccv_nnc_tensor_arena_free(tensor_arena);
3324
1
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
3325
1
  ccv_nnc_symbolic_graph_free(symbolic_graph);
3326
1
}
3327
3328
#include "case_main.h"