Coverage Report

Created: 2021-04-12 03:25

/home/liu/buildslave/linux-x64-runtests/build/test/unit/nnc/cblas.tests.c
Line | Count | Source
1 |  | #include "case.h"
2 |  | #include "ccv_case.h"
3 |  | #include "ccv_nnc_case.h"
4 |  | #include <ccv.h>
5 |  | #include <nnc/ccv_nnc.h>
6 |  | #include <nnc/ccv_nnc_easy.h>
7 |  | #include <3rdparty/dsfmt/dSFMT.h>
8 |  |
9 |  | TEST_SETUP()
10 |  | {
11 |  |   ccv_nnc_init();
12 |  | }
13 |  |
14 |  | #if (defined(HAVE_CBLAS) || defined(HAVE_ACCELERATE_FRAMEWORK))
15 |  |
16 |  | TEST_CASE("gemm no transpose")
17 | 1 | {
18 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
19 | 1 |   float ap[] = {
20 | 1 |     1, 2,
21 | 1 |     3, 4,
22 | 1 |     5, 6,
23 | 1 |     7, 8,
24 | 1 |   };
25 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
26 | 1 |   float bp[] = {
27 | 1 |     7, 8, 9,
28 | 1 |     10, 11, 12,
29 | 1 |   };
30 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
31 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
32 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD();
33 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
34 | 1 |   cmd.algorithm = 1; // This is cblas.
35 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
36 | 1 |   float ctp[] = {
37 | 1 |     1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
38 | 1 |     3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
39 | 1 |     5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
40 | 1 |     7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
41 | 1 |   };
42 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
43 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
44 | 1 |   ccv_nnc_tensor_free(a);
45 | 1 |   ccv_nnc_tensor_free(b);
46 | 1 |   ccv_nnc_tensor_free(c);
47 | 1 | }
48 |  |
49 |  | TEST_CASE("gemm transpose a")
50 | 1 | {
51 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
52 | 1 |   float ap[] = {
53 | 1 |     1, 3, 5, 7,
54 | 1 |     2, 4, 6, 8,
55 | 1 |   };
56 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
57 | 1 |   float bp[] = {
58 | 1 |     7, 8, 9,
59 | 1 |     10, 11, 12,
60 | 1 |   };
61 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
62 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
63 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
64 | 1 |   float ctp[] = {
65 | 1 |     1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
66 | 1 |     3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
67 | 1 |     5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
68 | 1 |     7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
69 | 1 |   };
70 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
71 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
72 | 1 |   ccv_nnc_tensor_free(a);
73 | 1 |   ccv_nnc_tensor_free(b);
74 | 1 |   ccv_nnc_tensor_free(c);
75 | 1 | }
76 |  |
77 |  | TEST_CASE("gemm transpose b")
78 | 1 | {
79 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
80 | 1 |   float ap[] = {
81 | 1 |     1, 2,
82 | 1 |     3, 4,
83 | 1 |     5, 6,
84 | 1 |     7, 8,
85 | 1 |   };
86 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
87 | 1 |   float bp[] = {
88 | 1 |     7, 10,
89 | 1 |     8, 11,
90 | 1 |     9, 12,
91 | 1 |   };
92 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
93 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
94 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
95 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
96 | 1 |   cmd.algorithm = 1; // This is cblas.
97 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
98 | 1 |   float ctp[] = {
99 | 1 |     1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
100 | 1 |     3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
101 | 1 |     5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
102 | 1 |     7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
103 | 1 |   };
104 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
105 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
106 | 1 |   ccv_nnc_tensor_free(a);
107 | 1 |   ccv_nnc_tensor_free(b);
108 | 1 |   ccv_nnc_tensor_free(c);
109 | 1 | }
110 |  |
111 |  | TEST_CASE("gemm transpose a and b")
112 | 1 | {
113 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
114 | 1 |   float ap[] = {
115 | 1 |     1, 3, 5, 7,
116 | 1 |     2, 4, 6, 8,
117 | 1 |   };
118 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
119 | 1 |   float bp[] = {
120 | 1 |     7, 10,
121 | 1 |     8, 11,
122 | 1 |     9, 12,
123 | 1 |   };
124 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
125 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
126 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1));
127 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
128 | 1 |   cmd.algorithm = 1; // This is cblas.
129 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
130 | 1 |   float ctp[] = {
131 | 1 |     1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
132 | 1 |     3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
133 | 1 |     5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
134 | 1 |     7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
135 | 1 |   };
136 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
137 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
138 | 1 |   ccv_nnc_tensor_free(a);
139 | 1 |   ccv_nnc_tensor_free(b);
140 | 1 |   ccv_nnc_tensor_free(c);
141 | 1 | }
142 |  |
143 |  | TEST_CASE("gemm no transpose with bias")
144 | 1 | {
145 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
146 | 1 |   float ap[] = {
147 | 1 |     1, 2,
148 | 1 |     3, 4,
149 | 1 |     5, 6,
150 | 1 |     7, 8,
151 | 1 |   };
152 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
153 | 1 |   float bp[] = {
154 | 1 |     7, 8, 9,
155 | 1 |     10, 11, 12,
156 | 1 |   };
157 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
158 | 1 |   float dp[] = {
159 | 1 |     1, -1, 1,
160 | 1 |     1, -1, 1,
161 | 1 |     1, -1, 1,
162 | 1 |     1, -1, 1,
163 | 1 |   };
164 | 1 |   ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
165 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
166 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD();
167 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
168 | 1 |   cmd.algorithm = 1; // This is cblas.
169 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(c), 0);
170 | 1 |   float ctp[] = {
171 | 1 |     1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
172 | 1 |     3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
173 | 1 |     5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
174 | 1 |     7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
175 | 1 |   };
176 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
177 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
178 | 1 |   ccv_nnc_tensor_free(a);
179 | 1 |   ccv_nnc_tensor_free(b);
180 | 1 |   ccv_nnc_tensor_free(c);
181 | 1 |   ccv_nnc_tensor_free(d);
182 | 1 | }
183 |  |
184 |  | TEST_CASE("backward gemm with no transpose")
185 | 1 | {
186 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT) &&
187 | 1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT));
188 | 1 |   float gp[] = {
189 | 1 |     1, 2, 3,
190 | 1 |     4, 5, 6,
191 | 1 |     7, 8, 9,
192 | 1 |     10, 11, 12,
193 | 1 |   };
194 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
195 | 1 |   float ap[] = {
196 | 1 |     13, 14,
197 | 1 |     15, 16,
198 | 1 |     17, 18,
199 | 1 |     19, 20,
200 | 1 |   };
201 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
202 | 1 |   float bp[] = {
203 | 1 |     21, 22, 23,
204 | 1 |     24, 25, 26,
205 | 1 |   };
206 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
207 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
208 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
209 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
210 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
211 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
212 | 1 |   cmd.algorithm = 1; // This is cblas.
213 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
214 | 1 |   float dbiastp[] = {
215 | 1 |     22, 26, 30,
216 | 1 |   };
217 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
218 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
219 | 1 |   float htp[] = {
220 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
221 | 1 |     4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
222 | 1 |     7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
223 | 1 |     10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
224 | 1 |   };
225 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
226 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
227 | 1 |   float dbtp[] = {
228 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
229 | 1 |     1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
230 | 1 |   };
231 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
232 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
233 | 1 |   ccv_nnc_tensor_free(g);
234 | 1 |   ccv_nnc_tensor_free(a);
235 | 1 |   ccv_nnc_tensor_free(b);
236 | 1 |   ccv_nnc_tensor_free(h);
237 | 1 |   ccv_nnc_tensor_free(db);
238 | 1 |   ccv_nnc_tensor_free(dbias);
239 | 1 | }
240 |  |
241 |  | TEST_CASE("backward gemm with transpose a")
242 | 1 | {
243 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT) &&
244 | 1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT));
245 | 1 |   float gp[] = {
246 | 1 |     1, 2, 3,
247 | 1 |     4, 5, 6,
248 | 1 |     7, 8, 9,
249 | 1 |     10, 11, 12,
250 | 1 |   };
251 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
252 | 1 |   float ap[] = {
253 | 1 |     13, 15, 17, 19,
254 | 1 |     14, 16, 18, 20,
255 | 1 |   };
256 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
257 | 1 |   float bp[] = {
258 | 1 |     21, 22, 23,
259 | 1 |     24, 25, 26,
260 | 1 |   };
261 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
262 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
263 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
264 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
265 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD(TRANSPOSE(0, 1));
266 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
267 | 1 |   cmd.algorithm = 1; // This is cblas.
268 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
269 | 1 |   float dbiastp[] = {
270 | 1 |     22, 26, 30,
271 | 1 |   };
272 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
273 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
274 | 1 |   float htp[] = {
275 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
276 | 1 |     1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
277 | 1 |   };
278 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
279 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
280 | 1 |   float dbtp[] = {
281 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
282 | 1 |     1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
283 | 1 |   };
284 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
285 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
286 | 1 |   ccv_nnc_tensor_free(g);
287 | 1 |   ccv_nnc_tensor_free(a);
288 | 1 |   ccv_nnc_tensor_free(b);
289 | 1 |   ccv_nnc_tensor_free(h);
290 | 1 |   ccv_nnc_tensor_free(db);
291 | 1 |   ccv_nnc_tensor_free(dbias);
292 | 1 | }
293 |  |
294 |  | TEST_CASE("backward gemm with transpose b")
295 | 1 | {
296 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT) &&
297 | 1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT));
298 | 1 |   float gp[] = {
299 | 1 |     1, 2, 3,
300 | 1 |     4, 5, 6,
301 | 1 |     7, 8, 9,
302 | 1 |     10, 11, 12,
303 | 1 |   };
304 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
305 | 1 |   float ap[] = {
306 | 1 |     13, 14,
307 | 1 |     15, 16,
308 | 1 |     17, 18,
309 | 1 |     19, 20,
310 | 1 |   };
311 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
312 | 1 |   float bp[] = {
313 | 1 |     21, 24,
314 | 1 |     22, 25,
315 | 1 |     23, 26,
316 | 1 |   };
317 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
318 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
319 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
320 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
321 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
322 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
323 | 1 |   cmd.algorithm = 1; // This is cblas.
324 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
325 | 1 |   float dbiastp[] = {
326 | 1 |     22, 26, 30,
327 | 1 |   };
328 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
329 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
330 | 1 |   float htp[] = {
331 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
332 | 1 |     4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
333 | 1 |     7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
334 | 1 |     10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
335 | 1 |   };
336 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
337 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
338 | 1 |   float dbtp[] = {
339 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
340 | 1 |     2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
341 | 1 |     3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
342 | 1 |   };
343 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
344 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
345 | 1 |   ccv_nnc_tensor_free(g);
346 | 1 |   ccv_nnc_tensor_free(a);
347 | 1 |   ccv_nnc_tensor_free(b);
348 | 1 |   ccv_nnc_tensor_free(h);
349 | 1 |   ccv_nnc_tensor_free(db);
350 | 1 |   ccv_nnc_tensor_free(dbias);
351 | 1 | }
352 |  |
353 |  | TEST_CASE("backward gemm with transpose a and b")
354 | 1 | {
355 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT) &&
356 | 1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT));
357 | 1 |   float gp[] = {
358 | 1 |     1, 2, 3,
359 | 1 |     4, 5, 6,
360 | 1 |     7, 8, 9,
361 | 1 |     10, 11, 12,
362 | 1 |   };
363 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
364 | 1 |   float ap[] = {
365 | 1 |     13, 15, 17, 19,
366 | 1 |     14, 16, 18, 20,
367 | 1 |   };
368 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
369 | 1 |   float bp[] = {
370 | 1 |     21, 24,
371 | 1 |     22, 25,
372 | 1 |     23, 26,
373 | 1 |   };
374 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
375 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
376 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
377 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
378 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1));
379 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
380 | 1 |   cmd.algorithm = 1; // This is cblas.
381 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
382 | 1 |   float dbiastp[] = {
383 | 1 |     22, 26, 30,
384 | 1 |   };
385 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
386 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
387 | 1 |   float htp[] = {
388 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
389 | 1 |     1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
390 | 1 |   };
391 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
392 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
393 | 1 |   float dbtp[] = {
394 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
395 | 1 |     2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
396 | 1 |     3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
397 | 1 |   };
398 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
399 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
400 | 1 |   ccv_nnc_tensor_free(g);
401 | 1 |   ccv_nnc_tensor_free(a);
402 | 1 |   ccv_nnc_tensor_free(b);
403 | 1 |   ccv_nnc_tensor_free(h);
404 | 1 |   ccv_nnc_tensor_free(db);
405 | 1 |   ccv_nnc_tensor_free(dbias);
406 | 1 | }
407 |  |
408 |  | TEST_CASE("gemm no transpose batch 2")
409 | 1 | {
410 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
411 | 1 |   float ap[] = {
412 | 1 |     1, 2,
413 | 1 |     3, 4,
414 | 1 |     5, 6,
415 | 1 |     7, 8,
416 | 1 |     2, 3,
417 | 1 |     4, 5,
418 | 1 |     6, 7,
419 | 1 |     8, 9
420 | 1 |   };
421 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
422 | 1 |   float bp[] = {
423 | 1 |     7, 8, 9,
424 | 1 |     10, 11, 12,
425 | 1 |   };
426 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
427 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
428 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD();
429 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
430 | 1 |   cmd.algorithm = 1; // This is cblas.
431 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
432 | 1 |   float ctp[] = {
433 | 1 |     1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
434 | 1 |     3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
435 | 1 |     5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
436 | 1 |     7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
437 | 1 |     2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
438 | 1 |     4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
439 | 1 |     6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
440 | 1 |     8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
441 | 1 |   };
442 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
443 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
444 | 1 |   ccv_nnc_tensor_free(a);
445 | 1 |   ccv_nnc_tensor_free(b);
446 | 1 |   ccv_nnc_tensor_free(c);
447 | 1 | }
448 |  |
449 |  | TEST_CASE("gemm transpose a batch 2")
450 | 1 | {
451 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
452 | 1 |   float ap[] = {
453 | 1 |     1, 3, 5, 7,
454 | 1 |     2, 4, 6, 8,
455 | 1 |     2, 4, 6, 8,
456 | 1 |     3, 5, 7, 9,
457 | 1 |   };
458 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
459 | 1 |   float bp[] = {
460 | 1 |     7, 8, 9,
461 | 1 |     10, 11, 12,
462 | 1 |   };
463 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
464 | 1 |   float dp[] = {
465 | 1 |     -1, 0, 1,
466 | 1 |   };
467 | 1 |   ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 3), 0);
468 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
469 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(TRANSPOSE(1, 2));
470 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
471 | 1 |   cmd.algorithm = 1; // This is cblas.
472 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(c), 0);
473 | 1 |   float ctp[] = {
474 | 1 |     1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
475 | 1 |     3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
476 | 1 |     5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
477 | 1 |     7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
478 | 1 |     2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
479 | 1 |     4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
480 | 1 |     6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
481 | 1 |     8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
482 | 1 |   };
483 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
484 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
485 | 1 |   ccv_nnc_tensor_free(a);
486 | 1 |   ccv_nnc_tensor_free(b);
487 | 1 |   ccv_nnc_tensor_free(c);
488 | 1 |   ccv_nnc_tensor_free(d);
489 | 1 | }
490 |  |
491 |  | TEST_CASE("gemm transpose b batch 2")
492 | 1 | {
493 | 1 |   float ap[] = {
494 | 1 |     1, 2,
495 | 1 |     3, 4,
496 | 1 |     5, 6,
497 | 1 |     7, 8,
498 | 1 |     2, 3,
499 | 1 |     4, 5,
500 | 1 |     6, 7,
501 | 1 |     8, 9
502 | 1 |   };
503 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
504 | 1 |   float bp[] = {
505 | 1 |     7, 10,
506 | 1 |     8, 11,
507 | 1 |     9, 12,
508 | 1 |     80, 110,
509 | 1 |     90, 120,
510 | 1 |     10, 13,
511 | 1 |   };
512 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
513 | 1 |   float dp[] = {
514 | 1 |     -1, 0, 1,
515 | 1 |     2, 3, -4,
516 | 1 |   };
517 | 1 |   ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
518 | 1 |   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
519 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2));
520 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
521 | 1 |   cmd.algorithm = 1; // This is cblas.
522 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(c), 0);
523 | 1 |   float ctp[] = {
524 | 1 |     1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
525 | 1 |     3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
526 | 1 |     5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
527 | 1 |     7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
528 | 1 |     2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
529 | 1 |     4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
530 | 1 |     6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
531 | 1 |     8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
532 | 1 |   };
533 | 1 |   ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
534 | 1 |   REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
535 | 1 |   ccv_nnc_tensor_free(a);
536 | 1 |   ccv_nnc_tensor_free(b);
537 | 1 |   ccv_nnc_tensor_free(c);
538 | 1 |   ccv_nnc_tensor_free(d);
539 | 1 | }
540 |  |
541 |  | TEST_CASE("backward gemm with no transpose batch 2, same b")
542 | 1 | {
543 | 1 |   float gp[] = {
544 | 1 |     1, 2, 3,
545 | 1 |     4, 5, 6,
546 | 1 |     7, 8, 9,
547 | 1 |     10, 11, 12,
548 | 1 |     10, 20, 30,
549 | 1 |     40, 50, 60,
550 | 1 |     70, 80, 90,
551 | 1 |     100, 110, 120,
552 | 1 |   };
553 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
554 | 1 |   float ap[] = {
555 | 1 |     13, 14,
556 | 1 |     15, 16,
557 | 1 |     17, 18,
558 | 1 |     19, 20,
559 | 1 |     131, 141,
560 | 1 |     151, 161,
561 | 1 |     171, 181,
562 | 1 |     191, 201,
563 | 1 |   };
564 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
565 | 1 |   float bp[] = {
566 | 1 |     21, 22, 23,
567 | 1 |     24, 25, 26,
568 | 1 |   };
569 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
570 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
571 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
572 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
573 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
574 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
575 | 1 |   cmd.algorithm = 1; // This is cblas.
576 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
577 | 1 |   float dbiastp[] = {
578 | 1 |     22 + 220, 26 + 260, 30 + 300,
579 | 1 |   };
580 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
581 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
582 | 1 |   float htp[] = {
583 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
584 | 1 |     4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
585 | 1 |     7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
586 | 1 |     10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
587 | 1 |     10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
588 | 1 |     40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
589 | 1 |     70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
590 | 1 |     100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
591 | 1 |   };
592 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
593 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
594 | 1 |   float dbtp[] = {
595 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
596 | 1 |     1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
597 | 1 |   };
598 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
599 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
600 | 1 |   ccv_nnc_tensor_free(g);
601 | 1 |   ccv_nnc_tensor_free(a);
602 | 1 |   ccv_nnc_tensor_free(b);
603 | 1 |   ccv_nnc_tensor_free(h);
604 | 1 |   ccv_nnc_tensor_free(db);
605 | 1 |   ccv_nnc_tensor_free(dbias);
606 | 1 | }
607 |  |
608 |  | TEST_CASE("backward gemm with no transpose batch 2, batched b")
609 | 1 | {
610 | 1 |   float gp[] = {
611 | 1 |     1, 2, 3,
612 | 1 |     4, 5, 6,
613 | 1 |     7, 8, 9,
614 | 1 |     10, 11, 12,
615 | 1 |     10, 20, 30,
616 | 1 |     40, 50, 60,
617 | 1 |     70, 80, 90,
618 | 1 |     100, 110, 120,
619 | 1 |   };
620 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
621 | 1 |   float ap[] = {
622 | 1 |     13, 14,
623 | 1 |     15, 16,
624 | 1 |     17, 18,
625 | 1 |     19, 20,
626 | 1 |     131, 141,
627 | 1 |     151, 161,
628 | 1 |     171, 181,
629 | 1 |     191, 201,
630 | 1 |   };
631 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
632 | 1 |   float bp[] = {
633 | 1 |     21, 22, 23,
634 | 1 |     24, 25, 26,
635 | 1 |     212, 222, 232,
636 | 1 |     242, 252, 262,
637 | 1 |   };
638 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
639 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
640 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
641 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
642 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
643 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
644 | 1 |   cmd.algorithm = 1; // This is cblas.
645 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
646 | 1 |   float dbiastp[] = {
647 | 1 |     22, 26, 30,
648 | 1 |     220, 260, 300,
649 | 1 |   };
650 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
651 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
652 | 1 |   float htp[] = {
653 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
654 | 1 |     4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
655 | 1 |     7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
656 | 1 |     10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
657 | 1 |     10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
658 | 1 |     40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
659 | 1 |     70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
660 | 1 |     100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
661 | 1 |   };
662 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
663 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
664 | 1 |   float dbtp[] = {
665 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
666 | 1 |     1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
667 | 1 |     10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
668 | 1 |     10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
669 | 1 |   };
670 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
671 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
672 | 1 |   ccv_nnc_tensor_free(g);
673 | 1 |   ccv_nnc_tensor_free(a);
674 | 1 |   ccv_nnc_tensor_free(b);
675 | 1 |   ccv_nnc_tensor_free(h);
676 | 1 |   ccv_nnc_tensor_free(db);
677 | 1 |   ccv_nnc_tensor_free(dbias);
678 | 1 | }
679 |  |
680 |  | TEST_CASE("backward gemm with transpose a batch 2, same b")
681 | 1 | {
682 | 1 |   float gp[] = {
683 | 1 |     1, 2, 3,
684 | 1 |     4, 5, 6,
685 | 1 |     7, 8, 9,
686 | 1 |     10, 11, 12,
687 | 1 |     10, 20, 30,
688 | 1 |     40, 50, 60,
689 | 1 |     70, 80, 90,
690 | 1 |     100, 110, 120,
691 | 1 |   };
692 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
693 | 1 |   float ap[] = {
694 | 1 |     13, 15, 17, 19,
695 | 1 |     14, 16, 18, 20,
696 | 1 |     131, 151, 171, 191,
697 | 1 |     141, 161, 181, 201,
698 | 1 |   };
699 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
700 | 1 |   float bp[] = {
701 | 1 |     21, 22, 23,
702 | 1 |     24, 25, 26,
703 | 1 |   };
704 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
705 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
706 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
707 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
708 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD(TRANSPOSE(1, 2));
709 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
710 | 1 |   cmd.algorithm = 1; // This is cblas.
711 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
712 | 1 |   float dbiastp[] = {
713 | 1 |     22 + 220, 26 + 260, 30 + 300,
714 | 1 |   };
715 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
716 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
717 | 1 |   float htp[] = {
718 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
719 | 1 |     1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
720 | 1 |     10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
721 | 1 |     10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
722 | 1 |   };
723 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
724 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
725 | 1 |   float dbtp[] = {
726 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
727 | 1 |     1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
728 | 1 |   };
729 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
730 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
731 | 1 |   ccv_nnc_tensor_free(g);
732 | 1 |   ccv_nnc_tensor_free(a);
733 | 1 |   ccv_nnc_tensor_free(b);
734 | 1 |   ccv_nnc_tensor_free(h);
735 | 1 |   ccv_nnc_tensor_free(db);
736 | 1 |   ccv_nnc_tensor_free(dbias);
737 | 1 | }
738 |  |
739 |  | TEST_CASE("backward gemm with transpose b batch 2, batched b")
740 | 1 | {
741 | 1 |   float gp[] = {
742 | 1 |     1, 2, 3,
743 | 1 |     4, 5, 6,
744 | 1 |     7, 8, 9,
745 | 1 |     10, 11, 12,
746 | 1 |     10, 20, 30,
747 | 1 |     40, 50, 60,
748 | 1 |     70, 80, 90,
749 | 1 |     100, 110, 120,
750 | 1 |   };
751 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
752 | 1 |   float ap[] = {
753 | 1 |     13, 14,
754 | 1 |     15, 16,
755 | 1 |     17, 18,
756 | 1 |     19, 20,
757 | 1 |     131, 141,
758 | 1 |     151, 161,
759 | 1 |     171, 181,
760 | 1 |     191, 201,
761 | 1 |   };
762 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
763 | 1 |   float bp[] = {
764 | 1 |     21, 24,
765 | 1 |     22, 25,
766 | 1 |     23, 26,
767 | 1 |     212, 242,
768 | 1 |     222, 252,
769 | 1 |     232, 262,
770 | 1 |   };
771 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
772 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
773 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
774 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
775 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2));
776 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
777 | 1 |   cmd.algorithm = 1; // This is cblas.
778 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
779 | 1 |   float dbiastp[] = {
780 | 1 |     22, 26, 30,
781 | 1 |     220, 260, 300,
782 | 1 |   };
783 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
784 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
785 | 1 |   float htp[] = {
786 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
787 | 1 |     4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
788 | 1 |     7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
789 | 1 |     10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
790 | 1 |     10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
791 | 1 |     40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
792 | 1 |     70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
793 | 1 |     100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
794 | 1 |   };
795 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
796 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
797 | 1 |   float dbtp[] = {
798 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
799 | 1 |     2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
800 | 1 |     3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
801 | 1 |     10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
802 | 1 |     20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
803 | 1 |     30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
804 | 1 |   };
805 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
806 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
807 | 1 |   ccv_nnc_tensor_free(g);
808 | 1 |   ccv_nnc_tensor_free(a);
809 | 1 |   ccv_nnc_tensor_free(b);
810 | 1 |   ccv_nnc_tensor_free(h);
811 | 1 |   ccv_nnc_tensor_free(db);
812 | 1 |   ccv_nnc_tensor_free(dbias);
813 | 1 | }
814 |  |
815 |  | TEST_CASE("backward gemm with transpose a and b batch 2, same b")
816 | 1 | {
817 | 1 |   float gp[] = {
818 | 1 |     1, 2, 3,
819 | 1 |     4, 5, 6,
820 | 1 |     7, 8, 9,
821 | 1 |     10, 11, 12,
822 | 1 |     10, 20, 30,
823 | 1 |     40, 50, 60,
824 | 1 |     70, 80, 90,
825 | 1 |     100, 110, 120,
826 | 1 |   };
827 | 1 |   ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
828 | 1 |   float ap[] = {
829 | 1 |     13, 15, 17, 19,
830 | 1 |     14, 16, 18, 20,
831 | 1 |     131, 151, 171, 191,
832 | 1 |     141, 161, 181, 201,
833 | 1 |   };
834 | 1 |   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
835 | 1 |   float bp[] = {
836 | 1 |     21, 24,
837 | 1 |     22, 25,
838 | 1 |     23, 26,
839 | 1 |   };
840 | 1 |   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
841 | 1 |   ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
842 | 1 |   ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
843 | 1 |   ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
844 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1));
845 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
846 | 1 |   cmd.algorithm = 1; // This is cblas.
847 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(h, db, dbias), 0);
848 | 1 |   float dbiastp[] = {
849 | 1 |     22 + 220, 26 + 260, 30 + 300,
850 | 1 |   };
851 | 1 |   ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
852 | 1 |   REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
853 | 1 |   float htp[] = {
854 | 1 |     1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
855 | 1 |     1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
856 | 1 |     10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
857 | 1 |     10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
858 | 1 |   };
859 | 1 |   ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
860 | 1 |   REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
861 | 1 |   float dbtp[] = {
862 | 1 |     1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
863 | 1 |     2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
864 | 1 |     3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
865 | 1 |   };
866 | 1 |   ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
867 | 1 |   REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
868 | 1 |   ccv_nnc_tensor_free(g);
869 | 1 |   ccv_nnc_tensor_free(a);
870 | 1 |   ccv_nnc_tensor_free(b);
871 | 1 |   ccv_nnc_tensor_free(h);
872 | 1 |   ccv_nnc_tensor_free(db);
873 | 1 |   ccv_nnc_tensor_free(dbias);
874 | 1 | }
875 |  |
876 |  | TEST_CASE("cublas forward gemm")
877 | 1 | {
878 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
879 | 1 |   dsfmt_t dsfmt;
880 | 1 |   dsfmt_init_gen_rand(&dsfmt, 0);
881 | 1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
882 | 1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
883 | 1 |   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
884 | 1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
885 | 1 |
886 | 1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
887 | 1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
888 | 1 |   ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
889 | 1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
890 | 1 |   int i;
891 | 8.19k |   for (i = 0; i < 64 * 128; i++)
892 | 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
893 | 65 |   for (i = 0; i < 64; i++)
894 | 64 |     hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
895 | 1 |   ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
896 | 1.28k |   for (i = 0; i < 10 * 128; i++)
897 | 1.28k |     ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
898 | 129 |   for (i = 0; i < 128; i++)
899 | 128 |     ha->data.f32[i] = ha1->data.f32[i];
900 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(a, w, bias), 0);
901 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
902 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
903 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
904 | 1 |   cmd.algorithm = 1; // This is cblas.
905 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
906 | 1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
907 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
908 | 1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
909 | 65 |   for (i = 0; i < 64; i++)
910 | 64 |     tb1->data.f32[i] = tb->data.f32[i];
911 | 1 |   REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
912 | 1 |   ccv_nnc_tensor_free(a);
913 | 1 |   ccv_nnc_tensor_free(w);
914 | 1 |   ccv_nnc_tensor_free(bias);
915 | 1 |   ccv_nnc_tensor_free(b);
916 | 1 |   ccv_nnc_tensor_free(tb);
917 | 1 |   ccv_nnc_tensor_free(ha);
918 | 1 |   ccv_nnc_tensor_free(ha1);
919 | 1 |   ccv_nnc_tensor_free(tb1);
920 | 1 |   ccv_nnc_tensor_free(hw);
921 | 1 |   ccv_nnc_tensor_free(hbias);
922 | 1 |   ccv_nnc_tensor_free(hb);
923 | 1 | }
924 |  |
925 |  | TEST_CASE("cublas forward gemm no bias")
926 | 1 | {
927 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT));
928 | 1 |   dsfmt_t dsfmt;
929 | 1 |   dsfmt_init_gen_rand(&dsfmt, 0);
930 | 1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
931 | 1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
932 | 1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
933 | 1 |
934 | 1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
935 | 1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
936 | 1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
937 | 1 |   int i;
938 | 8.19k |   for (i = 0; i < 64 * 128; i++)
939 | 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
940 | 1 |   ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
941 | 1.28k |   for (i = 0; i < 10 * 128; i++)
942 | 1.28k |     ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
943 | 129 |   for (i = 0; i < 128; i++)
944 | 128 |     ha->data.f32[i] = ha1->data.f32[i];
945 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(a, w), 0);
946 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
947 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
948 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
949 | 1 |   cmd.algorithm = 1; // This is cblas.
950 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
951 | 1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
952 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
953 | 1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
954 | 65 |   for (i = 0; i < 64; i++)
955 | 64 |     tb1->data.f32[i] = tb->data.f32[i];
956 | 1 |   REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
957 | 1 |   ccv_nnc_tensor_free(a);
958 | 1 |   ccv_nnc_tensor_free(w);
959 | 1 |   ccv_nnc_tensor_free(b);
960 | 1 |   ccv_nnc_tensor_free(tb);
961 | 1 |   ccv_nnc_tensor_free(ha);
962 | 1 |   ccv_nnc_tensor_free(ha1);
963 | 1 |   ccv_nnc_tensor_free(tb1);
964 | 1 |   ccv_nnc_tensor_free(hw);
965 | 1 |   ccv_nnc_tensor_free(hb);
966 | 1 | }
967 |  |
968 |  | TEST_CASE("cublas backward gemm")
969 | 1 | {
970 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT) &&
971 | 1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT));
972 | 1 |   dsfmt_t dsfmt;
973 | 1 |   dsfmt_init_gen_rand(&dsfmt, 0);
974 | 1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
975 | 1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
976 | 1 |   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
977 | 1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
978 | 1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
979 | 1 |   ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
980 | 1 |   ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
981 | 1 |   ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
982 | 1 |
983 | 1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
984 | 1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
985 | 1 |   ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
986 | 1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
987 | 1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
988 | 1 |   ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
989 | 1 |   ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
990 | 1 |   ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
991 | 1 |   int i;
992 | 8.19k |   for (i = 0; i < 64 * 128; i++)
993 | 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
994 | 65 |   for (i = 0; i < 64; i++)
995 | 64 |     hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
996 | 1.28k |   for (i = 0; i < 10 * 128; i++)
997 | 1.28k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
998 | 641 |   for (i = 0; i < 10 * 64; i++)
999 | 640 |     hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1000 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
1001 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1002 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
1003 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
1004 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
1005 | 1 |   cmd.algorithm = 1; // This is cblas.
1006 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1007 | 1 |   cmd = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
1008 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
1009 | 1 |   cmd.algorithm = 1; // This is cblas.
1010 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
1011 | 1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1012 | 1 |   ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1013 | 1 |   ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1014 | 1 |   ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1015 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
1016 | 1 |   REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
1017 | 1 |   REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
1018 | 1 |   REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
1019 | 1 |   REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
1020 | 1 |   ccv_nnc_tensor_free(a);
1021 | 1 |   ccv_nnc_tensor_free(w);
1022 | 1 |   ccv_nnc_tensor_free(bias);
1023 | 1 |   ccv_nnc_tensor_free(b);
1024 | 1 |   ccv_nnc_tensor_free(g);
1025 | 1 |   ccv_nnc_tensor_free(dw);
1026 | 1 |   ccv_nnc_tensor_free(dbias);
1027 | 1 |   ccv_nnc_tensor_free(h);
1028 | 1 |   ccv_nnc_tensor_free(ha);
1029 | 1 |   ccv_nnc_tensor_free(hw);
1030 | 1 |   ccv_nnc_tensor_free(hbias);
1031 | 1 |   ccv_nnc_tensor_free(hb);
1032 | 1 |   ccv_nnc_tensor_free(hg);
1033 | 1 |   ccv_nnc_tensor_free(hdw);
1034 | 1 |   ccv_nnc_tensor_free(hdbias);
1035 | 1 |   ccv_nnc_tensor_free(hh);
1036 | 1 |   ccv_nnc_tensor_free(tb);
1037 | 1 |   ccv_nnc_tensor_free(th);
1038 | 1 |   ccv_nnc_tensor_free(tdw);
1039 | 1 |   ccv_nnc_tensor_free(tdbias);
1040 | 1 | }
1041 |  |
1042 |  | TEST_CASE("cublas backward gemm no bias")
1043 | 1 | {
1044 | 1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT) &&
1045 | 1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT));
1046 | 1 |   dsfmt_t dsfmt;
1047 | 1 |   dsfmt_init_gen_rand(&dsfmt, 0);
1048 | 1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1049 | 1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1050 | 1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1051 | 1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1052 | 1 |   ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1053 | 1 |   ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1054 | 1 |
1055 | 1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1056 | 1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1057 | 1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1058 | 1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1059 | 1 |   ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1060 | 1 |   ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1061 | 1 |   int i;
1062 | 8.19k |   for (i = 0; i < 64 * 128; i++)
1063 | 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1064 | 1.28k |   for (i = 0; i < 10 * 128; i++)
1065 | 1.28k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1066 | 641 |   for (i = 0; i < 10 * 64; i++)
1067 | 640 |     hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1068 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);
1069 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
1070 | 1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
1071 | 1 |   ccv_nnc_cmd_t cmd = CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
1072 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
1073 | 1 |   cmd.algorithm = 1; // This is cblas.
1074 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
1075 | 1 |   cmd = CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1));
1076 | 1 |   cmd.backend = CCV_NNC_BACKEND_CPU_OPT;
1077 | 1 |   cmd.algorithm = 1; // This is cblas.
1078 | 1 |   ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
1079 | 1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1080 | 1 |   ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1081 | 1 |   ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1082 | 1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
1083 | 1 |   REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
1084 | 1 |   REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
1085 | 1 |   REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
1086 | 1 |   ccv_nnc_tensor_free(a);
1087 | 1 |   ccv_nnc_tensor_free(w);
1088 | 1 |   ccv_nnc_tensor_free(b);
1089 | 1 |   ccv_nnc_tensor_free(g);
1090 | 1 |   ccv_nnc_tensor_free(dw);
1091 | 1 |   ccv_nnc_tensor_free(h);
1092 | 1 |   ccv_nnc_tensor_free(ha);
1093 | 1 |   ccv_nnc_tensor_free(hw);
1094 | 1 |   ccv_nnc_tensor_free(hb);
1095 | 1 |   ccv_nnc_tensor_free(hg);
1096 | 1 |   ccv_nnc_tensor_free(hdw);
1097 | 1 |   ccv_nnc_tensor_free(hh);
1098 | 1 |   ccv_nnc_tensor_free(tb);
1099 | 1 |   ccv_nnc_tensor_free(th);
1100 | 1 |   ccv_nnc_tensor_free(tdw);
1101 | 1 | }
1102 |  |
1103 |  | #endif
1104 |  |
1105 |  | #include "case_main.h"