Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/test/int/nnc/cublas.tests.c

 Count | Source
       | #include "case.h"
       | #include "ccv_case.h"
       | #include "ccv_nnc_case.h"
       | #include <ccv.h>
       | #include <nnc/ccv_nnc.h>
       | #include <nnc/ccv_nnc_easy.h>
       | #include <3rdparty/dsfmt/dSFMT.h>
       |
       | TEST_SETUP()
       | {
       |   ccv_nnc_init();
       | }
       |
       | TEST_CASE("cublas forward gemm")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
    65 |   for (i = 0; i < 64; i++)
    64 |     hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   129 |   for (i = 0; i < 128; i++)
   128 |     ha->data.f32[i] = ha1->data.f32[i];
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(a, w, bias), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
     1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
    65 |   for (i = 0; i < 64; i++)
    64 |     tb1->data.f32[i] = tb->data.f32[i];
     1 |   REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(bias);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(ha1);
     1 |   ccv_nnc_tensor_free(tb1);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hbias);
     1 |   ccv_nnc_tensor_free(hb);
     1 | }
       |
       | TEST_CASE("cublas forward gemm in half precision")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
    65 |   for (i = 0; i < 64; i++)
    64 |     hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   129 |   for (i = 0; i < 128; i++)
   128 |     ha->data.f32[i] = ha1->data.f32[i];
     1 |   ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
     1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(bias);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(ha1);
     1 |   ccv_nnc_tensor_free(tb1);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hbias);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(ha2);
     1 |   ccv_nnc_tensor_free(hw2);
     1 |   ccv_nnc_tensor_free(hbias2);
     1 | }
       |
       | TEST_CASE("cublas forward gemm no bias")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
     1 |   ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   129 |   for (i = 0; i < 128; i++)
   128 |     ha->data.f32[i] = ha1->data.f32[i];
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(a, w), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
     1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
    65 |   for (i = 0; i < 64; i++)
    64 |     tb1->data.f32[i] = tb->data.f32[i];
     1 |   REQUIRE_TENSOR_EQ(tb1, hb, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(ha1);
     1 |   ccv_nnc_tensor_free(tb1);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hb);
     1 | }
       |
       | TEST_CASE("cublas forward gemm no bias in half precision")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
     1 |   ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   129 |   for (i = 0; i < 128; i++)
   128 |     ha->data.f32[i] = ha1->data.f32[i];
     1 |   ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
     1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(ha1);
     1 |   ccv_nnc_tensor_free(tb1);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(ha2);
     1 |   ccv_nnc_tensor_free(hw2);
     1 | }
       |
       | TEST_CASE("cublas backward gemm")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
     1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
     1 |   ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
    65 |   for (i = 0; i < 64; i++)
    64 |     hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   641 |   for (i = 0; i < 10 * 64; i++)
   640 |     hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
     1 |   REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_TENSOR_EQ(tdbias, hdbias, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(bias);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(g);
     1 |   ccv_nnc_tensor_free(dw);
     1 |   ccv_nnc_tensor_free(dbias);
     1 |   ccv_nnc_tensor_free(h);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hbias);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(hg);
     1 |   ccv_nnc_tensor_free(hdw);
     1 |   ccv_nnc_tensor_free(hdbias);
     1 |   ccv_nnc_tensor_free(hh);
     1 |   ccv_nnc_tensor_free(tb);
     1 |   ccv_nnc_tensor_free(th);
     1 |   ccv_nnc_tensor_free(tdw);
     1 |   ccv_nnc_tensor_free(tdbias);
     1 | }
       |
       | TEST_CASE("cublas backward gemm in half precision")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
     1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
     1 |   ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
    65 |   for (i = 0; i < 64; i++)
    64 |     hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   641 |   for (i = 0; i < 10 * 64; i++)
   640 |     hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
     1 |   ccv_nnc_tensor_t* hg2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(ha2, hw2, hbias2, hg2), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2, hg2), TENSOR_LIST(a, w, bias, g), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
     1 |   ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
     1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* tdw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* tdbias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
     1 |   ccv_nnc_tensor_t* th1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb, tdw, tdbias, th), TENSOR_LIST(tb1, tdw1, tdbias1, th1), 0);
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 10 * 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw1->data.f32, hdw->data.f32, 64 * 128, 1e-2, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias1->data.f32, hdbias->data.f32, 64, 5e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th1->data.f32, hh->data.f32, 10 * 128, 1e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(bias);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(g);
     1 |   ccv_nnc_tensor_free(dw);
     1 |   ccv_nnc_tensor_free(dbias);
     1 |   ccv_nnc_tensor_free(h);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hbias);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(hg);
     1 |   ccv_nnc_tensor_free(hdw);
     1 |   ccv_nnc_tensor_free(hdbias);
     1 |   ccv_nnc_tensor_free(hh);
     1 |   ccv_nnc_tensor_free(tb);
     1 |   ccv_nnc_tensor_free(th);
     1 |   ccv_nnc_tensor_free(tdw);
     1 |   ccv_nnc_tensor_free(tdbias);
     1 |   ccv_nnc_tensor_free(ha2);
     1 |   ccv_nnc_tensor_free(hw2);
     1 |   ccv_nnc_tensor_free(hbias2);
     1 |   ccv_nnc_tensor_free(hg2);
     1 |   ccv_nnc_tensor_free(tb1);
     1 |   ccv_nnc_tensor_free(tdw1);
     1 |   ccv_nnc_tensor_free(tdbias1);
     1 |   ccv_nnc_tensor_free(th1);
     1 | }
       |
       | TEST_CASE("cublas backward gemm no bias")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
     1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   641 |   for (i = 0; i < 10 * 64; i++)
   640 |     hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
     1 |   REQUIRE_TENSOR_EQ(tb, hb, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_TENSOR_EQ(tdw, hdw, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_TENSOR_EQ(th, hh, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(g);
     1 |   ccv_nnc_tensor_free(dw);
     1 |   ccv_nnc_tensor_free(h);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(hg);
     1 |   ccv_nnc_tensor_free(hdw);
     1 |   ccv_nnc_tensor_free(hh);
     1 |   ccv_nnc_tensor_free(tb);
     1 |   ccv_nnc_tensor_free(th);
     1 |   ccv_nnc_tensor_free(tdw);
     1 | }
       |
       | TEST_CASE("cublas backward gemm no bias in half precision")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_GPU_CUBLAS) &&
     1 |     ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_GPU_CUBLAS));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
     1 |
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   int i;
 8.19k |   for (i = 0; i < 64 * 128; i++)
 8.19k |     hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
 1.28k |   for (i = 0; i < 10 * 128; i++)
 1.28k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
   641 |   for (i = 0; i < 10 * 64; i++)
   640 |     hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* hg2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(ha2, hw2, hg2), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hg2), TENSOR_LIST(a, w, g), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(64), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
     1 |   ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
     1 |   ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
     1 |   ccv_nnc_tensor_t* tdw1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
     1 |   ccv_nnc_tensor_t* th1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb, tdw, th), TENSOR_LIST(tb1, tdw1, th1), 0);
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 10 * 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw1->data.f32, hdw->data.f32, 64 * 128, 1e-2, "GPU computed output should be the same as CPU computed ones");
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th1->data.f32, hh->data.f32, 10 * 128, 1e-3, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(w);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(g);
     1 |   ccv_nnc_tensor_free(dw);
     1 |   ccv_nnc_tensor_free(h);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hw);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(hg);
     1 |   ccv_nnc_tensor_free(hdw);
     1 |   ccv_nnc_tensor_free(hh);
     1 |   ccv_nnc_tensor_free(tb);
     1 |   ccv_nnc_tensor_free(th);
     1 |   ccv_nnc_tensor_free(tdw);
     1 |   ccv_nnc_tensor_free(ha2);
     1 |   ccv_nnc_tensor_free(hw2);
     1 |   ccv_nnc_tensor_free(hg2);
     1 |   ccv_nnc_tensor_free(tb1);
     1 |   ccv_nnc_tensor_free(tdw1);
     1 |   ccv_nnc_tensor_free(th1);
     1 | }
       |
       | TEST_CASE("cross entropy loss forward")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_REF));
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
     1 |   ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
     1 |   ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   int i = 0;
 1.00k |   for (i = 0; i < 1000; i++)
 1.00k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
    11 |   for (i = 0; i < 10; i++)
    10 |     hb->data.f32[i] = (i + 1) * 9;
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
     1 |   ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
     1 |   ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
     1 |   ccv_nnc_tensor_t* tc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(tc), 0);
     1 |   REQUIRE_TENSOR_EQ(tc, hc, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(c);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(hc);
     1 |   ccv_nnc_tensor_free(tc);
     1 | }
       |
       | TEST_CASE("cross entropy loss backward")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_GPU_REF) &&
     1 |     ccv_nnc_cmd_ok(CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_GPU_REF));
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
     1 |   ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
     1 |   ccv_nnc_tensor_t* d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 100), 0);
     1 |   ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10), 0);
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
     1 |   ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
     1 |   ccv_nnc_tensor_t* hd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
     1 |   ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   int i = 0;
 1.00k |   for (i = 0; i < 1000; i++)
 1.00k |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
    11 |   for (i = 0; i < 10; i++)
    10 |     hb->data.f32[i] = (i + 1) * 9;
    11 |   for (i = 0; i < 10; i++)
    10 |     hg->data.f32[i] = 1;
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb, hg), TENSOR_LIST(a, b, g), 0);
     1 |   ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(hc), 0);
     1 |   ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hb), TENSOR_LIST(hd), 0);
     1 |   ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
     1 |   ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(d), 0);
     1 |   ccv_nnc_tensor_t* td = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 100), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(d), TENSOR_LIST(td), 0);
     1 |   REQUIRE_TENSOR_EQ(td, hd, "GPU computed output should be the same as CPU computed ones");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(c);
     1 |   ccv_nnc_tensor_free(d);
     1 |   ccv_nnc_tensor_free(g);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(hc);
     1 |   ccv_nnc_tensor_free(hd);
     1 |   ccv_nnc_tensor_free(hg);
     1 |   ccv_nnc_tensor_free(td);
     1 | }
       |
       | TEST_CASE("random uniform distribution")
     1 | {
     1 |   ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
     1 |   const ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 100000), "x");
     1 |   ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RANDOM_UNIFORM_FORWARD(-8, 4), TENSOR_SYMBOL_LIST(), TENSOR_SYMBOL_LIST(x), "random uniform");
     1 |   ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
     1 |   SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
     1 |   ccv_nnc_graph_t* graph = 0;
     1 |   ccv_nnc_tensor_arena_t* tensor_arena = 0;
     1 |   ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
     1 |   ccv_nnc_symbolic_graph_compile(symbolic_graph, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
     1 |   GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
     1 |   ccv_nnc_graph_run(graph, 0, 0, 0, TRAVERSE_FULL);
     1 |   ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
     1 |   ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100000), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(xt), 0);
     1 |   int i;
     1 |   int h[4 + 8] = {};
  100k |   for (i = 0; i < 100000; i++)
  100k |   {
  100k |     REQUIRE(xt->data.f32[i] > -8 - 1e-5, "it must be bigger than lower bound");
  100k |     REQUIRE(xt->data.f32[i] < 4 + 1e-5, "and smaller than upper bound");
  100k |     int b = (int)roundf(xt->data.f32[i] - 0.5) + 8;
  100k |     b = ccv_max(ccv_min(b, 11), 0);
  100k |     ++h[b];
  100k |   }
     1 |   const int count = (int)roundf(100000. / (4 + 8));
    13 |   for (i = 0; i < 12; i++)
    12 |     { REQUIRE(h[i] >= count - 1000 && h[i] <= count + 1000, "uniform distribution"); }
     1 |   ccv_nnc_tensor_free(xt);
     1 |   ccv_nnc_graph_free(graph);
     1 |   ccv_nnc_tensor_arena_free(tensor_arena);
     1 |   ccv_nnc_graph_exec_arena_free(graph_exec_arena);
     1 |   ccv_nnc_symbolic_graph_free(symbolic_graph);
     1 | }
       |
       | TEST_CASE("random uniform distribution in half precision")
     1 | {
     1 |   ccv_nnc_symbolic_graph_t* symbolic_graph = ccv_nnc_symbolic_graph_new();
     1 |   const ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NHWC(000, 16F, 100000), "x");
     1 |   ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_RANDOM_UNIFORM_FORWARD(-8, 4), TENSOR_SYMBOL_LIST(), TENSOR_SYMBOL_LIST(x), "random uniform");
     1 |   ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
     1 |   SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
     1 |   ccv_nnc_graph_t* graph = 0;
     1 |   ccv_nnc_tensor_arena_t* tensor_arena = 0;
     1 |   ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
     1 |   ccv_nnc_symbolic_graph_compile(symbolic_graph, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
     1 |   GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
     1 |   ccv_nnc_graph_run(graph, 0, 0, 0, TRAVERSE_FULL);
     1 |   ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
     1 |   ccv_nnc_tensor_t* const x16t = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 100000), 0);
     1 |   ccv_nnc_tensor_t* const xt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 100000), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(x16t), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x16t), TENSOR_LIST(xt), 0);
     1 |   int i;
     1 |   int h[4 + 8] = {};
  100k |   for (i = 0; i < 100000; i++)
  100k |   {
  100k |     REQUIRE(xt->data.f32[i] > -8 - 1e-5, "it must be bigger than lower bound");
  100k |     REQUIRE(xt->data.f32[i] < 4 + 1e-5, "and smaller than upper bound");
  100k |     int b = (int)roundf(xt->data.f32[i] - 0.5) + 8;
  100k |     b = ccv_max(ccv_min(b, 11), 0);
  100k |     ++h[b];
  100k |   }
     1 |   const int count = (int)roundf(100000. / (4 + 8));
    13 |   for (i = 0; i < 12; i++)
    12 |     { REQUIRE(h[i] >= count - 1000 && h[i] <= count + 1000, "uniform distribution"); }
     1 |   ccv_nnc_tensor_free(xt);
     1 |   ccv_nnc_tensor_free(x16t);
     1 |   ccv_nnc_graph_free(graph);
     1 |   ccv_nnc_tensor_arena_free(tensor_arena);
     1 |   ccv_nnc_graph_exec_arena_free(graph_exec_arena);
     1 |   ccv_nnc_symbolic_graph_free(symbolic_graph);
     1 | }
       |
       | TEST_CASE("data conversion from float to half precision")
     1 | {
     1 |   GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_GPU_REF));
     1 |   dsfmt_t dsfmt;
     1 |   dsfmt_init_gen_rand(&dsfmt, 0);
     1 |   int i;
     1 |   ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
   129 |   for (i = 0; i < 128; i++)
   128 |     ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
     1 |   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 128), 0);
     1 |   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
     1 |   ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(hb), 0);
     1 |   ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
     1 |   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(bt), 0);
     1 |   REQUIRE_ARRAY_EQ_WITH_TOLERANCE(short, (short*)hb->data.f16, (short*)bt->data.f16, 128, 1, "Result should be exactly equal");
     1 |   ccv_nnc_tensor_free(a);
     1 |   ccv_nnc_tensor_free(b);
     1 |   ccv_nnc_tensor_free(ha);
     1 |   ccv_nnc_tensor_free(hb);
     1 |   ccv_nnc_tensor_free(bt);
     1 | }
       |
       | #include "case_main.h"