Coverage Report

Created: 2024-06-09 19:03

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/palettize.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include "3rdparty/dsfmt/dSFMT.h"
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly")
15
{
16
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
17
  double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
18
  double* const values = ccmalloc(sizeof(double) * 2839);
19
  int i;
20
  for (i = 0; i < 2839; i++)
21
    values[i] = lut[i % 16];
22
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0);
23
  uint8_t* compressed = tensor->data.u8;
24
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944);
25
  REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
26
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0);
27
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
28
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
29
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839);
30
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
31
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
32
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
33
  ccfree(values);
34
  ccv_nnc_tensor_free(tensor);
35
  ccv_nnc_tensor_free(g_tensor);
36
  ccv_nnc_tensor_free(gv_tensor);
37
  ccv_nnc_tensor_free(v_tensor);
38
}
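The expected size 1420 + 2944 asserted above follows from the call's parameters. Below is a minimal sketch of that arithmetic, assuming ccv_nnc_palettize emits ceil(count * qbits / 8) bytes of packed indices followed by one 2^qbits-entry palette per block of number_in_blocks elements; the helper expected_palettized_size is illustrative and not part of ccv.

#include <stdio.h>
#include <stddef.h>

/* Sketch only: expected size under the assumed packed-indices-plus-per-block-palette layout. */
static size_t expected_palettized_size(size_t count, unsigned qbits, size_t number_in_blocks, size_t element_size)
{
  const size_t index_bytes = (count * qbits + 7) / 8;
  const size_t blocks = (count + number_in_blocks - 1) / number_in_blocks;
  const size_t palette_bytes = blocks * ((size_t)1 << qbits) * element_size;
  return index_bytes + palette_bytes;
}

int main(void)
{
  /* 2839 doubles, 4-bit indices, 128 elements per block:
     1420 index bytes + 23 * 16 * 8 = 2944 palette bytes. */
  printf("%zu\n", expected_palettized_size(2839, 4, 128, sizeof(double))); /* 4364 = 1420 + 2944 */
  return 0;
}

The float and half variants of this test only change the element size (4 and 2 bytes), which is why their expected sizes read 1420 + 2944 / 2 and 1420 + 2944 / 4.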
39
40
TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly")
41
1
{
42
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
43
1
  float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
44
1
  float* const values = ccmalloc(sizeof(float) * 2839);
45
1
  int i;
46
2.84k
  for (i = 0; i < 2839; i++)
47
2.83k
    values[i] = lut[i % 16];
48
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0);
49
1
  uint8_t* compressed = tensor->data.u8;
50
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 2);
51
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
52
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0);
53
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
54
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
55
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839);
56
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
57
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
58
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
59
1
  ccfree(values);
60
1
  ccv_nnc_tensor_free(tensor);
61
1
  ccv_nnc_tensor_free(g_tensor);
62
1
  ccv_nnc_tensor_free(gv_tensor);
63
1
  ccv_nnc_tensor_free(v_tensor);
64
1
}
65
66
TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly")
67
1
{
68
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
69
1
  float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
70
1
  uint16_t lut[16];
71
1
  ccv_float_to_half_precision(lut_f32, lut, 16);
72
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
73
1
  int i;
74
2.84k
  for (i = 0; i < 2839; i++)
75
2.83k
    values[i] = lut[i % 16];
76
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0);
77
1
  uint8_t* compressed = tensor->data.u8;
78
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 4);
79
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
80
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0);
81
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
82
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
83
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839);
84
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
85
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
86
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
87
1
  ccfree(values);
88
1
  ccv_nnc_tensor_free(tensor);
89
1
  ccv_nnc_tensor_free(g_tensor);
90
1
  ccv_nnc_tensor_free(gv_tensor);
91
1
  ccv_nnc_tensor_free(v_tensor);
92
1
}
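Note that the half-precision tests compare values and v_tensor->data.f16 as raw uint16_t rather than converting back to float. Since palettization only copies lookup-table entries, bit-exact equality of the 16-bit patterns is the right notion of lossless here, and it presumably also avoids doing any half-float arithmetic on the CPU side.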
93
94
TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly")
95
1
{
96
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
97
1
  double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
98
1
  double* const values = ccmalloc(sizeof(double) * 2839);
99
1
  int i;
100
2.84k
  for (i = 0; i < 2839; i++)
101
2.83k
    values[i] = lut[i % 32];
102
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
103
1
  uint8_t* compressed = tensor->data.u8;
104
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 8);
105
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
106
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
107
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
108
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
109
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839);
110
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
111
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
112
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
113
1
  ccfree(values);
114
1
  ccv_nnc_tensor_free(tensor);
115
1
  ccv_nnc_tensor_free(g_tensor);
116
1
  ccv_nnc_tensor_free(gv_tensor);
117
1
  ccv_nnc_tensor_free(v_tensor);
118
1
}
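For the 5-bit cases the same arithmetic as in the sketch above yields the constants used here: ceil(2839 * 5 / 8) = 1775 index bytes, and ceil(2839 / 128) = 23 blocks each carrying a 2^5 = 32-entry palette, hence 23 * 32 * 8 palette bytes for double (and 23 * 32 * 4 and 23 * 32 * 2 for the float and half variants that follow).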
119
120
TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly")
121
1
{
122
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
123
1
  float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
124
1
  float* const values = ccmalloc(sizeof(float) * 2839);
125
1
  int i;
126
2.84k
  for (i = 0; i < 2839; i++)
127
2.83k
    values[i] = lut[i % 32];
128
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
129
1
  uint8_t* compressed = tensor->data.u8;
130
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 4);
131
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
132
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
133
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
134
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
135
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839);
136
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
137
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
138
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
139
1
  ccfree(values);
140
1
  ccv_nnc_tensor_free(tensor);
141
1
  ccv_nnc_tensor_free(g_tensor);
142
1
  ccv_nnc_tensor_free(gv_tensor);
143
1
  ccv_nnc_tensor_free(v_tensor);
144
1
}
145
146
TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly")
147
{
148
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
149
  float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
150
  uint16_t lut[32];
151
  ccv_float_to_half_precision(lut_f32, lut, 32);
152
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
153
  int i;
154
  for (i = 0; i < 2839; i++)
155
    values[i] = lut[i % 32];
156
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
157
  uint8_t* compressed = tensor->data.u8;
158
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 2);
159
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
160
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
161
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
162
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
163
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839);
164
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
165
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
166
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
167
  ccfree(values);
168
  ccv_nnc_tensor_free(tensor);
169
  ccv_nnc_tensor_free(g_tensor);
170
  ccv_nnc_tensor_free(gv_tensor);
171
  ccv_nnc_tensor_free(v_tensor);
172
}
173
174
TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly")
175
1
{
176
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
177
1
  double lut[64];
178
1
  int i;
179
65
  for (i = 0; i < 64; i++)
180
64
    lut[i] = (double)i;
181
1
  double* const values = ccmalloc(sizeof(double) * 2839);
182
2.84k
  for (i = 0; i < 2839; i++)
183
2.83k
    values[i] = lut[i % 64];
184
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
185
1
  uint8_t* compressed = tensor->data.u8;
186
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 8);
187
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
188
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
189
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
190
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
191
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839);
192
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
193
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
194
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
195
1
  ccfree(values);
196
1
  ccv_nnc_tensor_free(tensor);
197
1
  ccv_nnc_tensor_free(g_tensor);
198
1
  ccv_nnc_tensor_free(gv_tensor);
199
1
  ccv_nnc_tensor_free(v_tensor);
200
1
}
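The 6-bit cases raise number_in_blocks to 512, so ceil(2839 / 512) = 6 blocks; packed indices take ceil(2839 * 6 / 8) = 2130 bytes and the palettes 6 * 64 * sizeof(element) bytes, which is where 2130 + 6 * 64 * 8 (double), 6 * 64 * 4 (float) and 6 * 64 * 2 (half) come from, again under the layout assumed in the sketch above.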
201
202
TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly")
203
1
{
204
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
205
1
  float lut[64];
206
1
  int i;
207
65
  for (i = 0; i < 64; i++)
208
64
    lut[i] = (float)i;
209
1
  float* const values = ccmalloc(sizeof(float) * 2839);
210
2.84k
  for (i = 0; i < 2839; i++)
211
2.83k
    values[i] = lut[i % 64];
212
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
213
1
  uint8_t* compressed = tensor->data.u8;
214
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 4);
215
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match");
216
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
217
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
218
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
219
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839);
220
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
221
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
222
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
223
1
  ccfree(values);
224
1
  ccv_nnc_tensor_free(tensor);
225
1
  ccv_nnc_tensor_free(g_tensor);
226
1
  ccv_nnc_tensor_free(gv_tensor);
227
1
  ccv_nnc_tensor_free(v_tensor);
228
1
}
229
230
TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly")
231
1
{
232
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
233
1
  float lut_f32[64];
234
1
  int i;
235
65
  for (i = 0; i < 64; i++)
236
64
    lut_f32[i] = (float)i;
237
1
  uint16_t lut[64];
238
1
  ccv_float_to_half_precision(lut_f32, lut, 64);
239
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
240
2.84k
  for (i = 0; i < 2839; i++)
241
2.83k
    values[i] = lut[i % 64];
242
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 2 + 3) / 4), 0);
243
1
  uint8_t* compressed = tensor->data.u8;
244
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 2);
245
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match");
246
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 2 + 3) / 4), 0);
247
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
248
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
249
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839);
250
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
251
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
252
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
253
1
  ccfree(values);
254
1
  ccv_nnc_tensor_free(tensor);
255
1
  ccv_nnc_tensor_free(g_tensor);
256
1
  ccv_nnc_tensor_free(gv_tensor);
257
1
  ccv_nnc_tensor_free(v_tensor);
258
1
}
259
260
TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly")
261
1
{
262
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
263
1
  double lut[128];
264
1
  int i;
265
129
  for (i = 0; i < 128; i++)
266
128
    lut[i] = (double)i;
267
1
  double* const values = ccmalloc(sizeof(double) * 2839);
268
2.84k
  for (i = 0; i < 2839; i++)
269
2.83k
    values[i] = lut[i % 128];
270
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
271
1
  uint8_t* compressed = tensor->data.u8;
272
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 8);
273
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match");
274
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
275
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
276
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
277
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839);
278
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
279
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
280
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
281
1
  ccfree(values);
282
1
  ccv_nnc_tensor_free(tensor);
283
1
  ccv_nnc_tensor_free(g_tensor);
284
1
  ccv_nnc_tensor_free(gv_tensor);
285
1
  ccv_nnc_tensor_free(v_tensor);
286
1
}
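Likewise for 7-bit with number_in_blocks of 512: ceil(2839 * 7 / 8) = 2485 index bytes plus 6 blocks of 2^7 = 128 palette entries, i.e. 2485 + 6 * 128 * sizeof(element), matching the expected sizes asserted in this group.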
287
288
TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly")
289
1
{
290
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
291
1
  float lut[128];
292
1
  int i;
293
129
  for (i = 0; i < 128; i++)
294
128
    lut[i] = (float)i;
295
1
  float* const values = ccmalloc(sizeof(float) * 2839);
296
2.84k
  for (i = 0; i < 2839; i++)
297
2.83k
    values[i] = lut[i % 128];
298
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
299
1
  uint8_t* compressed = tensor->data.u8;
300
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 4);
301
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match");
302
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
303
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
304
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
305
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839);
306
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
307
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
308
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
309
1
  ccfree(values);
310
1
  ccv_nnc_tensor_free(tensor);
311
1
  ccv_nnc_tensor_free(g_tensor);
312
1
  ccv_nnc_tensor_free(gv_tensor);
313
1
  ccv_nnc_tensor_free(v_tensor);
314
1
}
315
316
TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly")
317
1
{
318
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
319
1
  float lut_f32[128];
320
1
  int i;
321
129
  for (i = 0; i < 128; i++)
322
128
    lut_f32[i] = (float)i;
323
1
  uint16_t lut[128];
324
1
  ccv_float_to_half_precision(lut_f32, lut, 128);
325
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
326
2.84k
  for (i = 0; i < 2839; i++)
327
2.83k
    values[i] = lut[i % 128];
328
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
329
1
  uint8_t* compressed = tensor->data.u8;
330
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 2);
331
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match");
332
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
333
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
334
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
335
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839);
336
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
337
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
338
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
339
1
  ccfree(values);
340
1
  ccv_nnc_tensor_free(tensor);
341
1
  ccv_nnc_tensor_free(g_tensor);
342
1
  ccv_nnc_tensor_free(gv_tensor);
343
1
  ccv_nnc_tensor_free(v_tensor);
344
1
}
345
346
TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly")
347
1
{
348
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
349
1
  double lut[256];
350
1
  int i;
351
257
  for (i = 0; i < 256; i++)
352
256
    lut[i] = (double)i;
353
1
  double* const values = ccmalloc(sizeof(double) * 2839);
354
2.84k
  for (i = 0; i < 2839; i++)
355
2.83k
    values[i] = lut[i % 256];
356
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 8 + 3) / 4), 0);
357
1
  uint8_t* compressed = tensor->data.u8;
358
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 8);
359
1
  REQUIRE_EQ(output_size, 2839 + 3 * 256 * 8, "output size should match");
360
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 8 + 3) / 4), 0);
361
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
362
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
363
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839);
364
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
365
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
366
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
367
1
  ccfree(values);
368
1
  ccv_nnc_tensor_free(tensor);
369
1
  ccv_nnc_tensor_free(g_tensor);
370
1
  ccv_nnc_tensor_free(gv_tensor);
371
1
  ccv_nnc_tensor_free(v_tensor);
372
1
}
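At 8 bits each index occupies a whole byte, so the index section is exactly 2839 bytes; with number_in_blocks of 1280, ceil(2839 / 1280) = 3 blocks contribute 3 * 256 * sizeof(element) palette bytes, giving 2839 + 3 * 256 * 8 for double and the corresponding float and half totals, assuming the same layout as before.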
373
374
TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly")
375
1
{
376
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
377
1
  float lut[256];
378
1
  int i;
379
257
  for (i = 0; i < 256; i++)
380
256
    lut[i] = (float)i;
381
1
  float* const values = ccmalloc(sizeof(float) * 2839);
382
2.84k
  for (i = 0; i < 2839; i++)
383
2.83k
    values[i] = lut[i % 256];
384
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 4 + 3) / 4), 0);
385
1
  uint8_t* compressed = tensor->data.u8;
386
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 4);
387
1
  REQUIRE_EQ(output_size, 2839 + 3 * 256 * 4, "output size should match");
388
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 4 + 3) / 4), 0);
389
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
390
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
391
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839);
392
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
393
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
394
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
395
1
  ccfree(values);
396
1
  ccv_nnc_tensor_free(tensor);
397
1
  ccv_nnc_tensor_free(g_tensor);
398
1
  ccv_nnc_tensor_free(gv_tensor);
399
1
  ccv_nnc_tensor_free(v_tensor);
400
1
}
401
402
TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly")
403
1
{
404
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
405
1
  float lut_f32[256];
406
1
  int i;
407
257
  for (i = 0; i < 256; i++)
408
256
    lut_f32[i] = (float)i;
409
1
  uint16_t lut[256];
410
1
  ccv_float_to_half_precision(lut_f32, lut, 256);
411
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
412
2.84k
  for (i = 0; i < 2839; i++)
413
2.83k
    values[i] = lut[i % 256];
414
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 2 + 3) / 4), 0);
415
1
  uint8_t* compressed = tensor->data.u8;
416
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 2);
417
1
  REQUIRE_EQ(output_size, 2839 + 3 * 256 * 2, "output size should match");
418
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 2 + 3) / 4), 0);
419
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
420
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
421
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839);
422
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
423
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
424
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
425
1
  ccfree(values);
426
1
  ccv_nnc_tensor_free(tensor);
427
1
  ccv_nnc_tensor_free(g_tensor);
428
1
  ccv_nnc_tensor_free(gv_tensor);
429
1
  ccv_nnc_tensor_free(v_tensor);
430
1
}
431
432
TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly, fast path")
433
1
{
434
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
435
1
  double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
436
1
  double* const values = ccmalloc(sizeof(double) * 2840);
437
1
  int i;
438
2.84k
  for (i = 0; i < 2840; i++)
439
2.84k
    values[i] = lut[i % 16];
440
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0);
441
1
  uint8_t* compressed = tensor->data.u8;
442
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944);
443
1
  REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
444
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0);
445
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
446
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
447
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840);
448
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
449
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
450
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
451
1
  ccfree(values);
452
1
  ccv_nnc_tensor_free(tensor);
453
1
  ccv_nnc_tensor_free(g_tensor);
454
1
  ccv_nnc_tensor_free(gv_tensor);
455
1
  ccv_nnc_tensor_free(v_tensor);
456
1
}
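The "fast path" variants repeat the same round trips with 2840 elements (and, further down, 8192 elements with larger blocks) instead of 2839. 2840 is divisible by 8, so the packed index stream ends on a byte boundary at every tested bit width; presumably this is what lets the GPU kernels take their simpler fast path, while the 2839-element cases above exercise the tail-handling code. The expected-size arithmetic is unchanged, and the index term even stays identical for 4 to 7 bits because 2839 already rounded up to the same byte counts.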
457
458
TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly, fast path")
459
1
{
460
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
461
1
  float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
462
1
  float* const values = ccmalloc(sizeof(float) * 2840);
463
1
  int i;
464
2.84k
  for (i = 0; i < 2840; i++)
465
2.84k
    values[i] = lut[i % 16];
466
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0);
467
1
  uint8_t* compressed = tensor->data.u8;
468
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 2);
469
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
470
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0);
471
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
472
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
473
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840);
474
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
475
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
476
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
477
1
  ccfree(values);
478
1
  ccv_nnc_tensor_free(tensor);
479
1
  ccv_nnc_tensor_free(g_tensor);
480
1
  ccv_nnc_tensor_free(gv_tensor);
481
1
  ccv_nnc_tensor_free(v_tensor);
482
1
}
483
484
TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly, fast path")
485
1
{
486
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
487
1
  float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
488
1
  uint16_t lut[16];
489
1
  ccv_float_to_half_precision(lut_f32, lut, 16);
490
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
491
1
  int i;
492
2.84k
  for (i = 0; i < 2840; i++)
493
2.84k
    values[i] = lut[i % 16];
494
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0);
495
1
  uint8_t* compressed = tensor->data.u8;
496
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 4);
497
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
498
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0);
499
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
500
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
501
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840);
502
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
503
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
504
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
505
1
  ccfree(values);
506
1
  ccv_nnc_tensor_free(tensor);
507
1
  ccv_nnc_tensor_free(g_tensor);
508
1
  ccv_nnc_tensor_free(gv_tensor);
509
1
  ccv_nnc_tensor_free(v_tensor);
510
1
}
511
512
TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly, fast path")
513
1
{
514
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
515
1
  double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
516
1
  double* const values = ccmalloc(sizeof(double) * 2840);
517
1
  int i;
518
2.84k
  for (i = 0; i < 2840; i++)
519
2.84k
    values[i] = lut[i % 32];
520
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
521
1
  uint8_t* compressed = tensor->data.u8;
522
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 8);
523
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
524
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
525
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
526
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
527
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840);
528
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
529
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
530
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
531
1
  ccfree(values);
532
1
  ccv_nnc_tensor_free(tensor);
533
1
  ccv_nnc_tensor_free(g_tensor);
534
1
  ccv_nnc_tensor_free(gv_tensor);
535
1
  ccv_nnc_tensor_free(v_tensor);
536
1
}
537
538
TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly, fast path")
539
1
{
540
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
541
1
  float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
542
1
  float* const values = ccmalloc(sizeof(float) * 2840);
543
1
  int i;
544
2.84k
  for (i = 0; i < 2840; i++)
545
2.84k
    values[i] = lut[i % 32];
546
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
547
1
  uint8_t* compressed = tensor->data.u8;
548
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 4);
549
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
550
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
551
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
552
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
553
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840);
554
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
555
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
556
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
557
1
  ccfree(values);
558
1
  ccv_nnc_tensor_free(tensor);
559
1
  ccv_nnc_tensor_free(g_tensor);
560
1
  ccv_nnc_tensor_free(gv_tensor);
561
1
  ccv_nnc_tensor_free(v_tensor);
562
1
}
563
564
TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly, fast path")
565
1
{
566
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
567
1
  float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
568
1
  uint16_t lut[32];
569
1
  ccv_float_to_half_precision(lut_f32, lut, 32);
570
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
571
1
  int i;
572
2.84k
  for (i = 0; i < 2840; i++)
573
2.84k
    values[i] = lut[i % 32];
574
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
575
1
  uint8_t* compressed = tensor->data.u8;
576
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 2);
577
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
578
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
579
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
580
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
581
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840);
582
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
583
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
584
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
585
1
  ccfree(values);
586
1
  ccv_nnc_tensor_free(tensor);
587
1
  ccv_nnc_tensor_free(g_tensor);
588
1
  ccv_nnc_tensor_free(gv_tensor);
589
1
  ccv_nnc_tensor_free(v_tensor);
590
1
}
591
592
TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly, fast path")
593
1
{
594
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
595
1
  double lut[64];
596
1
  int i;
597
65
  for (i = 0; i < 64; i++)
598
64
    lut[i] = (double)i;
599
1
  double* const values = ccmalloc(sizeof(double) * 2840);
600
2.84k
  for (i = 0; i < 2840; i++)
601
2.84k
    values[i] = lut[i % 64];
602
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
603
1
  uint8_t* compressed = tensor->data.u8;
604
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8);
605
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
606
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
607
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
608
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
609
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840);
610
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
611
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
612
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
613
1
  ccfree(values);
614
1
  ccv_nnc_tensor_free(tensor);
615
1
  ccv_nnc_tensor_free(g_tensor);
616
1
  ccv_nnc_tensor_free(gv_tensor);
617
1
  ccv_nnc_tensor_free(v_tensor);
618
1
}
619
620
TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly, fast path")
621
1
{
622
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
623
1
  float lut[64];
624
1
  int i;
625
65
  for (i = 0; i < 64; i++)
626
64
    lut[i] = (float)i;
627
1
  float* const values = ccmalloc(sizeof(float) * 2840);
628
2.84k
  for (i = 0; i < 2840; i++)
629
2.84k
    values[i] = lut[i % 64];
630
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
631
1
  uint8_t* compressed = tensor->data.u8;
632
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 4);
633
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match");
634
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
635
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
636
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
637
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840);
638
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
639
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
640
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
641
1
  ccfree(values);
642
1
  ccv_nnc_tensor_free(tensor);
643
1
  ccv_nnc_tensor_free(g_tensor);
644
1
  ccv_nnc_tensor_free(gv_tensor);
645
1
  ccv_nnc_tensor_free(v_tensor);
646
1
}
647
648
TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly, fast path")
649
1
{
650
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
651
1
  float lut_f32[64];
652
1
  int i;
653
65
  for (i = 0; i < 64; i++)
654
64
    lut_f32[i] = (float)i;
655
1
  uint16_t lut[64];
656
1
  ccv_float_to_half_precision(lut_f32, lut, 64);
657
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 8192);
658
8.19k
  for (i = 0; i < 8192; i++)
659
8.19k
    values[i] = lut[i % 64];
660
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 2 + 3) / 4), 0);
661
1
  uint8_t* compressed = tensor->data.u8;
662
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, compressed, 6144 + 2 * 64 * 2);
663
1
  REQUIRE_EQ(output_size, 6144 + 2 * 64 * 2, "output size should match");
664
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 2 + 3) / 4), 0);
665
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
666
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 8192), 0);
667
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, gv_tensor->data.u8, 8192);
668
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 8192), 0);
669
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
670
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 8192, "should be lossless");
671
1
  ccfree(values);
672
1
  ccv_nnc_tensor_free(tensor);
673
1
  ccv_nnc_tensor_free(g_tensor);
674
1
  ccv_nnc_tensor_free(gv_tensor);
675
1
  ccv_nnc_tensor_free(v_tensor);
676
1
}
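This test and the 8-bit float fast-path test below are the two 8192-element cases; both use number_in_blocks of 4096, so ceil(8192 / 4096) = 2 blocks. For 6-bit half precision that is 8192 * 6 / 8 = 6144 index bytes plus 2 * 64 * 2 = 256 palette bytes, matching 6144 + 2 * 64 * 2, and the 8-bit float case works out to 8192 + 2 * 256 * 4. They are also the only tests in the file whose guard additionally accepts the MPS backend.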
677
678
TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly, fast path")
679
1
{
680
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
681
1
  double lut[128];
682
1
  int i;
683
129
  for (i = 0; i < 128; i++)
684
128
    lut[i] = (double)i;
685
1
  double* const values = ccmalloc(sizeof(double) * 2840);
686
2.84k
  for (i = 0; i < 2840; i++)
687
2.84k
    values[i] = lut[i % 128];
688
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
689
1
  uint8_t* compressed = tensor->data.u8;
690
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 8);
691
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match");
692
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
693
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
694
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
695
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840);
696
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
697
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
698
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
699
1
  ccfree(values);
700
1
  ccv_nnc_tensor_free(tensor);
701
1
  ccv_nnc_tensor_free(g_tensor);
702
1
  ccv_nnc_tensor_free(gv_tensor);
703
1
  ccv_nnc_tensor_free(v_tensor);
704
1
}
705
706
TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly, fast path")
707
1
{
708
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
709
1
  float lut[128];
710
1
  int i;
711
129
  for (i = 0; i < 128; i++)
712
128
    lut[i] = (float)i;
713
1
  float* const values = ccmalloc(sizeof(float) * 2840);
714
2.84k
  for (i = 0; i < 2840; i++)
715
2.84k
    values[i] = lut[i % 128];
716
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
717
1
  uint8_t* compressed = tensor->data.u8;
718
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 4);
719
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match");
720
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
721
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
722
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
723
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840);
724
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
725
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
726
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
727
1
  ccfree(values);
728
1
  ccv_nnc_tensor_free(tensor);
729
1
  ccv_nnc_tensor_free(g_tensor);
730
1
  ccv_nnc_tensor_free(gv_tensor);
731
1
  ccv_nnc_tensor_free(v_tensor);
732
1
}
733
734
TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly, fast path")
735
1
{
736
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
737
1
  float lut_f32[128];
738
1
  int i;
739
129
  for (i = 0; i < 128; i++)
740
128
    lut_f32[i] = (float)i;
741
1
  uint16_t lut[128];
742
1
  ccv_float_to_half_precision(lut_f32, lut, 128);
743
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
744
2.84k
  for (i = 0; i < 2840; i++)
745
2.84k
    values[i] = lut[i % 128];
746
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
747
1
  uint8_t* compressed = tensor->data.u8;
748
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 2);
749
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match");
750
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
751
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
752
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
753
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840);
754
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
755
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
756
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
757
1
  ccfree(values);
758
1
  ccv_nnc_tensor_free(tensor);
759
1
  ccv_nnc_tensor_free(g_tensor);
760
1
  ccv_nnc_tensor_free(gv_tensor);
761
1
  ccv_nnc_tensor_free(v_tensor);
762
1
}
763
764
TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly, fast path")
765
1
{
766
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
767
1
  double lut[256];
768
1
  int i;
769
257
  for (i = 0; i < 256; i++)
770
256
    lut[i] = (double)i;
771
1
  double* const values = ccmalloc(sizeof(double) * 2840);
772
2.84k
  for (i = 0; i < 2840; i++)
773
2.84k
    values[i] = lut[i % 256];
774
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 8 + 3) / 4), 0);
775
1
  uint8_t* compressed = tensor->data.u8;
776
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 8);
777
1
  REQUIRE_EQ(output_size, 2840 + 3 * 256 * 8, "output size should match");
778
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 8 + 3) / 4), 0);
779
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
780
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
781
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840);
782
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
783
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
784
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
785
1
  ccfree(values);
786
1
  ccv_nnc_tensor_free(tensor);
787
1
  ccv_nnc_tensor_free(g_tensor);
788
1
  ccv_nnc_tensor_free(gv_tensor);
789
1
  ccv_nnc_tensor_free(v_tensor);
790
1
}
791
792
TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly, fast path")
793
1
{
794
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
795
1
  float lut[256];
796
1
  int i;
797
257
  for (i = 0; i < 256; i++)
798
256
    lut[i] = (float)i;
799
1
  float* const values = ccmalloc(sizeof(float) * 8192);
800
8.19k
  for (i = 0; i < 8192; i++)
801
8.19k
    values[i] = lut[i % 256];
802
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (8192 + 2 * 256 * 4 + 3) / 4), 0);
803
1
  uint8_t* compressed = tensor->data.u8;
804
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 8, 4096, compressed, 8192 + 2 * 256 * 4);
805
1
  REQUIRE_EQ(output_size, 8192 + 2 * 256 * 4, "output size should match");
806
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (8192 + 2 * 256 * 4 + 3) / 4), 0);
807
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
808
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0);
809
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 4096, gv_tensor->data.u8, 8192);
810
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0);
811
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
812
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "should be lossless");
813
1
  ccfree(values);
814
1
  ccv_nnc_tensor_free(tensor);
815
1
  ccv_nnc_tensor_free(g_tensor);
816
1
  ccv_nnc_tensor_free(gv_tensor);
817
1
  ccv_nnc_tensor_free(v_tensor);
818
1
}
819
820
TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly, fast path")
821
1
{
822
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
823
1
  float lut_f32[256];
824
1
  int i;
825
257
  for (i = 0; i < 256; i++)
826
256
    lut_f32[i] = (float)i;
827
1
  uint16_t lut[256];
828
1
  ccv_float_to_half_precision(lut_f32, lut, 256);
829
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
830
2.84k
  for (i = 0; i < 2840; i++)
831
2.84k
    values[i] = lut[i % 256];
832
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 2 + 3) / 4), 0);
833
1
  uint8_t* compressed = tensor->data.u8;
834
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 2);
835
1
  REQUIRE_EQ(output_size, 2840 + 3 * 256 * 2, "output size should match");
836
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 2 + 3) / 4), 0);
837
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
838
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
839
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840);
840
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
841
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
842
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
843
1
  ccfree(values);
844
1
  ccv_nnc_tensor_free(tensor);
845
1
  ccv_nnc_tensor_free(g_tensor);
846
1
  ccv_nnc_tensor_free(gv_tensor);
847
1
  ccv_nnc_tensor_free(v_tensor);
848
1
}
849
850
#include "case_main.h"