Coverage Report

Created: 2026-04-20 13:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/palettize.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include "3rdparty/dsfmt/dSFMT.h"
8
9
TEST_SETUP()
10
{
11
  ccv_nnc_init();
12
}
13
14
TEST_CASE("allocate row-wise int8 tensor with source-precision scales")
15
1
{
16
1
  ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 10, 20, 30)), 0);
17
1
  REQUIRE_EQ(6848, ccv_nnc_tensor_data_size(tensor->info), "should be this size");
18
1
  ccv_nnc_tensor_free(tensor);
19
1
}
20
21
TEST_CASE("quantize float to row-wise int8 and dequantize on CPU losslessly")
22
1
{
23
1
  float values[32];
24
1
  static const int8_t q[8] = {-127, -96, -64, -32, 0, 32, 64, 127};
25
1
  static const float scales[4] = {0.5, 1.0, 2.0, 4.0};
26
1
  int i, j;
27
5
  for (i = 0; i < 4; 
i++4
)
28
36
    
for (j = 0; 4
j < 8;
j++32
)
29
32
      values[i * 8 + j] = q[j] * scales[i];
30
1
  ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 4, 8)), 0);
31
1
  const size_t output_size = ccv_nnc_quantize_8i_rowwise(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 32, 8, tensor->data.u8, ccv_nnc_tensor_data_size_without_padding(tensor->info));
32
1
  REQUIRE_EQ(144, output_size, "output size should match");
33
1
  float dequantized[32];
34
1
  ccv_nnc_dequantize_8i_rowwise(tensor->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 8, dequantized, 32);
35
1
  REQUIRE_ARRAY_EQ(float, values, dequantized, 32, "should be lossless");
36
1
  ccv_nnc_tensor_free(tensor);
37
1
}
38
39
TEST_CASE("quantize bfloat16 to row-wise int8 and dequantize on CPU losslessly")
40
1
{
41
1
  float values_f32[32];
42
1
  uint16_t values_bf16[32];
43
1
  uint16_t expected_bf16[32];
44
1
  static const int8_t q[8] = {-127, -96, -64, -32, 0, 32, 64, 127};
45
1
  static const float scales[4] = {0.5, 1.0, 2.0, 4.0};
46
1
  int i, j;
47
5
  for (i = 0; i < 4; 
i++4
)
48
36
    
for (j = 0; 4
j < 8;
j++32
)
49
32
      values_f32[i * 8 + j] = q[j] * scales[i];
50
1
  ccv_float_to_bfloat(values_f32, values_bf16, 32);
51
1
  memcpy(expected_bf16, values_bf16, sizeof(expected_bf16));
52
1
  ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, 4, 8)), 0);
53
1
  const size_t output_size = ccv_nnc_quantize_8i_rowwise(values_bf16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, 32, 8, tensor->data.u8, ccv_nnc_tensor_data_size_without_padding(tensor->info));
54
1
  REQUIRE_EQ(136, output_size, "output size should match");
55
1
  uint16_t dequantized[32];
56
1
  ccv_nnc_dequantize_8i_rowwise(tensor->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, output_size, 8, dequantized, 32);
57
1
  REQUIRE_ARRAY_EQ(uint16_t, expected_bf16, dequantized, 32, "should be lossless");
58
1
  ccv_nnc_tensor_free(tensor);
59
1
}
60
61
TEST_CASE("quantize float to row-wise int8 and dequantize on GPU losslessly")
62
1
{
63
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
64
1
  float values[32];
65
1
  static const int8_t q[8] = {-127, -96, -64, -32, 0, 32, 64, 127};
66
1
  static const float scales[4] = {0.5, 1.0, 2.0, 4.0};
67
1
  int i, j;
68
5
  for (i = 0; i < 4; 
i++4
)
69
36
    
for (j = 0; 4
j < 8;
j++32
)
70
32
      values[i * 8 + j] = q[j] * scales[i];
71
1
  ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 4, 8)), 0);
72
1
  const size_t output_size = ccv_nnc_quantize_8i_rowwise(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 32, 8, tensor->data.u8, ccv_nnc_tensor_data_size_without_padding(tensor->info));
73
1
  ccv_nnc_tensor_t* const g_tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, 4, 8)), 0);
74
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
75
1
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 8), 0);
76
1
  ccv_nnc_dequantize_8i_rowwise(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, gv_tensor->data.u8, 32);
77
1
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 8), 0);
78
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
79
1
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, values, v_tensor->data.f32, 32, 1e-6, "should be lossless");
80
1
  ccv_nnc_tensor_free(v_tensor);
81
1
  ccv_nnc_tensor_free(gv_tensor);
82
1
  ccv_nnc_tensor_free(g_tensor);
83
1
  ccv_nnc_tensor_free(tensor);
84
1
}
85
86
TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly")
87
1
{
88
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
89
1
  double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
90
1
  double* const values = ccmalloc(sizeof(double) * 2839);
91
1
  int i;
92
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
93
2.83k
    values[i] = lut[i % 16];
94
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0);
95
1
  uint8_t* compressed = tensor->data.u8;
96
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944);
97
1
  REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
98
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0);
99
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
100
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
101
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839);
102
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
103
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
104
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
105
1
  ccfree(values);
106
1
  ccv_nnc_tensor_free(tensor);
107
1
  ccv_nnc_tensor_free(g_tensor);
108
1
  ccv_nnc_tensor_free(gv_tensor);
109
1
  ccv_nnc_tensor_free(v_tensor);
110
1
}
111
112
TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly")
113
1
{
114
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
115
1
  float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
116
1
  float* const values = ccmalloc(sizeof(float) * 2839);
117
1
  int i;
118
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
119
2.83k
    values[i] = lut[i % 16];
120
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0);
121
1
  uint8_t* compressed = tensor->data.u8;
122
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 2);
123
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
124
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0);
125
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
126
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
127
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839);
128
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
129
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
130
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
131
1
  ccfree(values);
132
1
  ccv_nnc_tensor_free(tensor);
133
1
  ccv_nnc_tensor_free(g_tensor);
134
1
  ccv_nnc_tensor_free(gv_tensor);
135
1
  ccv_nnc_tensor_free(v_tensor);
136
1
}
137
138
TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly")
139
1
{
140
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
141
1
  float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
142
1
  uint16_t lut[16];
143
1
  ccv_float_to_half_precision(lut_f32, lut, 16);
144
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
145
1
  int i;
146
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
147
2.83k
    values[i] = lut[i % 16];
148
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0);
149
1
  uint8_t* compressed = tensor->data.u8;
150
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 4);
151
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
152
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0);
153
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
154
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
155
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839);
156
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
157
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
158
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
159
1
  ccfree(values);
160
1
  ccv_nnc_tensor_free(tensor);
161
1
  ccv_nnc_tensor_free(g_tensor);
162
1
  ccv_nnc_tensor_free(gv_tensor);
163
1
  ccv_nnc_tensor_free(v_tensor);
164
1
}
165
166
TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly")
167
1
{
168
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
169
1
  double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
170
1
  double* const values = ccmalloc(sizeof(double) * 2839);
171
1
  int i;
172
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
173
2.83k
    values[i] = lut[i % 32];
174
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
175
1
  uint8_t* compressed = tensor->data.u8;
176
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 8);
177
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
178
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
179
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
180
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
181
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839);
182
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
183
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
184
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
185
1
  ccfree(values);
186
1
  ccv_nnc_tensor_free(tensor);
187
1
  ccv_nnc_tensor_free(g_tensor);
188
1
  ccv_nnc_tensor_free(gv_tensor);
189
1
  ccv_nnc_tensor_free(v_tensor);
190
1
}
191
192
TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly")
193
1
{
194
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
195
1
  float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
196
1
  float* const values = ccmalloc(sizeof(float) * 2839);
197
1
  int i;
198
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
199
2.83k
    values[i] = lut[i % 32];
200
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
201
1
  uint8_t* compressed = tensor->data.u8;
202
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 4);
203
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
204
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
205
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
206
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
207
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839);
208
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
209
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
210
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
211
1
  ccfree(values);
212
1
  ccv_nnc_tensor_free(tensor);
213
1
  ccv_nnc_tensor_free(g_tensor);
214
1
  ccv_nnc_tensor_free(gv_tensor);
215
1
  ccv_nnc_tensor_free(v_tensor);
216
1
}
217
218
TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly")
219
1
{
220
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
221
1
  float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
222
1
  uint16_t lut[32];
223
1
  ccv_float_to_half_precision(lut_f32, lut, 32);
224
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
225
1
  int i;
226
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
227
2.83k
    values[i] = lut[i % 32];
228
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
229
1
  uint8_t* compressed = tensor->data.u8;
230
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 2);
231
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
232
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
233
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
234
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
235
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839);
236
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
237
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
238
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
239
1
  ccfree(values);
240
1
  ccv_nnc_tensor_free(tensor);
241
1
  ccv_nnc_tensor_free(g_tensor);
242
1
  ccv_nnc_tensor_free(gv_tensor);
243
1
  ccv_nnc_tensor_free(v_tensor);
244
1
}
245
246
TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly")
247
1
{
248
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
249
1
  double lut[64];
250
1
  int i;
251
65
  for (i = 0; i < 64; 
i++64
)
252
64
    lut[i] = (double)i;
253
1
  double* const values = ccmalloc(sizeof(double) * 2839);
254
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
255
2.83k
    values[i] = lut[i % 64];
256
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
257
1
  uint8_t* compressed = tensor->data.u8;
258
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 8);
259
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
260
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
261
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
262
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
263
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839);
264
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
265
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
266
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
267
1
  ccfree(values);
268
1
  ccv_nnc_tensor_free(tensor);
269
1
  ccv_nnc_tensor_free(g_tensor);
270
1
  ccv_nnc_tensor_free(gv_tensor);
271
1
  ccv_nnc_tensor_free(v_tensor);
272
1
}
273
274
TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly")
275
1
{
276
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
277
1
  float lut[64];
278
1
  int i;
279
65
  for (i = 0; i < 64; 
i++64
)
280
64
    lut[i] = (float)i;
281
1
  float* const values = ccmalloc(sizeof(float) * 2839);
282
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
283
2.83k
    values[i] = lut[i % 64];
284
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
285
1
  uint8_t* compressed = tensor->data.u8;
286
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 4);
287
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match");
288
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
289
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
290
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
291
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839);
292
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
293
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
294
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
295
1
  ccfree(values);
296
1
  ccv_nnc_tensor_free(tensor);
297
1
  ccv_nnc_tensor_free(g_tensor);
298
1
  ccv_nnc_tensor_free(gv_tensor);
299
1
  ccv_nnc_tensor_free(v_tensor);
300
1
}
301
302
TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly")
303
1
{
304
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
305
1
  float lut_f32[64];
306
1
  int i;
307
65
  for (i = 0; i < 64; 
i++64
)
308
64
    lut_f32[i] = (float)i;
309
1
  uint16_t lut[64];
310
1
  ccv_float_to_half_precision(lut_f32, lut, 64);
311
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
312
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
313
2.83k
    values[i] = lut[i % 64];
314
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 2 + 3) / 4), 0);
315
1
  uint8_t* compressed = tensor->data.u8;
316
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 2);
317
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match");
318
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 2 + 3) / 4), 0);
319
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
320
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
321
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839);
322
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
323
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
324
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
325
1
  ccfree(values);
326
1
  ccv_nnc_tensor_free(tensor);
327
1
  ccv_nnc_tensor_free(g_tensor);
328
1
  ccv_nnc_tensor_free(gv_tensor);
329
1
  ccv_nnc_tensor_free(v_tensor);
330
1
}
331
332
TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly")
333
1
{
334
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
335
1
  double lut[128];
336
1
  int i;
337
129
  for (i = 0; i < 128; 
i++128
)
338
128
    lut[i] = (double)i;
339
1
  double* const values = ccmalloc(sizeof(double) * 2839);
340
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
341
2.83k
    values[i] = lut[i % 128];
342
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
343
1
  uint8_t* compressed = tensor->data.u8;
344
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 8);
345
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match");
346
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
347
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
348
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
349
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839);
350
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
351
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
352
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
353
1
  ccfree(values);
354
1
  ccv_nnc_tensor_free(tensor);
355
1
  ccv_nnc_tensor_free(g_tensor);
356
1
  ccv_nnc_tensor_free(gv_tensor);
357
1
  ccv_nnc_tensor_free(v_tensor);
358
1
}
359
360
TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly")
361
1
{
362
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
363
1
  float lut[128];
364
1
  int i;
365
129
  for (i = 0; i < 128; 
i++128
)
366
128
    lut[i] = (float)i;
367
1
  float* const values = ccmalloc(sizeof(float) * 2839);
368
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
369
2.83k
    values[i] = lut[i % 128];
370
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
371
1
  uint8_t* compressed = tensor->data.u8;
372
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 4);
373
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match");
374
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
375
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
376
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
377
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839);
378
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
379
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
380
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
381
1
  ccfree(values);
382
1
  ccv_nnc_tensor_free(tensor);
383
1
  ccv_nnc_tensor_free(g_tensor);
384
1
  ccv_nnc_tensor_free(gv_tensor);
385
1
  ccv_nnc_tensor_free(v_tensor);
386
1
}
387
388
TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly")
389
1
{
390
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
391
1
  float lut_f32[128];
392
1
  int i;
393
129
  for (i = 0; i < 128; 
i++128
)
394
128
    lut_f32[i] = (float)i;
395
1
  uint16_t lut[128];
396
1
  ccv_float_to_half_precision(lut_f32, lut, 128);
397
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
398
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
399
2.83k
    values[i] = lut[i % 128];
400
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
401
1
  uint8_t* compressed = tensor->data.u8;
402
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 2);
403
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match");
404
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
405
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
406
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
407
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839);
408
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
409
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
410
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
411
1
  ccfree(values);
412
1
  ccv_nnc_tensor_free(tensor);
413
1
  ccv_nnc_tensor_free(g_tensor);
414
1
  ccv_nnc_tensor_free(gv_tensor);
415
1
  ccv_nnc_tensor_free(v_tensor);
416
1
}
417
418
TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly")
419
1
{
420
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
421
1
  double lut[256];
422
1
  int i;
423
257
  for (i = 0; i < 256; 
i++256
)
424
256
    lut[i] = (double)i;
425
1
  double* const values = ccmalloc(sizeof(double) * 2839);
426
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
427
2.83k
    values[i] = lut[i % 256];
428
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 8 + 3) / 4), 0);
429
1
  uint8_t* compressed = tensor->data.u8;
430
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 8);
431
1
  REQUIRE_EQ(output_size, 2839 + 3 * 256 * 8, "output size should match");
432
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 8 + 3) / 4), 0);
433
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
434
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0);
435
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839);
436
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0);
437
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
438
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless");
439
1
  ccfree(values);
440
1
  ccv_nnc_tensor_free(tensor);
441
1
  ccv_nnc_tensor_free(g_tensor);
442
1
  ccv_nnc_tensor_free(gv_tensor);
443
1
  ccv_nnc_tensor_free(v_tensor);
444
1
}
445
446
TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly")
447
1
{
448
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
449
1
  float lut[256];
450
1
  int i;
451
257
  for (i = 0; i < 256; 
i++256
)
452
256
    lut[i] = (float)i;
453
1
  float* const values = ccmalloc(sizeof(float) * 2839);
454
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
455
2.83k
    values[i] = lut[i % 256];
456
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 4 + 3) / 4), 0);
457
1
  uint8_t* compressed = tensor->data.u8;
458
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 4);
459
1
  REQUIRE_EQ(output_size, 2839 + 3 * 256 * 4, "output size should match");
460
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 4 + 3) / 4), 0);
461
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
462
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0);
463
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839);
464
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0);
465
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
466
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless");
467
1
  ccfree(values);
468
1
  ccv_nnc_tensor_free(tensor);
469
1
  ccv_nnc_tensor_free(g_tensor);
470
1
  ccv_nnc_tensor_free(gv_tensor);
471
1
  ccv_nnc_tensor_free(v_tensor);
472
1
}
473
474
TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly")
475
1
{
476
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
477
1
  float lut_f32[256];
478
1
  int i;
479
257
  for (i = 0; i < 256; 
i++256
)
480
256
    lut_f32[i] = (float)i;
481
1
  uint16_t lut[256];
482
1
  ccv_float_to_half_precision(lut_f32, lut, 256);
483
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
484
2.84k
  for (i = 0; i < 2839; 
i++2.83k
)
485
2.83k
    values[i] = lut[i % 256];
486
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 2 + 3) / 4), 0);
487
1
  uint8_t* compressed = tensor->data.u8;
488
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 2);
489
1
  REQUIRE_EQ(output_size, 2839 + 3 * 256 * 2, "output size should match");
490
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 2 + 3) / 4), 0);
491
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
492
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0);
493
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839);
494
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0);
495
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
496
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless");
497
1
  ccfree(values);
498
1
  ccv_nnc_tensor_free(tensor);
499
1
  ccv_nnc_tensor_free(g_tensor);
500
1
  ccv_nnc_tensor_free(gv_tensor);
501
1
  ccv_nnc_tensor_free(v_tensor);
502
1
}
503
504
TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly, fast path")
505
1
{
506
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
507
1
  double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
508
1
  double* const values = ccmalloc(sizeof(double) * 2840);
509
1
  int i;
510
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
511
2.84k
    values[i] = lut[i % 16];
512
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0);
513
1
  uint8_t* compressed = tensor->data.u8;
514
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944);
515
1
  REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
516
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0);
517
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
518
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
519
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840);
520
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
521
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
522
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
523
1
  ccfree(values);
524
1
  ccv_nnc_tensor_free(tensor);
525
1
  ccv_nnc_tensor_free(g_tensor);
526
1
  ccv_nnc_tensor_free(gv_tensor);
527
1
  ccv_nnc_tensor_free(v_tensor);
528
1
}
529
530
TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly, fast path")
531
1
{
532
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
533
1
  float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
534
1
  float* const values = ccmalloc(sizeof(float) * 2840);
535
1
  int i;
536
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
537
2.84k
    values[i] = lut[i % 16];
538
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0);
539
1
  uint8_t* compressed = tensor->data.u8;
540
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 2);
541
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
542
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0);
543
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
544
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
545
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840);
546
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
547
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
548
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
549
1
  ccfree(values);
550
1
  ccv_nnc_tensor_free(tensor);
551
1
  ccv_nnc_tensor_free(g_tensor);
552
1
  ccv_nnc_tensor_free(gv_tensor);
553
1
  ccv_nnc_tensor_free(v_tensor);
554
1
}
555
556
TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly, fast path")
557
1
{
558
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
559
1
  float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
560
1
  uint16_t lut[16];
561
1
  ccv_float_to_half_precision(lut_f32, lut, 16);
562
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
563
1
  int i;
564
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
565
2.84k
    values[i] = lut[i % 16];
566
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0);
567
1
  uint8_t* compressed = tensor->data.u8;
568
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 4);
569
1
  REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
570
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0);
571
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
572
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
573
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840);
574
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
575
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
576
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
577
1
  ccfree(values);
578
1
  ccv_nnc_tensor_free(tensor);
579
1
  ccv_nnc_tensor_free(g_tensor);
580
1
  ccv_nnc_tensor_free(gv_tensor);
581
1
  ccv_nnc_tensor_free(v_tensor);
582
1
}
583
584
TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly, fast path")
585
1
{
586
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
587
1
  double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
588
1
  double* const values = ccmalloc(sizeof(double) * 2840);
589
1
  int i;
590
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
591
2.84k
    values[i] = lut[i % 32];
592
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
593
1
  uint8_t* compressed = tensor->data.u8;
594
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 8);
595
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
596
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0);
597
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
598
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
599
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840);
600
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
601
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
602
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
603
1
  ccfree(values);
604
1
  ccv_nnc_tensor_free(tensor);
605
1
  ccv_nnc_tensor_free(g_tensor);
606
1
  ccv_nnc_tensor_free(gv_tensor);
607
1
  ccv_nnc_tensor_free(v_tensor);
608
1
}
609
610
TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly, fast path")
611
1
{
612
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
613
1
  float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
614
1
  float* const values = ccmalloc(sizeof(float) * 2840);
615
1
  int i;
616
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
617
2.84k
    values[i] = lut[i % 32];
618
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
619
1
  uint8_t* compressed = tensor->data.u8;
620
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 4);
621
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
622
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0);
623
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
624
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
625
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840);
626
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
627
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
628
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
629
1
  ccfree(values);
630
1
  ccv_nnc_tensor_free(tensor);
631
1
  ccv_nnc_tensor_free(g_tensor);
632
1
  ccv_nnc_tensor_free(gv_tensor);
633
1
  ccv_nnc_tensor_free(v_tensor);
634
1
}
635
636
TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly, fast path")
637
1
{
638
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
639
1
  float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
640
1
  uint16_t lut[32];
641
1
  ccv_float_to_half_precision(lut_f32, lut, 32);
642
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
643
1
  int i;
644
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
645
2.84k
    values[i] = lut[i % 32];
646
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
647
1
  uint8_t* compressed = tensor->data.u8;
648
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 2);
649
1
  REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
650
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0);
651
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
652
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
653
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840);
654
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
655
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
656
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
657
1
  ccfree(values);
658
1
  ccv_nnc_tensor_free(tensor);
659
1
  ccv_nnc_tensor_free(g_tensor);
660
1
  ccv_nnc_tensor_free(gv_tensor);
661
1
  ccv_nnc_tensor_free(v_tensor);
662
1
}
663
664
TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly, fast path")
665
1
{
666
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
667
1
  double lut[64];
668
1
  int i;
669
65
  for (i = 0; i < 64; 
i++64
)
670
64
    lut[i] = (double)i;
671
1
  double* const values = ccmalloc(sizeof(double) * 2840);
672
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
673
2.84k
    values[i] = lut[i % 64];
674
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
675
1
  uint8_t* compressed = tensor->data.u8;
676
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8);
677
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
678
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0);
679
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
680
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
681
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840);
682
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
683
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
684
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
685
1
  ccfree(values);
686
1
  ccv_nnc_tensor_free(tensor);
687
1
  ccv_nnc_tensor_free(g_tensor);
688
1
  ccv_nnc_tensor_free(gv_tensor);
689
1
  ccv_nnc_tensor_free(v_tensor);
690
1
}
691
692
TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly, fast path")
693
1
{
694
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
695
1
  float lut[64];
696
1
  int i;
697
65
  for (i = 0; i < 64; 
i++64
)
698
64
    lut[i] = (float)i;
699
1
  float* const values = ccmalloc(sizeof(float) * 2840);
700
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
701
2.84k
    values[i] = lut[i % 64];
702
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
703
1
  uint8_t* compressed = tensor->data.u8;
704
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 4);
705
1
  REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match");
706
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0);
707
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
708
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
709
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840);
710
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
711
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
712
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
713
1
  ccfree(values);
714
1
  ccv_nnc_tensor_free(tensor);
715
1
  ccv_nnc_tensor_free(g_tensor);
716
1
  ccv_nnc_tensor_free(gv_tensor);
717
1
  ccv_nnc_tensor_free(v_tensor);
718
1
}
719
720
TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly, fast path")
721
1
{
722
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
723
1
  float lut_f32[64];
724
1
  int i;
725
65
  for (i = 0; i < 64; 
i++64
)
726
64
    lut_f32[i] = (float)i;
727
1
  uint16_t lut[64];
728
1
  ccv_float_to_half_precision(lut_f32, lut, 64);
729
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 8192);
730
8.19k
  for (i = 0; i < 8192; 
i++8.19k
)
731
8.19k
    values[i] = lut[i % 64];
732
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 2 + 3) / 4), 0);
733
1
  uint8_t* compressed = tensor->data.u8;
734
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, compressed, 6144 + 2 * 64 * 2);
735
1
  REQUIRE_EQ(output_size, 6144 + 2 * 64 * 2, "output size should match");
736
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 2 + 3) / 4), 0);
737
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
738
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 8192), 0);
739
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, gv_tensor->data.u8, 8192);
740
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 8192), 0);
741
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
742
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 8192, "should be lossless");
743
1
  ccfree(values);
744
1
  ccv_nnc_tensor_free(tensor);
745
1
  ccv_nnc_tensor_free(g_tensor);
746
1
  ccv_nnc_tensor_free(gv_tensor);
747
1
  ccv_nnc_tensor_free(v_tensor);
748
1
}
749
750
TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly, fast path")
751
1
{
752
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
753
1
  double lut[128];
754
1
  int i;
755
129
  for (i = 0; i < 128; 
i++128
)
756
128
    lut[i] = (double)i;
757
1
  double* const values = ccmalloc(sizeof(double) * 2840);
758
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
759
2.84k
    values[i] = lut[i % 128];
760
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
761
1
  uint8_t* compressed = tensor->data.u8;
762
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 8);
763
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match");
764
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0);
765
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
766
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
767
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840);
768
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
769
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
770
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
771
1
  ccfree(values);
772
1
  ccv_nnc_tensor_free(tensor);
773
1
  ccv_nnc_tensor_free(g_tensor);
774
1
  ccv_nnc_tensor_free(gv_tensor);
775
1
  ccv_nnc_tensor_free(v_tensor);
776
1
}
777
778
TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly, fast path")
779
1
{
780
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
781
1
  float lut[128];
782
1
  int i;
783
129
  for (i = 0; i < 128; 
i++128
)
784
128
    lut[i] = (float)i;
785
1
  float* const values = ccmalloc(sizeof(float) * 2840);
786
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
787
2.84k
    values[i] = lut[i % 128];
788
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
789
1
  uint8_t* compressed = tensor->data.u8;
790
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 4);
791
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match");
792
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0);
793
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
794
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0);
795
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840);
796
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0);
797
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
798
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless");
799
1
  ccfree(values);
800
1
  ccv_nnc_tensor_free(tensor);
801
1
  ccv_nnc_tensor_free(g_tensor);
802
1
  ccv_nnc_tensor_free(gv_tensor);
803
1
  ccv_nnc_tensor_free(v_tensor);
804
1
}
805
806
TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly, fast path")
807
1
{
808
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
809
1
  float lut_f32[128];
810
1
  int i;
811
129
  for (i = 0; i < 128; 
i++128
)
812
128
    lut_f32[i] = (float)i;
813
1
  uint16_t lut[128];
814
1
  ccv_float_to_half_precision(lut_f32, lut, 128);
815
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
816
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
817
2.84k
    values[i] = lut[i % 128];
818
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
819
1
  uint8_t* compressed = tensor->data.u8;
820
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 2);
821
1
  REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match");
822
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0);
823
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
824
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
825
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840);
826
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
827
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
828
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
829
1
  ccfree(values);
830
1
  ccv_nnc_tensor_free(tensor);
831
1
  ccv_nnc_tensor_free(g_tensor);
832
1
  ccv_nnc_tensor_free(gv_tensor);
833
1
  ccv_nnc_tensor_free(v_tensor);
834
1
}
835
836
TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly, fast path")
837
1
{
838
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
839
1
  double lut[256];
840
1
  int i;
841
257
  for (i = 0; i < 256; 
i++256
)
842
256
    lut[i] = (double)i;
843
1
  double* const values = ccmalloc(sizeof(double) * 2840);
844
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
845
2.84k
    values[i] = lut[i % 256];
846
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 8 + 3) / 4), 0);
847
1
  uint8_t* compressed = tensor->data.u8;
848
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 8);
849
1
  REQUIRE_EQ(output_size, 2840 + 3 * 256 * 8, "output size should match");
850
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 8 + 3) / 4), 0);
851
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
852
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0);
853
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840);
854
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0);
855
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
856
1
  REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless");
857
1
  ccfree(values);
858
1
  ccv_nnc_tensor_free(tensor);
859
1
  ccv_nnc_tensor_free(g_tensor);
860
1
  ccv_nnc_tensor_free(gv_tensor);
861
1
  ccv_nnc_tensor_free(v_tensor);
862
1
}
863
864
TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly, fast path")
865
1
{
866
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
867
1
  float lut[256];
868
1
  int i;
869
257
  for (i = 0; i < 256; 
i++256
)
870
256
    lut[i] = (float)i;
871
1
  float* const values = ccmalloc(sizeof(float) * 8192);
872
8.19k
  for (i = 0; i < 8192; 
i++8.19k
)
873
8.19k
    values[i] = lut[i % 256];
874
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (8192 + 2 * 256 * 4 + 3) / 4), 0);
875
1
  uint8_t* compressed = tensor->data.u8;
876
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 8, 4096, compressed, 8192 + 2 * 256 * 4);
877
1
  REQUIRE_EQ(output_size, 8192 + 2 * 256 * 4, "output size should match");
878
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (8192 + 2 * 256 * 4 + 3) / 4), 0);
879
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
880
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0);
881
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 4096, gv_tensor->data.u8, 8192);
882
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0);
883
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
884
1
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "should be lossless");
885
1
  ccfree(values);
886
1
  ccv_nnc_tensor_free(tensor);
887
1
  ccv_nnc_tensor_free(g_tensor);
888
1
  ccv_nnc_tensor_free(gv_tensor);
889
1
  ccv_nnc_tensor_free(v_tensor);
890
1
}
891
892
TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly, fast path")
893
1
{
894
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF));
895
1
  float lut_f32[256];
896
1
  int i;
897
257
  for (i = 0; i < 256; 
i++256
)
898
256
    lut_f32[i] = (float)i;
899
1
  uint16_t lut[256];
900
1
  ccv_float_to_half_precision(lut_f32, lut, 256);
901
1
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
902
2.84k
  for (i = 0; i < 2840; 
i++2.84k
)
903
2.84k
    values[i] = lut[i % 256];
904
1
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 2 + 3) / 4), 0);
905
1
  uint8_t* compressed = tensor->data.u8;
906
1
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 2);
907
1
  REQUIRE_EQ(output_size, 2840 + 3 * 256 * 2, "output size should match");
908
1
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 2 + 3) / 4), 0);
909
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
910
1
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0);
911
1
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840);
912
1
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0);
913
1
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
914
1
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless");
915
1
  ccfree(values);
916
1
  ccv_nnc_tensor_free(tensor);
917
1
  ccv_nnc_tensor_free(g_tensor);
918
1
  ccv_nnc_tensor_free(gv_tensor);
919
1
  ccv_nnc_tensor_free(v_tensor);
920
1
}
921
922
#include "case_main.h"