/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/palettize.tests.c
Line | Count | Source |
1 | | #include "case.h" |
2 | | #include "ccv_case.h" |
3 | | #include "ccv_nnc_case.h" |
4 | | #include <ccv.h> |
5 | | #include <nnc/ccv_nnc.h> |
6 | | #include <nnc/ccv_nnc_easy.h> |
7 | | #include "3rdparty/dsfmt/dSFMT.h" |
8 | | |
9 | | /* Suite setup hook: calls ccv_nnc_init(). Presumably run by the test harness before the cases below - confirm in case.h. */ TEST_SETUP() |
10 | | { |
11 | | ccv_nnc_init(); |
12 | | } |
13 | | |
14 | | /* Builds a row-wise int8 tensor description from a 32F NHWC 10x20x30 shape and checks the byte size reported for it. */ TEST_CASE("allocate row-wise int8 tensor with source-precision scales") |
15 | 1 | { |
16 | 1 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 10, 20, 30)), 0); |
17 | 1 | REQUIRE_EQ(6848, ccv_nnc_tensor_data_size(tensor->info), "should be this size"); /* NOTE(review): how 6848 splits into int8 payload, scales and padding is not visible in this file - confirm against ccv_nnc_tensor_data_size */ |
18 | 1 | ccv_nnc_tensor_free(tensor); |
19 | 1 | } |
20 | | |
21 | | /* Round-trips 32 floats through ccv_nnc_quantize_8i_rowwise / ccv_nnc_dequantize_8i_rowwise in CPU memory and requires the output to equal the input exactly. */ TEST_CASE("quantize float to row-wise int8 and dequantize on CPU losslessly") |
22 | 1 | { |
23 | 1 | float values[32]; |
24 | 1 | static const int8_t q[8] = {-127, -96, -64, -32, 0, 32, 64, 127}; |
25 | 1 | static const float scales[4] = {0.5, 1.0, 2.0, 4.0}; |
26 | 1 | int i, j; |
27 | 5 | for (i = 0; i < 4; i++4 ) |
28 | 36 | for (j = 0; 4 j < 8; j++32 ) |
29 | 32 | values[i * 8 + j] = q[j] * scales[i]; /* each group of 8 values is the int8 code table q[] times one power-of-two scale */ |
30 | 1 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 4, 8)), 0); |
31 | 1 | const size_t output_size = ccv_nnc_quantize_8i_rowwise(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 32, 8, tensor->data.u8, ccv_nnc_tensor_data_size_without_padding(tensor->info)); /* 32 input elements; the 8 is presumably the row length - TODO confirm against the function's declaration */ |
32 | 1 | REQUIRE_EQ(144, output_size, "output size should match"); |
33 | 1 | float dequantized[32]; |
34 | 1 | ccv_nnc_dequantize_8i_rowwise(tensor->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 8, dequantized, 32); /* decode exactly the bytes the quantizer reported writing */ |
35 | 1 | REQUIRE_ARRAY_EQ(float, values, dequantized, 32, "should be lossless"); |
36 | 1 | ccv_nnc_tensor_free(tensor); |
37 | 1 | } |
38 | | |
39 | | /* Same round trip as the float CPU case, but the input and output are 16-bit bfloat values compared as raw uint16_t patterns. */ TEST_CASE("quantize bfloat16 to row-wise int8 and dequantize on CPU losslessly") |
40 | 1 | { |
41 | 1 | float values_f32[32]; |
42 | 1 | uint16_t values_bf16[32]; |
43 | 1 | uint16_t expected_bf16[32]; |
44 | 1 | static const int8_t q[8] = {-127, -96, -64, -32, 0, 32, 64, 127}; |
45 | 1 | static const float scales[4] = {0.5, 1.0, 2.0, 4.0}; |
46 | 1 | int i, j; |
47 | 5 | for (i = 0; i < 4; i++4 ) |
48 | 36 | for (j = 0; 4 j < 8; j++32 ) |
49 | 32 | values_f32[i * 8 + j] = q[j] * scales[i]; /* each group of 8 values is the int8 code table q[] times one power-of-two scale */ |
50 | 1 | ccv_float_to_bfloat(values_f32, values_bf16, 32); |
51 | 1 | memcpy(expected_bf16, values_bf16, sizeof(expected_bf16)); /* snapshot of the bf16 input, used below as the expected result */ |
52 | 1 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, 4, 8)), 0); |
53 | 1 | const size_t output_size = ccv_nnc_quantize_8i_rowwise(values_bf16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, 32, 8, tensor->data.u8, ccv_nnc_tensor_data_size_without_padding(tensor->info)); |
54 | 1 | REQUIRE_EQ(136, output_size, "output size should match"); /* 8 bytes less than the 144 expected by the 32F case; presumably the smaller per-row scales - TODO confirm */ |
55 | 1 | uint16_t dequantized[32]; |
56 | 1 | ccv_nnc_dequantize_8i_rowwise(tensor->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, output_size, 8, dequantized, 32); |
57 | 1 | REQUIRE_ARRAY_EQ(uint16_t, expected_bf16, dequantized, 32, "should be lossless"); |
58 | 1 | ccv_nnc_tensor_free(tensor); |
59 | 1 | } |
60 | | |
61 | | /* Quantizes 32 floats to row-wise int8 in CPU memory, copies the packed tensor to a GPU tensor, dequantizes from GPU memory, copies the result back and compares it with the input. */ TEST_CASE("quantize float to row-wise int8 and dequantize on GPU losslessly") |
62 | 1 | { |
63 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
64 | 1 | float values[32]; |
65 | 1 | static const int8_t q[8] = {-127, -96, -64, -32, 0, 32, 64, 127}; |
66 | 1 | static const float scales[4] = {0.5, 1.0, 2.0, 4.0}; |
67 | 1 | int i, j; |
68 | 5 | for (i = 0; i < 4; i++4 ) |
69 | 36 | for (j = 0; 4 j < 8; j++32 ) |
70 | 32 | values[i * 8 + j] = q[j] * scales[i]; |
71 | 1 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, 4, 8)), 0); |
72 | 1 | const size_t output_size = ccv_nnc_quantize_8i_rowwise(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 32, 8, tensor->data.u8, ccv_nnc_tensor_data_size_without_padding(tensor->info)); |
73 | 1 | ccv_nnc_tensor_t* const g_tensor = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, 4, 8)), 0); |
74 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
75 | 1 | ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 8), 0); |
76 | 1 | ccv_nnc_dequantize_8i_rowwise(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, gv_tensor->data.u8, 32); /* both source and destination buffers belong to GPU tensors */ |
77 | 1 | ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 8), 0); |
78 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
79 | 1 | REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, values, v_tensor->data.f32, 32, 1e-6, "should be lossless"); /* unlike the CPU case this compares with a 1e-6 tolerance rather than exact equality */ |
80 | 1 | ccv_nnc_tensor_free(v_tensor); |
81 | 1 | ccv_nnc_tensor_free(gv_tensor); |
82 | 1 | ccv_nnc_tensor_free(g_tensor); |
83 | 1 | ccv_nnc_tensor_free(tensor); |
84 | 1 | } |
85 | | |
86 | | /* Palettizes 2839 doubles at 4 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly") |
87 | 1 | { |
88 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
89 | 1 | double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; /* 15 initializers for 16 slots: lut[15] is implicitly 0.0 */ |
90 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
91 | 1 | int i; |
92 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
93 | 2.83k | values[i] = lut[i % 16]; /* input cycles through the 16 table values */ |
94 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
95 | 1 | uint8_t* compressed = tensor->data.u8; |
96 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944); /* NOTE(review): 1420 matches ceil(2839 * 4 / 8) and 2944 matches 23 * 16 * 8 with 23 = ceil(2839 / 128); presumably index bytes plus per-128-element palettes - confirm against ccv_nnc_palettize */ |
97 | 1 | REQUIRE_EQ(output_size, 1420 + 2944, "output size should match"); |
98 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0); |
99 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
100 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
101 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
102 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
103 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
104 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
105 | 1 | ccfree(values); |
106 | 1 | ccv_nnc_tensor_free(tensor); |
107 | 1 | ccv_nnc_tensor_free(g_tensor); |
108 | 1 | ccv_nnc_tensor_free(gv_tensor); |
109 | 1 | ccv_nnc_tensor_free(v_tensor); |
110 | 1 | } |
111 | | |
112 | | /* Palettizes 2839 floats at 4 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly") |
113 | 1 | { |
114 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
115 | 1 | float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; /* 15 initializers for 16 slots: lut[15] is implicitly 0.0 */ |
116 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
117 | 1 | int i; |
118 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
119 | 2.83k | values[i] = lut[i % 16]; /* input cycles through the 16 table values */ |
120 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
121 | 1 | uint8_t* compressed = tensor->data.u8; |
122 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 2); /* NOTE(review): 1420 matches ceil(2839 * 4 / 8) and 2944 / 2 matches 23 * 16 * 4 with 23 = ceil(2839 / 128); presumably index bytes plus per-128-element palettes - confirm against ccv_nnc_palettize */ |
123 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match"); |
124 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0); |
125 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
126 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
127 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
128 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
129 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
130 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
131 | 1 | ccfree(values); |
132 | 1 | ccv_nnc_tensor_free(tensor); |
133 | 1 | ccv_nnc_tensor_free(g_tensor); |
134 | 1 | ccv_nnc_tensor_free(gv_tensor); |
135 | 1 | ccv_nnc_tensor_free(v_tensor); |
136 | 1 | } |
137 | | |
138 | | /* Palettizes 2839 half-precision values (held as uint16_t) at 4 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires identical 16-bit patterns. */ TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly") |
139 | 1 | { |
140 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
141 | 1 | float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; /* 15 initializers for 16 slots: lut_f32[15] is implicitly 0.0 */ |
142 | 1 | uint16_t lut[16]; |
143 | 1 | ccv_float_to_half_precision(lut_f32, lut, 16); /* fills the 16-entry uint16_t table from the float table */ |
144 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
145 | 1 | int i; |
146 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
147 | 2.83k | values[i] = lut[i % 16]; /* input cycles through the 16 table values */ |
148 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
149 | 1 | uint8_t* compressed = tensor->data.u8; |
150 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 4); /* NOTE(review): 1420 matches ceil(2839 * 4 / 8) and 2944 / 4 matches 23 * 16 * 2 with 23 = ceil(2839 / 128); presumably index bytes plus per-128-element palettes - confirm against ccv_nnc_palettize */ |
151 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match"); |
152 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0); |
153 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
154 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
155 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
156 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
157 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
158 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); /* compared as uint16_t elements, i.e. raw half-precision bit patterns */ |
159 | 1 | ccfree(values); |
160 | 1 | ccv_nnc_tensor_free(tensor); |
161 | 1 | ccv_nnc_tensor_free(g_tensor); |
162 | 1 | ccv_nnc_tensor_free(gv_tensor); |
163 | 1 | ccv_nnc_tensor_free(v_tensor); |
164 | 1 | } |
165 | | |
166 | | /* Palettizes 2839 doubles at 5 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly") |
167 | 1 | { |
168 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
169 | 1 | double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; /* 30 initializers for 32 slots: lut[30] and lut[31] are implicitly 0.0 */ |
170 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
171 | 1 | int i; |
172 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
173 | 2.83k | values[i] = lut[i % 32]; /* input cycles through the 32 table slots */ |
174 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
175 | 1 | uint8_t* compressed = tensor->data.u8; |
176 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 8); /* NOTE(review): 1775 matches ceil(2839 * 5 / 8); 23 * 32 * 8 reads as 23 = ceil(2839 / 128) palettes of 32 eight-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
177 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match"); |
178 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
179 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
180 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
181 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
182 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
183 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
184 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
185 | 1 | ccfree(values); |
186 | 1 | ccv_nnc_tensor_free(tensor); |
187 | 1 | ccv_nnc_tensor_free(g_tensor); |
188 | 1 | ccv_nnc_tensor_free(gv_tensor); |
189 | 1 | ccv_nnc_tensor_free(v_tensor); |
190 | 1 | } |
191 | | |
192 | | /* Palettizes 2839 floats at 5 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly") |
193 | 1 | { |
194 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
195 | 1 | float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; /* 30 initializers for 32 slots: lut[30] and lut[31] are implicitly 0.0 */ |
196 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
197 | 1 | int i; |
198 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
199 | 2.83k | values[i] = lut[i % 32]; /* input cycles through the 32 table slots */ |
200 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
201 | 1 | uint8_t* compressed = tensor->data.u8; |
202 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 4); /* NOTE(review): 1775 matches ceil(2839 * 5 / 8); 23 * 32 * 4 reads as 23 = ceil(2839 / 128) palettes of 32 four-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
203 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match"); |
204 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
205 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
206 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
207 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
208 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
209 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
210 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
211 | 1 | ccfree(values); |
212 | 1 | ccv_nnc_tensor_free(tensor); |
213 | 1 | ccv_nnc_tensor_free(g_tensor); |
214 | 1 | ccv_nnc_tensor_free(gv_tensor); |
215 | 1 | ccv_nnc_tensor_free(v_tensor); |
216 | 1 | } |
217 | | |
218 | | /* Palettizes 2839 half-precision values (held as uint16_t) at 5 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires identical 16-bit patterns. */ TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly") |
219 | 1 | { |
220 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
221 | 1 | float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; /* 30 initializers for 32 slots: lut_f32[30] and lut_f32[31] are implicitly 0.0 */ |
222 | 1 | uint16_t lut[32]; |
223 | 1 | ccv_float_to_half_precision(lut_f32, lut, 32); /* fills the 32-entry uint16_t table from the float table */ |
224 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
225 | 1 | int i; |
226 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
227 | 2.83k | values[i] = lut[i % 32]; /* input cycles through the 32 table slots */ |
228 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
229 | 1 | uint8_t* compressed = tensor->data.u8; |
230 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 2); /* NOTE(review): 1775 matches ceil(2839 * 5 / 8); 23 * 32 * 2 reads as 23 = ceil(2839 / 128) palettes of 32 two-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
231 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match"); |
232 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
233 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
234 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
235 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
236 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
237 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
238 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); /* compared as uint16_t elements, i.e. raw half-precision bit patterns */ |
239 | 1 | ccfree(values); |
240 | 1 | ccv_nnc_tensor_free(tensor); |
241 | 1 | ccv_nnc_tensor_free(g_tensor); |
242 | 1 | ccv_nnc_tensor_free(gv_tensor); |
243 | 1 | ccv_nnc_tensor_free(v_tensor); |
244 | 1 | } |
245 | | |
246 | | /* Palettizes 2839 doubles at 6 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly") |
247 | 1 | { |
248 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
249 | 1 | double lut[64]; |
250 | 1 | int i; |
251 | 65 | for (i = 0; i < 64; i++64 ) |
252 | 64 | lut[i] = (double)i; /* table holds the 64 distinct values 0..63 */ |
253 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
254 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
255 | 2.83k | values[i] = lut[i % 64]; /* input cycles through the 64 table values */ |
256 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
257 | 1 | uint8_t* compressed = tensor->data.u8; |
258 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 8); /* NOTE(review): 2130 matches ceil(2839 * 6 / 8); 6 * 64 * 8 reads as 6 = ceil(2839 / 512) palettes of 64 eight-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
259 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match"); |
260 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
261 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
262 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
263 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
264 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
265 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
266 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
267 | 1 | ccfree(values); |
268 | 1 | ccv_nnc_tensor_free(tensor); |
269 | 1 | ccv_nnc_tensor_free(g_tensor); |
270 | 1 | ccv_nnc_tensor_free(gv_tensor); |
271 | 1 | ccv_nnc_tensor_free(v_tensor); |
272 | 1 | } |
273 | | |
274 | | /* Palettizes 2839 floats at 6 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly") |
275 | 1 | { |
276 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
277 | 1 | float lut[64]; |
278 | 1 | int i; |
279 | 65 | for (i = 0; i < 64; i++64 ) |
280 | 64 | lut[i] = (float)i; /* table holds the 64 distinct values 0..63 */ |
281 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
282 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
283 | 2.83k | values[i] = lut[i % 64]; /* input cycles through the 64 table values */ |
284 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
285 | 1 | uint8_t* compressed = tensor->data.u8; |
286 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 4); /* NOTE(review): 2130 matches ceil(2839 * 6 / 8); 6 * 64 * 4 reads as 6 = ceil(2839 / 512) palettes of 64 four-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
287 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match"); |
288 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
289 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
290 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
291 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
292 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
293 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
294 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
295 | 1 | ccfree(values); |
296 | 1 | ccv_nnc_tensor_free(tensor); |
297 | 1 | ccv_nnc_tensor_free(g_tensor); |
298 | 1 | ccv_nnc_tensor_free(gv_tensor); |
299 | 1 | ccv_nnc_tensor_free(v_tensor); |
300 | 1 | } |
301 | | |
302 | | /* Palettizes 2839 half-precision values (held as uint16_t) at 6 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires identical 16-bit patterns. */ TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly") |
303 | 1 | { |
304 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
305 | 1 | float lut_f32[64]; |
306 | 1 | int i; |
307 | 65 | for (i = 0; i < 64; i++64 ) |
308 | 64 | lut_f32[i] = (float)i; /* table holds the 64 distinct values 0..63 */ |
309 | 1 | uint16_t lut[64]; |
310 | 1 | ccv_float_to_half_precision(lut_f32, lut, 64); /* fills the 64-entry uint16_t table from the float table */ |
311 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
312 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
313 | 2.83k | values[i] = lut[i % 64]; /* input cycles through the 64 table values */ |
314 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 2 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
315 | 1 | uint8_t* compressed = tensor->data.u8; |
316 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 2); /* NOTE(review): 2130 matches ceil(2839 * 6 / 8); 6 * 64 * 2 reads as 6 = ceil(2839 / 512) palettes of 64 two-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
317 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match"); |
318 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 2 + 3) / 4), 0); |
319 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
320 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
321 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
322 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
323 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
324 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); /* compared as uint16_t elements, i.e. raw half-precision bit patterns */ |
325 | 1 | ccfree(values); |
326 | 1 | ccv_nnc_tensor_free(tensor); |
327 | 1 | ccv_nnc_tensor_free(g_tensor); |
328 | 1 | ccv_nnc_tensor_free(gv_tensor); |
329 | 1 | ccv_nnc_tensor_free(v_tensor); |
330 | 1 | } |
331 | | |
332 | | /* Palettizes 2839 doubles at 7 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly") |
333 | 1 | { |
334 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
335 | 1 | double lut[128]; |
336 | 1 | int i; |
337 | 129 | for (i = 0; i < 128; i++128 ) |
338 | 128 | lut[i] = (double)i; /* table holds the 128 distinct values 0..127 */ |
339 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
340 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
341 | 2.83k | values[i] = lut[i % 128]; /* input cycles through the 128 table values */ |
342 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
343 | 1 | uint8_t* compressed = tensor->data.u8; |
344 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 8); /* NOTE(review): 2485 matches ceil(2839 * 7 / 8); 6 * 128 * 8 reads as 6 = ceil(2839 / 512) palettes of 128 eight-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
345 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match"); |
346 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
347 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
348 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
349 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
350 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
351 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
352 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
353 | 1 | ccfree(values); |
354 | 1 | ccv_nnc_tensor_free(tensor); |
355 | 1 | ccv_nnc_tensor_free(g_tensor); |
356 | 1 | ccv_nnc_tensor_free(gv_tensor); |
357 | 1 | ccv_nnc_tensor_free(v_tensor); |
358 | 1 | } |
359 | | |
360 | | /* Palettizes 2839 floats at 7 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly") |
361 | 1 | { |
362 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
363 | 1 | float lut[128]; |
364 | 1 | int i; |
365 | 129 | for (i = 0; i < 128; i++128 ) |
366 | 128 | lut[i] = (float)i; /* table holds the 128 distinct values 0..127 */ |
367 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
368 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
369 | 2.83k | values[i] = lut[i % 128]; /* input cycles through the 128 table values */ |
370 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
371 | 1 | uint8_t* compressed = tensor->data.u8; |
372 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 4); /* NOTE(review): 2485 matches ceil(2839 * 7 / 8); 6 * 128 * 4 reads as 6 = ceil(2839 / 512) palettes of 128 four-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
373 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match"); |
374 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
375 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
376 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
377 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
378 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
379 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
380 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
381 | 1 | ccfree(values); |
382 | 1 | ccv_nnc_tensor_free(tensor); |
383 | 1 | ccv_nnc_tensor_free(g_tensor); |
384 | 1 | ccv_nnc_tensor_free(gv_tensor); |
385 | 1 | ccv_nnc_tensor_free(v_tensor); |
386 | 1 | } |
387 | | |
388 | | /* Palettizes 2839 half-precision values (held as uint16_t) at 7 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires identical 16-bit patterns. */ TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly") |
389 | 1 | { |
390 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
391 | 1 | float lut_f32[128]; |
392 | 1 | int i; |
393 | 129 | for (i = 0; i < 128; i++128 ) |
394 | 128 | lut_f32[i] = (float)i; /* table holds the 128 distinct values 0..127 */ |
395 | 1 | uint16_t lut[128]; |
396 | 1 | ccv_float_to_half_precision(lut_f32, lut, 128); /* fills the 128-entry uint16_t table from the float table */ |
397 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
398 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
399 | 2.83k | values[i] = lut[i % 128]; /* input cycles through the 128 table values */ |
400 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
401 | 1 | uint8_t* compressed = tensor->data.u8; |
402 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 2); /* NOTE(review): 2485 matches ceil(2839 * 7 / 8); 6 * 128 * 2 reads as 6 = ceil(2839 / 512) palettes of 128 two-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
403 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match"); |
404 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
405 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
406 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
407 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
408 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
409 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
410 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); /* compared as uint16_t elements, i.e. raw half-precision bit patterns */ |
411 | 1 | ccfree(values); |
412 | 1 | ccv_nnc_tensor_free(tensor); |
413 | 1 | ccv_nnc_tensor_free(g_tensor); |
414 | 1 | ccv_nnc_tensor_free(gv_tensor); |
415 | 1 | ccv_nnc_tensor_free(v_tensor); |
416 | 1 | } |
417 | | |
418 | | /* Palettizes 2839 doubles at 8 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly") |
419 | 1 | { |
420 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
421 | 1 | double lut[256]; |
422 | 1 | int i; |
423 | 257 | for (i = 0; i < 256; i++256 ) |
424 | 256 | lut[i] = (double)i; /* table holds the 256 distinct values 0..255 */ |
425 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
426 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
427 | 2.83k | values[i] = lut[i % 256]; /* input cycles through the 256 table values */ |
428 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 8 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
429 | 1 | uint8_t* compressed = tensor->data.u8; |
430 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 8); /* NOTE(review): 2839 is one byte per element at 8 bits; 3 * 256 * 8 reads as 3 = ceil(2839 / 1280) palettes of 256 eight-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
431 | 1 | REQUIRE_EQ(output_size, 2839 + 3 * 256 * 8, "output size should match"); |
432 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 8 + 3) / 4), 0); |
433 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
434 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
435 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
436 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
437 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
438 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
439 | 1 | ccfree(values); |
440 | 1 | ccv_nnc_tensor_free(tensor); |
441 | 1 | ccv_nnc_tensor_free(g_tensor); |
442 | 1 | ccv_nnc_tensor_free(gv_tensor); |
443 | 1 | ccv_nnc_tensor_free(v_tensor); |
444 | 1 | } |
445 | | |
446 | | /* Palettizes 2839 floats at 8 bits in CPU memory, copies the packed bytes to a GPU tensor, depalettizes from GPU memory, copies the result back and requires exact equality with the input. */ TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly") |
447 | 1 | { |
448 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); /* per the macro name, leaves the case early when the GPU_REF data-transfer command is not OK */ |
449 | 1 | float lut[256]; |
450 | 1 | int i; |
451 | 257 | for (i = 0; i < 256; i++256 ) |
452 | 256 | lut[i] = (float)i; /* table holds the 256 distinct values 0..255 */ |
453 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
454 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
455 | 2.83k | values[i] = lut[i % 256]; /* input cycles through the 256 table values */ |
456 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 4 + 3) / 4), 0); /* byte buffer carried as a 32F tensor: (bytes + 3) / 4 rounds the byte count up to whole 4-byte elements */ |
457 | 1 | uint8_t* compressed = tensor->data.u8; |
458 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 4); /* NOTE(review): 2839 is one byte per element at 8 bits; 3 * 256 * 4 reads as 3 = ceil(2839 / 1280) palettes of 256 four-byte entries - presumably the layout; confirm against ccv_nnc_palettize */ |
459 | 1 | REQUIRE_EQ(output_size, 2839 + 3 * 256 * 4, "output size should match"); |
460 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 4 + 3) / 4), 0); |
461 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); /* packed CPU tensor -> GPU tensor */ |
462 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
463 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839); /* both source and destination buffers belong to GPU tensors */ |
464 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
465 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); /* decoded GPU tensor -> CPU tensor for comparison */ |
466 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
467 | 1 | ccfree(values); |
468 | 1 | ccv_nnc_tensor_free(tensor); |
469 | 1 | ccv_nnc_tensor_free(g_tensor); |
470 | 1 | ccv_nnc_tensor_free(gv_tensor); |
471 | 1 | ccv_nnc_tensor_free(v_tensor); |
472 | 1 | } |
473 | | |
/*
 * Test: 8-bit palettization of half-precision data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2839 uint16_t values cycling through a 256-entry table; the table is built as floats 0..255
 *       and passed through ccv_float_to_half_precision.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ on the raw uint16_t values.
 * Size: the test requires 2839 + 3 * 256 * 2 compressed bytes.
 *       NOTE(review): presumably 2839 index bytes plus three 256-entry tables of 2-byte entries
 *       (1280 elements per table) - confirm against the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
474 | | TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly") |
475 | 1 | { |
476 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
477 | 1 | float lut_f32[256]; |
478 | 1 | int i; |
479 | 257 | for (i = 0; i < 256; i++256 ) |
480 | 256 | lut_f32[i] = (float)i; |
481 | 1 | uint16_t lut[256]; |
482 | 1 | ccv_float_to_half_precision(lut_f32, lut, 256); |
483 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
484 | 2.84k | for (i = 0; i < 2839; i++2.83k ) |
485 | 2.83k | values[i] = lut[i % 256]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
486 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 2 + 3) / 4), 0); |
487 | 1 | uint8_t* compressed = tensor->data.u8; |
488 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 2); |
489 | 1 | REQUIRE_EQ(output_size, 2839 + 3 * 256 * 2, "output size should match"); |
490 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 2 + 3) / 4), 0); |
491 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
492 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
493 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839); |
494 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
495 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
496 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); |
497 | 1 | ccfree(values); |
498 | 1 | ccv_nnc_tensor_free(tensor); |
499 | 1 | ccv_nnc_tensor_free(g_tensor); |
500 | 1 | ccv_nnc_tensor_free(gv_tensor); |
501 | 1 | ccv_nnc_tensor_free(v_tensor); |
502 | 1 | } |
503 | | |
/*
 * Test: 4-bit palettization of double data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 doubles cycling through a 16-entry table.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 1420 + 2944 compressed bytes.
 *       NOTE(review): 1420 equals 2840 * 4 / 8 and 2944 equals 23 * 16 * 8, which is consistent with
 *       packed 4-bit indices plus 23 tables (2840 / 128 rounded up) of 16 doubles - confirm against
 *       the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increment appear to be coverage region counts,
 *       not source tokens.
 */
504 | | TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly, fast path") |
505 | 1 | { |
506 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
/* Only 15 of the 16 entries are listed; C zero-initializes the last one, so the table holds 16 distinct values. */
507 | 1 | double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
508 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
509 | 1 | int i; |
510 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
511 | 2.84k | values[i] = lut[i % 16]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
512 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0); |
513 | 1 | uint8_t* compressed = tensor->data.u8; |
514 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944); |
515 | 1 | REQUIRE_EQ(output_size, 1420 + 2944, "output size should match"); |
516 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0); |
517 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
518 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
519 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840); |
520 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
521 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
522 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
523 | 1 | ccfree(values); |
524 | 1 | ccv_nnc_tensor_free(tensor); |
525 | 1 | ccv_nnc_tensor_free(g_tensor); |
526 | 1 | ccv_nnc_tensor_free(gv_tensor); |
527 | 1 | ccv_nnc_tensor_free(v_tensor); |
528 | 1 | } |
529 | | |
/*
 * Test: 4-bit palettization of float data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 floats cycling through a 16-entry table.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 1420 + 2944 / 2 compressed bytes.
 *       NOTE(review): 1420 equals 2840 * 4 / 8 and 2944 / 2 equals 23 * 16 * 4, which is consistent
 *       with packed 4-bit indices plus 23 tables (2840 / 128 rounded up) of 16 floats - confirm
 *       against the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increment appear to be coverage region counts,
 *       not source tokens.
 */
530 | | TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly, fast path") |
531 | 1 | { |
532 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
/* Only 15 of the 16 entries are listed; C zero-initializes the last one, so the table holds 16 distinct values. */
533 | 1 | float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
534 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
535 | 1 | int i; |
536 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
537 | 2.84k | values[i] = lut[i % 16]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
538 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0); |
539 | 1 | uint8_t* compressed = tensor->data.u8; |
540 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 2); |
541 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match"); |
542 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0); |
543 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
544 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
545 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840); |
546 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
547 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
548 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
549 | 1 | ccfree(values); |
550 | 1 | ccv_nnc_tensor_free(tensor); |
551 | 1 | ccv_nnc_tensor_free(g_tensor); |
552 | 1 | ccv_nnc_tensor_free(gv_tensor); |
553 | 1 | ccv_nnc_tensor_free(v_tensor); |
554 | 1 | } |
555 | | |
/*
 * Test: 4-bit palettization of half-precision data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 uint16_t values cycling through a 16-entry table; the table is built as floats and
 *       passed through ccv_float_to_half_precision.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ on the raw uint16_t values.
 * Size: the test requires 1420 + 2944 / 4 compressed bytes.
 *       NOTE(review): 1420 equals 2840 * 4 / 8 and 2944 / 4 equals 23 * 16 * 2, which is consistent
 *       with packed 4-bit indices plus 23 tables (2840 / 128 rounded up) of 16 two-byte entries -
 *       confirm against the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increment appear to be coverage region counts,
 *       not source tokens.
 */
556 | | TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly, fast path") |
557 | 1 | { |
558 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
/* Only 15 of the 16 entries are listed; C zero-initializes the last one, so the table holds 16 distinct values. */
559 | 1 | float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
560 | 1 | uint16_t lut[16]; |
561 | 1 | ccv_float_to_half_precision(lut_f32, lut, 16); |
562 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
563 | 1 | int i; |
564 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
565 | 2.84k | values[i] = lut[i % 16]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
566 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0); |
567 | 1 | uint8_t* compressed = tensor->data.u8; |
568 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 4); |
569 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match"); |
570 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0); |
571 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
572 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
573 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840); |
574 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
575 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
576 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
577 | 1 | ccfree(values); |
578 | 1 | ccv_nnc_tensor_free(tensor); |
579 | 1 | ccv_nnc_tensor_free(g_tensor); |
580 | 1 | ccv_nnc_tensor_free(gv_tensor); |
581 | 1 | ccv_nnc_tensor_free(v_tensor); |
582 | 1 | } |
583 | | |
/*
 * Test: 5-bit palettization of double data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 doubles cycling through a 32-entry table.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 1775 + 23 * 32 * 8 compressed bytes.
 *       NOTE(review): 1775 equals 2840 * 5 / 8, which is consistent with packed 5-bit indices plus
 *       23 tables (2840 / 128 rounded up) of 32 doubles - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increment appear to be coverage region counts,
 *       not source tokens.
 */
584 | | TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly, fast path") |
585 | 1 | { |
586 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
/* Only 30 of the 32 entries are listed; C zero-initializes the last two, so the table holds 31 distinct values. */
587 | 1 | double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
588 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
589 | 1 | int i; |
590 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
591 | 2.84k | values[i] = lut[i % 32]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
592 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
593 | 1 | uint8_t* compressed = tensor->data.u8; |
594 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 8); |
595 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match"); |
596 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
597 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
598 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
599 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840); |
600 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
601 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
602 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
603 | 1 | ccfree(values); |
604 | 1 | ccv_nnc_tensor_free(tensor); |
605 | 1 | ccv_nnc_tensor_free(g_tensor); |
606 | 1 | ccv_nnc_tensor_free(gv_tensor); |
607 | 1 | ccv_nnc_tensor_free(v_tensor); |
608 | 1 | } |
609 | | |
/*
 * Test: 5-bit palettization of float data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 floats cycling through a 32-entry table.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 1775 + 23 * 32 * 4 compressed bytes.
 *       NOTE(review): 1775 equals 2840 * 5 / 8, which is consistent with packed 5-bit indices plus
 *       23 tables (2840 / 128 rounded up) of 32 floats - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increment appear to be coverage region counts,
 *       not source tokens.
 */
610 | | TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly, fast path") |
611 | 1 | { |
612 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
/* Only 30 of the 32 entries are listed; C zero-initializes the last two, so the table holds 31 distinct values. */
613 | 1 | float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
614 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
615 | 1 | int i; |
616 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
617 | 2.84k | values[i] = lut[i % 32]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
618 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
619 | 1 | uint8_t* compressed = tensor->data.u8; |
620 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 4); |
621 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match"); |
622 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
623 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
624 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
625 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840); |
626 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
627 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
628 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
629 | 1 | ccfree(values); |
630 | 1 | ccv_nnc_tensor_free(tensor); |
631 | 1 | ccv_nnc_tensor_free(g_tensor); |
632 | 1 | ccv_nnc_tensor_free(gv_tensor); |
633 | 1 | ccv_nnc_tensor_free(v_tensor); |
634 | 1 | } |
635 | | |
/*
 * Test: 5-bit palettization of half-precision data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 uint16_t values cycling through a 32-entry table; the table is built as floats and
 *       passed through ccv_float_to_half_precision.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ on the raw uint16_t values.
 * Size: the test requires 1775 + 23 * 32 * 2 compressed bytes.
 *       NOTE(review): 1775 equals 2840 * 5 / 8, which is consistent with packed 5-bit indices plus
 *       23 tables (2840 / 128 rounded up) of 32 two-byte entries - confirm against the
 *       ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increment appear to be coverage region counts,
 *       not source tokens.
 */
636 | | TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly, fast path") |
637 | 1 | { |
638 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
/* Only 30 of the 32 entries are listed; C zero-initializes the last two, so the table holds 31 distinct values. */
639 | 1 | float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
640 | 1 | uint16_t lut[32]; |
641 | 1 | ccv_float_to_half_precision(lut_f32, lut, 32); |
642 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
643 | 1 | int i; |
644 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
645 | 2.84k | values[i] = lut[i % 32]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
646 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
647 | 1 | uint8_t* compressed = tensor->data.u8; |
648 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 2); |
649 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match"); |
650 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
651 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
652 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
653 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840); |
654 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
655 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
656 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
657 | 1 | ccfree(values); |
658 | 1 | ccv_nnc_tensor_free(tensor); |
659 | 1 | ccv_nnc_tensor_free(g_tensor); |
660 | 1 | ccv_nnc_tensor_free(gv_tensor); |
661 | 1 | ccv_nnc_tensor_free(v_tensor); |
662 | 1 | } |
663 | | |
/*
 * Test: 6-bit palettization of double data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 doubles cycling through a 64-entry table holding 0..63.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 2130 + 6 * 64 * 8 compressed bytes.
 *       NOTE(review): 2130 equals 2840 * 6 / 8, which is consistent with packed 6-bit indices plus
 *       6 tables (2840 / 512 rounded up) of 64 doubles - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
664 | | TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly, fast path") |
665 | 1 | { |
666 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
667 | 1 | double lut[64]; |
668 | 1 | int i; |
669 | 65 | for (i = 0; i < 64; i++64 ) |
670 | 64 | lut[i] = (double)i; |
671 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
672 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
673 | 2.84k | values[i] = lut[i % 64]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
674 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
675 | 1 | uint8_t* compressed = tensor->data.u8; |
676 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8); |
677 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match"); |
678 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
679 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
680 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
681 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840); |
682 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
683 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
684 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
685 | 1 | ccfree(values); |
686 | 1 | ccv_nnc_tensor_free(tensor); |
687 | 1 | ccv_nnc_tensor_free(g_tensor); |
688 | 1 | ccv_nnc_tensor_free(gv_tensor); |
689 | 1 | ccv_nnc_tensor_free(v_tensor); |
690 | 1 | } |
691 | | |
/*
 * Test: 6-bit palettization of float data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 floats cycling through a 64-entry table holding 0..63.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 2130 + 6 * 64 * 4 compressed bytes.
 *       NOTE(review): 2130 equals 2840 * 6 / 8, which is consistent with packed 6-bit indices plus
 *       6 tables (2840 / 512 rounded up) of 64 floats - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
692 | | TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly, fast path") |
693 | 1 | { |
694 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
695 | 1 | float lut[64]; |
696 | 1 | int i; |
697 | 65 | for (i = 0; i < 64; i++64 ) |
698 | 64 | lut[i] = (float)i; |
699 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
700 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
701 | 2.84k | values[i] = lut[i % 64]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
702 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
703 | 1 | uint8_t* compressed = tensor->data.u8; |
704 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 4); |
705 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match"); |
706 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
707 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
708 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
709 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840); |
710 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
711 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
712 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
713 | 1 | ccfree(values); |
714 | 1 | ccv_nnc_tensor_free(tensor); |
715 | 1 | ccv_nnc_tensor_free(g_tensor); |
716 | 1 | ccv_nnc_tensor_free(gv_tensor); |
717 | 1 | ccv_nnc_tensor_free(v_tensor); |
718 | 1 | } |
719 | | |
/*
 * Test: 6-bit palettization of half-precision data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on CCV_NNC_DATA_TRANSFER_FORWARD being available for either
 *       CCV_NNC_BACKEND_GPU_REF or CCV_NNC_BACKEND_MPS.
 * Data: 8192 uint16_t values cycling through a 64-entry table; the table is built as floats 0..63
 *       and passed through ccv_float_to_half_precision.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ on the raw uint16_t values.
 * Size: the test requires 6144 + 2 * 64 * 2 compressed bytes.
 *       NOTE(review): 6144 equals 8192 * 6 / 8, which is consistent with packed 6-bit indices plus
 *       2 tables (8192 / 4096) of 64 two-byte entries - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
720 | | TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly, fast path") |
721 | 1 | { |
722 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS)); |
723 | 1 | float lut_f32[64]; |
724 | 1 | int i; |
725 | 65 | for (i = 0; i < 64; i++64 ) |
726 | 64 | lut_f32[i] = (float)i; |
727 | 1 | uint16_t lut[64]; |
728 | 1 | ccv_float_to_half_precision(lut_f32, lut, 64); |
729 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 8192); |
730 | 8.19k | for (i = 0; i < 8192; i++8.19k ) |
731 | 8.19k | values[i] = lut[i % 64]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
732 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 2 + 3) / 4), 0); |
733 | 1 | uint8_t* compressed = tensor->data.u8; |
734 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, compressed, 6144 + 2 * 64 * 2); |
735 | 1 | REQUIRE_EQ(output_size, 6144 + 2 * 64 * 2, "output size should match"); |
736 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 2 + 3) / 4), 0); |
737 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
738 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 8192), 0); |
739 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, gv_tensor->data.u8, 8192); |
740 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 8192), 0); |
741 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
742 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 8192, "should be lossless"); |
743 | 1 | ccfree(values); |
744 | 1 | ccv_nnc_tensor_free(tensor); |
745 | 1 | ccv_nnc_tensor_free(g_tensor); |
746 | 1 | ccv_nnc_tensor_free(gv_tensor); |
747 | 1 | ccv_nnc_tensor_free(v_tensor); |
748 | 1 | } |
749 | | |
/*
 * Test: 7-bit palettization of double data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 doubles cycling through a 128-entry table holding 0..127.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 2485 + 6 * 128 * 8 compressed bytes.
 *       NOTE(review): 2485 equals 2840 * 7 / 8, which is consistent with packed 7-bit indices plus
 *       6 tables (2840 / 512 rounded up) of 128 doubles - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
750 | | TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly, fast path") |
751 | 1 | { |
752 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
753 | 1 | double lut[128]; |
754 | 1 | int i; |
755 | 129 | for (i = 0; i < 128; i++128 ) |
756 | 128 | lut[i] = (double)i; |
757 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
758 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
759 | 2.84k | values[i] = lut[i % 128]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
760 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
761 | 1 | uint8_t* compressed = tensor->data.u8; |
762 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 8); |
763 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match"); |
764 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
765 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
766 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
767 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840); |
768 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
769 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
770 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
771 | 1 | ccfree(values); |
772 | 1 | ccv_nnc_tensor_free(tensor); |
773 | 1 | ccv_nnc_tensor_free(g_tensor); |
774 | 1 | ccv_nnc_tensor_free(gv_tensor); |
775 | 1 | ccv_nnc_tensor_free(v_tensor); |
776 | 1 | } |
777 | | |
/*
 * Test: 7-bit palettization of float data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 floats cycling through a 128-entry table holding 0..127.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 2485 + 6 * 128 * 4 compressed bytes.
 *       NOTE(review): 2485 equals 2840 * 7 / 8, which is consistent with packed 7-bit indices plus
 *       6 tables (2840 / 512 rounded up) of 128 floats - confirm against the ccv_nnc_palettize
 *       declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
778 | | TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly, fast path") |
779 | 1 | { |
780 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
781 | 1 | float lut[128]; |
782 | 1 | int i; |
783 | 129 | for (i = 0; i < 128; i++128 ) |
784 | 128 | lut[i] = (float)i; |
785 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
786 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
787 | 2.84k | values[i] = lut[i % 128]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
788 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
789 | 1 | uint8_t* compressed = tensor->data.u8; |
790 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 4); |
791 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match"); |
792 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
793 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
794 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
795 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840); |
796 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
797 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
798 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
799 | 1 | ccfree(values); |
800 | 1 | ccv_nnc_tensor_free(tensor); |
801 | 1 | ccv_nnc_tensor_free(g_tensor); |
802 | 1 | ccv_nnc_tensor_free(gv_tensor); |
803 | 1 | ccv_nnc_tensor_free(v_tensor); |
804 | 1 | } |
805 | | |
/*
 * Test: 7-bit palettization of half-precision data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 uint16_t values cycling through a 128-entry table; the table is built as floats 0..127
 *       and passed through ccv_float_to_half_precision.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ on the raw uint16_t values.
 * Size: the test requires 2485 + 6 * 128 * 2 compressed bytes.
 *       NOTE(review): 2485 equals 2840 * 7 / 8, which is consistent with packed 7-bit indices plus
 *       6 tables (2840 / 512 rounded up) of 128 two-byte entries - confirm against the
 *       ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
806 | | TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly, fast path") |
807 | 1 | { |
808 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
809 | 1 | float lut_f32[128]; |
810 | 1 | int i; |
811 | 129 | for (i = 0; i < 128; i++128 ) |
812 | 128 | lut_f32[i] = (float)i; |
813 | 1 | uint16_t lut[128]; |
814 | 1 | ccv_float_to_half_precision(lut_f32, lut, 128); |
815 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
816 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
817 | 2.84k | values[i] = lut[i % 128]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
818 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
819 | 1 | uint8_t* compressed = tensor->data.u8; |
820 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 2); |
821 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match"); |
822 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
823 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
824 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
825 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840); |
826 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
827 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
828 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
829 | 1 | ccfree(values); |
830 | 1 | ccv_nnc_tensor_free(tensor); |
831 | 1 | ccv_nnc_tensor_free(g_tensor); |
832 | 1 | ccv_nnc_tensor_free(gv_tensor); |
833 | 1 | ccv_nnc_tensor_free(v_tensor); |
834 | 1 | } |
835 | | |
/*
 * Test: 8-bit palettization of double data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 doubles cycling through a 256-entry table holding 0..255.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 2840 + 3 * 256 * 8 compressed bytes.
 *       NOTE(review): this looks like one index byte per element plus 3 tables (2840 / 1280 rounded
 *       up) of 256 doubles - confirm against the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
836 | | TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly, fast path") |
837 | 1 | { |
838 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
839 | 1 | double lut[256]; |
840 | 1 | int i; |
841 | 257 | for (i = 0; i < 256; i++256 ) |
842 | 256 | lut[i] = (double)i; |
843 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
844 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
845 | 2.84k | values[i] = lut[i % 256]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
846 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 8 + 3) / 4), 0); |
847 | 1 | uint8_t* compressed = tensor->data.u8; |
848 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 8); |
849 | 1 | REQUIRE_EQ(output_size, 2840 + 3 * 256 * 8, "output size should match"); |
850 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 8 + 3) / 4), 0); |
851 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
852 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
853 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840); |
854 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
855 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
856 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
857 | 1 | ccfree(values); |
858 | 1 | ccv_nnc_tensor_free(tensor); |
859 | 1 | ccv_nnc_tensor_free(g_tensor); |
860 | 1 | ccv_nnc_tensor_free(gv_tensor); |
861 | 1 | ccv_nnc_tensor_free(v_tensor); |
862 | 1 | } |
863 | | |
/*
 * Test: 8-bit palettization of float data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on CCV_NNC_DATA_TRANSFER_FORWARD being available for either
 *       CCV_NNC_BACKEND_GPU_REF or CCV_NNC_BACKEND_MPS.
 * Data: 8192 floats cycling through a 256-entry table holding 0..255.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ against the input.
 * Size: the test requires 8192 + 2 * 256 * 4 compressed bytes.
 *       NOTE(review): this looks like one index byte per element plus 2 tables (8192 / 4096) of
 *       256 floats - confirm against the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
864 | | TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly, fast path") |
865 | 1 | { |
866 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS)); |
867 | 1 | float lut[256]; |
868 | 1 | int i; |
869 | 257 | for (i = 0; i < 256; i++256 ) |
870 | 256 | lut[i] = (float)i; |
871 | 1 | float* const values = ccmalloc(sizeof(float) * 8192); |
872 | 8.19k | for (i = 0; i < 8192; i++8.19k ) |
873 | 8.19k | values[i] = lut[i % 256]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
874 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (8192 + 2 * 256 * 4 + 3) / 4), 0); |
875 | 1 | uint8_t* compressed = tensor->data.u8; |
876 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 8, 4096, compressed, 8192 + 2 * 256 * 4); |
877 | 1 | REQUIRE_EQ(output_size, 8192 + 2 * 256 * 4, "output size should match"); |
878 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (8192 + 2 * 256 * 4 + 3) / 4), 0); |
879 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
880 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0); |
881 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 4096, gv_tensor->data.u8, 8192); |
882 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0); |
883 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
884 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "should be lossless"); |
885 | 1 | ccfree(values); |
886 | 1 | ccv_nnc_tensor_free(tensor); |
887 | 1 | ccv_nnc_tensor_free(g_tensor); |
888 | 1 | ccv_nnc_tensor_free(gv_tensor); |
889 | 1 | ccv_nnc_tensor_free(v_tensor); |
890 | 1 | } |
891 | | |
/*
 * Test: 8-bit palettization of half-precision data on the CPU, depalettization on the GPU, exact round trip.
 * Gate: GUARD_ELSE_RETURN on ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF).
 * Data: 2840 uint16_t values cycling through a 256-entry table; the table is built as floats 0..255
 *       and passed through ccv_float_to_half_precision.
 * Flow: ccv_nnc_palettize (CPU memory) -> data transfer to a GPU tensor -> ccv_nnc_depalettize
 *       (GPU memory) -> data transfer back to the CPU -> REQUIRE_ARRAY_EQ on the raw uint16_t values.
 * Size: the test requires 2840 + 3 * 256 * 2 compressed bytes.
 *       NOTE(review): this looks like one index byte per element plus 3 tables (2840 / 1280 rounded
 *       up) of 256 two-byte entries - confirm against the ccv_nnc_palettize declaration.
 * Listing note: the digits fused into the for-loop increments appear to be coverage region counts,
 *       not source tokens.
 */
892 | | TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly, fast path") |
893 | 1 | { |
894 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
895 | 1 | float lut_f32[256]; |
896 | 1 | int i; |
897 | 257 | for (i = 0; i < 256; i++256 ) |
898 | 256 | lut_f32[i] = (float)i; |
899 | 1 | uint16_t lut[256]; |
900 | 1 | ccv_float_to_half_precision(lut_f32, lut, 256); |
901 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
902 | 2.84k | for (i = 0; i < 2840; i++2.84k ) |
903 | 2.84k | values[i] = lut[i % 256]; |
/* The staging tensors are typed 32F, so the compressed byte count is rounded up to whole 4-byte elements. */
904 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 2 + 3) / 4), 0); |
905 | 1 | uint8_t* compressed = tensor->data.u8; |
906 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 2); |
907 | 1 | REQUIRE_EQ(output_size, 2840 + 3 * 256 * 2, "output size should match"); |
908 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 2 + 3) / 4), 0); |
909 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
910 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
911 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840); |
912 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
913 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
914 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
915 | 1 | ccfree(values); |
916 | 1 | ccv_nnc_tensor_free(tensor); |
917 | 1 | ccv_nnc_tensor_free(g_tensor); |
918 | 1 | ccv_nnc_tensor_free(gv_tensor); |
919 | 1 | ccv_nnc_tensor_free(v_tensor); |
920 | 1 | } |
921 | | |
922 | | #include "case_main.h" |