/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/palettize.tests.c
Line | Count | Source |
1 | | #include "case.h" |
2 | | #include "ccv_case.h" |
3 | | #include "ccv_nnc_case.h" |
4 | | #include <ccv.h> |
5 | | #include <nnc/ccv_nnc.h> |
6 | | #include <nnc/ccv_nnc_easy.h> |
7 | | #include "3rdparty/dsfmt/dSFMT.h" |
8 | | |
9 | | TEST_SETUP() |
10 | | { |
11 | | ccv_nnc_init(); |
12 | | } |
13 | | |
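/*
 * Editor's note (not part of the original test file): every TEST_CASE below compares the
 * return value of ccv_nnc_palettize() against a hand-computed constant such as 1420 + 2944.
 * All of these constants follow the same arithmetic: a packed index payload of
 * ceil(count * qbits / 8) bytes, plus one palette of 2^qbits entries for every block of
 * number_in_blocks elements, with each palette entry as wide as the element type. The
 * hypothetical helper below is only an illustrative sketch of that formula, not a ccv API;
 * for example, palettize_expected_size(2839, 4, 128, sizeof(double)) evaluates to 1420 + 2944.
 */
static inline size_t palettize_expected_size(const size_t count, const size_t qbits, const size_t number_in_blocks, const size_t element_size)
{
	/* hypothetical helper for illustration only */
	const size_t payload_size = (count * qbits + 7) / 8; /* packed qbits-wide indices, rounded up to whole bytes */
	const size_t num_blocks = (count + number_in_blocks - 1) / number_in_blocks; /* one palette (LUT) per block */
	return payload_size + num_blocks * ((size_t)1 << qbits) * element_size;
}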
14 | | TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly") |
15 | | { |
16 | | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
17 | | double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
18 | | double* const values = ccmalloc(sizeof(double) * 2839); |
19 | | int i; |
20 | | for (i = 0; i < 2839; i++) |
21 | | values[i] = lut[i % 16]; |
22 | | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0); |
23 | | uint8_t* compressed = tensor->data.u8; |
24 | | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944); |
25 | | REQUIRE_EQ(output_size, 1420 + 2944, "output size should match"); |
26 | | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0); |
27 | | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
28 | | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
29 | | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839); |
30 | | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
31 | | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
32 | | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
33 | | ccfree(values); |
34 | | ccv_nnc_tensor_free(tensor); |
35 | | ccv_nnc_tensor_free(g_tensor); |
36 | | ccv_nnc_tensor_free(gv_tensor); |
37 | | ccv_nnc_tensor_free(v_tensor); |
38 | | } |
39 | | |
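/*
 * In the 4-bit double case above, 1420 = ceil(2839 * 4 / 8) bytes of packed indices and
 * 2944 = 23 * 16 * 8 bytes of palettes (ceil(2839 / 128) = 23 blocks, 16 double entries each).
 * The float and half-precision variants below only shrink the palette term (2944 / 2 and
 * 2944 / 4); the index payload is independent of the element type. Note also that the
 * 16-entry lut lists just 15 initializers, so lut[15] stays 0.0 -- still exactly
 * representable, so the round trip remains lossless.
 */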
40 | | TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly") |
41 | 1 | { |
42 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
43 | 1 | float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
44 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
45 | 1 | int i; |
46 | 2.84k | for (i = 0; i < 2839; i++) |
47 | 2.83k | values[i] = lut[i % 16]; |
48 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0); |
49 | 1 | uint8_t* compressed = tensor->data.u8; |
50 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 2); |
51 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match"); |
52 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0); |
53 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
54 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
55 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839); |
56 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
57 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
58 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
59 | 1 | ccfree(values); |
60 | 1 | ccv_nnc_tensor_free(tensor); |
61 | 1 | ccv_nnc_tensor_free(g_tensor); |
62 | 1 | ccv_nnc_tensor_free(gv_tensor); |
63 | 1 | ccv_nnc_tensor_free(v_tensor); |
64 | 1 | } |
65 | | |
66 | | TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly") |
67 | 1 | { |
68 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
69 | 1 | float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
70 | 1 | uint16_t lut[16]; |
71 | 1 | ccv_float_to_half_precision(lut_f32, lut, 16); |
72 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
73 | 1 | int i; |
74 | 2.84k | for (i = 0; i < 2839; i++) |
75 | 2.83k | values[i] = lut[i % 16]; |
76 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0); |
77 | 1 | uint8_t* compressed = tensor->data.u8; |
78 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 4); |
79 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match"); |
80 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0); |
81 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
82 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
83 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2839); |
84 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
85 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
86 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); |
87 | 1 | ccfree(values); |
88 | 1 | ccv_nnc_tensor_free(tensor); |
89 | 1 | ccv_nnc_tensor_free(g_tensor); |
90 | 1 | ccv_nnc_tensor_free(gv_tensor); |
91 | 1 | ccv_nnc_tensor_free(v_tensor); |
92 | 1 | } |
93 | | |
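/*
 * 5-bit cases: the payload grows to ceil(2839 * 5 / 8) = 1775 bytes, and each of the
 * ceil(2839 / 128) = 23 per-block palettes now holds 32 entries, hence
 * 1775 + 23 * 32 * sizeof(element) for the expected size.
 */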
94 | | TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly") |
95 | 1 | { |
96 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
97 | 1 | double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
98 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
99 | 1 | int i; |
100 | 2.84k | for (i = 0; i < 2839; i++) |
101 | 2.83k | values[i] = lut[i % 32]; |
102 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
103 | 1 | uint8_t* compressed = tensor->data.u8; |
104 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 8); |
105 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match"); |
106 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
107 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
108 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
109 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839); |
110 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
111 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
112 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
113 | 1 | ccfree(values); |
114 | 1 | ccv_nnc_tensor_free(tensor); |
115 | 1 | ccv_nnc_tensor_free(g_tensor); |
116 | 1 | ccv_nnc_tensor_free(gv_tensor); |
117 | 1 | ccv_nnc_tensor_free(v_tensor); |
118 | 1 | } |
119 | | |
120 | | TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly") |
121 | 1 | { |
122 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
123 | 1 | float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
124 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
125 | 1 | int i; |
126 | 2.84k | for (i = 0; i < 2839; i++) |
127 | 2.83k | values[i] = lut[i % 32]; |
128 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
129 | 1 | uint8_t* compressed = tensor->data.u8; |
130 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 4); |
131 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match"); |
132 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
133 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
134 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
135 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839); |
136 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
137 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
138 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
139 | 1 | ccfree(values); |
140 | 1 | ccv_nnc_tensor_free(tensor); |
141 | 1 | ccv_nnc_tensor_free(g_tensor); |
142 | 1 | ccv_nnc_tensor_free(gv_tensor); |
143 | 1 | ccv_nnc_tensor_free(v_tensor); |
144 | 1 | } |
145 | | |
146 | | TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly") |
147 | | { |
148 | | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
149 | | float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
150 | | uint16_t lut[32]; |
151 | | ccv_float_to_half_precision(lut_f32, lut, 32); |
152 | | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
153 | | int i; |
154 | | for (i = 0; i < 2839; i++) |
155 | | values[i] = lut[i % 32]; |
156 | | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
157 | | uint8_t* compressed = tensor->data.u8; |
158 | | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 2); |
159 | | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match"); |
160 | | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
161 | | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
162 | | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
163 | | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2839); |
164 | | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
165 | | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
166 | | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); |
167 | | ccfree(values); |
168 | | ccv_nnc_tensor_free(tensor); |
169 | | ccv_nnc_tensor_free(g_tensor); |
170 | | ccv_nnc_tensor_free(gv_tensor); |
171 | | ccv_nnc_tensor_free(v_tensor); |
172 | | } |
173 | | |
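/*
 * 6-bit cases: number_in_blocks rises to 512, giving ceil(2839 / 512) = 6 palettes of
 * 64 entries, on top of a ceil(2839 * 6 / 8) = 2130-byte index payload.
 */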
174 | | TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly") |
175 | 1 | { |
176 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
177 | 1 | double lut[64]; |
178 | 1 | int i; |
179 | 65 | for (i = 0; i < 64; i++) |
180 | 64 | lut[i] = (double)i; |
181 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
182 | 2.84k | for (i = 0; i < 2839; i++) |
183 | 2.83k | values[i] = lut[i % 64]; |
184 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
185 | 1 | uint8_t* compressed = tensor->data.u8; |
186 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 8); |
187 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match"); |
188 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
189 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
190 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
191 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839); |
192 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
193 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
194 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
195 | 1 | ccfree(values); |
196 | 1 | ccv_nnc_tensor_free(tensor); |
197 | 1 | ccv_nnc_tensor_free(g_tensor); |
198 | 1 | ccv_nnc_tensor_free(gv_tensor); |
199 | 1 | ccv_nnc_tensor_free(v_tensor); |
200 | 1 | } |
201 | | |
202 | | TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly") |
203 | 1 | { |
204 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
205 | 1 | float lut[64]; |
206 | 1 | int i; |
207 | 65 | for (i = 0; i < 64; i++) |
208 | 64 | lut[i] = (float)i; |
209 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
210 | 2.84k | for (i = 0; i < 2839; i++) |
211 | 2.83k | values[i] = lut[i % 64]; |
212 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
213 | 1 | uint8_t* compressed = tensor->data.u8; |
214 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 4); |
215 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match"); |
216 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
217 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
218 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
219 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839); |
220 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
221 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
222 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
223 | 1 | ccfree(values); |
224 | 1 | ccv_nnc_tensor_free(tensor); |
225 | 1 | ccv_nnc_tensor_free(g_tensor); |
226 | 1 | ccv_nnc_tensor_free(gv_tensor); |
227 | 1 | ccv_nnc_tensor_free(v_tensor); |
228 | 1 | } |
229 | | |
230 | | TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly") |
231 | 1 | { |
232 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
233 | 1 | float lut_f32[64]; |
234 | 1 | int i; |
235 | 65 | for (i = 0; i < 64; i++) |
236 | 64 | lut_f32[i] = (float)i; |
237 | 1 | uint16_t lut[64]; |
238 | 1 | ccv_float_to_half_precision(lut_f32, lut, 64); |
239 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
240 | 2.84k | for (i = 0; i < 2839; i++) |
241 | 2.83k | values[i] = lut[i % 64]; |
242 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 2 + 3) / 4), 0); |
243 | 1 | uint8_t* compressed = tensor->data.u8; |
244 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 2); |
245 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match"); |
246 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 2 + 3) / 4), 0); |
247 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
248 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
249 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2839); |
250 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
251 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
252 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); |
253 | 1 | ccfree(values); |
254 | 1 | ccv_nnc_tensor_free(tensor); |
255 | 1 | ccv_nnc_tensor_free(g_tensor); |
256 | 1 | ccv_nnc_tensor_free(gv_tensor); |
257 | 1 | ccv_nnc_tensor_free(v_tensor); |
258 | 1 | } |
259 | | |
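/*
 * 7-bit cases: still 512-element blocks, so 6 palettes of 128 entries plus a
 * ceil(2839 * 7 / 8) = 2485-byte index payload.
 */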
260 | | TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly") |
261 | 1 | { |
262 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
263 | 1 | double lut[128]; |
264 | 1 | int i; |
265 | 129 | for (i = 0; i < 128; i++) |
266 | 128 | lut[i] = (double)i; |
267 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
268 | 2.84k | for (i = 0; i < 2839; i++) |
269 | 2.83k | values[i] = lut[i % 128]; |
270 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
271 | 1 | uint8_t* compressed = tensor->data.u8; |
272 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 8); |
273 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match"); |
274 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
275 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
276 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
277 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839); |
278 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
279 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
280 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
281 | 1 | ccfree(values); |
282 | 1 | ccv_nnc_tensor_free(tensor); |
283 | 1 | ccv_nnc_tensor_free(g_tensor); |
284 | 1 | ccv_nnc_tensor_free(gv_tensor); |
285 | 1 | ccv_nnc_tensor_free(v_tensor); |
286 | 1 | } |
287 | | |
288 | | TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly") |
289 | 1 | { |
290 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
291 | 1 | float lut[128]; |
292 | 1 | int i; |
293 | 129 | for (i = 0; i < 128; i++) |
294 | 128 | lut[i] = (float)i; |
295 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
296 | 2.84k | for (i = 0; i < 2839; i++) |
297 | 2.83k | values[i] = lut[i % 128]; |
298 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
299 | 1 | uint8_t* compressed = tensor->data.u8; |
300 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 4); |
301 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match"); |
302 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
303 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
304 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
305 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839); |
306 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
307 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
308 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
309 | 1 | ccfree(values); |
310 | 1 | ccv_nnc_tensor_free(tensor); |
311 | 1 | ccv_nnc_tensor_free(g_tensor); |
312 | 1 | ccv_nnc_tensor_free(gv_tensor); |
313 | 1 | ccv_nnc_tensor_free(v_tensor); |
314 | 1 | } |
315 | | |
316 | | TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly") |
317 | 1 | { |
318 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
319 | 1 | float lut_f32[128]; |
320 | 1 | int i; |
321 | 129 | for (i = 0; i < 128; i++) |
322 | 128 | lut_f32[i] = (float)i; |
323 | 1 | uint16_t lut[128]; |
324 | 1 | ccv_float_to_half_precision(lut_f32, lut, 128); |
325 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
326 | 2.84k | for (i = 0; i < 2839; i++) |
327 | 2.83k | values[i] = lut[i % 128]; |
328 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
329 | 1 | uint8_t* compressed = tensor->data.u8; |
330 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 2); |
331 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match"); |
332 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
333 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
334 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
335 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2839); |
336 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
337 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
338 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); |
339 | 1 | ccfree(values); |
340 | 1 | ccv_nnc_tensor_free(tensor); |
341 | 1 | ccv_nnc_tensor_free(g_tensor); |
342 | 1 | ccv_nnc_tensor_free(gv_tensor); |
343 | 1 | ccv_nnc_tensor_free(v_tensor); |
344 | 1 | } |
345 | | |
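/*
 * 8-bit cases: each index occupies exactly one byte (2839 bytes total), and
 * number_in_blocks rises to 1280, so ceil(2839 / 1280) = 3 palettes of 256 entries.
 */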
346 | | TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly") |
347 | 1 | { |
348 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
349 | 1 | double lut[256]; |
350 | 1 | int i; |
351 | 257 | for (i = 0; i < 256; i++) |
352 | 256 | lut[i] = (double)i; |
353 | 1 | double* const values = ccmalloc(sizeof(double) * 2839); |
354 | 2.84k | for (i = 0; i < 2839; i++) |
355 | 2.83k | values[i] = lut[i % 256]; |
356 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 8 + 3) / 4), 0); |
357 | 1 | uint8_t* compressed = tensor->data.u8; |
358 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 8); |
359 | 1 | REQUIRE_EQ(output_size, 2839 + 3 * 256 * 8, "output size should match"); |
360 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 8 + 3) / 4), 0); |
361 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
362 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2839), 0); |
363 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839); |
364 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2839), 0); |
365 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
366 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2839, "should be lossless"); |
367 | 1 | ccfree(values); |
368 | 1 | ccv_nnc_tensor_free(tensor); |
369 | 1 | ccv_nnc_tensor_free(g_tensor); |
370 | 1 | ccv_nnc_tensor_free(gv_tensor); |
371 | 1 | ccv_nnc_tensor_free(v_tensor); |
372 | 1 | } |
373 | | |
374 | | TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly") |
375 | 1 | { |
376 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
377 | 1 | float lut[256]; |
378 | 1 | int i; |
379 | 257 | for (i = 0; i < 256; i++) |
380 | 256 | lut[i] = (float)i; |
381 | 1 | float* const values = ccmalloc(sizeof(float) * 2839); |
382 | 2.84k | for (i = 0; i < 2839; i++) |
383 | 2.83k | values[i] = lut[i % 256]; |
384 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 4 + 3) / 4), 0); |
385 | 1 | uint8_t* compressed = tensor->data.u8; |
386 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 4); |
387 | 1 | REQUIRE_EQ(output_size, 2839 + 3 * 256 * 4, "output size should match"); |
388 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 4 + 3) / 4), 0); |
389 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
390 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2839), 0); |
391 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839); |
392 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2839), 0); |
393 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
394 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2839, "should be lossless"); |
395 | 1 | ccfree(values); |
396 | 1 | ccv_nnc_tensor_free(tensor); |
397 | 1 | ccv_nnc_tensor_free(g_tensor); |
398 | 1 | ccv_nnc_tensor_free(gv_tensor); |
399 | 1 | ccv_nnc_tensor_free(v_tensor); |
400 | 1 | } |
401 | | |
402 | | TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly") |
403 | 1 | { |
404 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
405 | 1 | float lut_f32[256]; |
406 | 1 | int i; |
407 | 257 | for (i = 0; i < 256; i++) |
408 | 256 | lut_f32[i] = (float)i; |
409 | 1 | uint16_t lut[256]; |
410 | 1 | ccv_float_to_half_precision(lut_f32, lut, 256); |
411 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); |
412 | 2.84k | for (i = 0; i < 2839; i++) |
413 | 2.83k | values[i] = lut[i % 256]; |
414 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2839 + 3 * 256 * 2 + 3) / 4), 0); |
415 | 1 | uint8_t* compressed = tensor->data.u8; |
416 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 2); |
417 | 1 | REQUIRE_EQ(output_size, 2839 + 3 * 256 * 2, "output size should match"); |
418 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2839 + 3 * 256 * 2 + 3) / 4), 0); |
419 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
420 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2839), 0); |
421 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2839); |
422 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2839), 0); |
423 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
424 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2839, "should be lossless"); |
425 | 1 | ccfree(values); |
426 | 1 | ccv_nnc_tensor_free(tensor); |
427 | 1 | ccv_nnc_tensor_free(g_tensor); |
428 | 1 | ccv_nnc_tensor_free(gv_tensor); |
429 | 1 | ccv_nnc_tensor_free(v_tensor); |
430 | 1 | } |
431 | | |
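/*
 * The "fast path" variants below repeat the same round trips with different element
 * counts (2840 instead of 2839, or 8192 with 4096-element blocks). Presumably these
 * counts line up with the kernels' vectorized/aligned path -- hence the name -- but the
 * expected-size arithmetic is computed exactly as before.
 */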
432 | | TEST_CASE("quantize double to 4-bit and dequantize on GPU losslessly, fast path") |
433 | 1 | { |
434 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
435 | 1 | double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
436 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
437 | 1 | int i; |
438 | 2.84k | for (i = 0; i < 2840; i++) |
439 | 2.84k | values[i] = lut[i % 16]; |
440 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 + 3) / 4), 0); |
441 | 1 | uint8_t* compressed = tensor->data.u8; |
442 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944); |
443 | 1 | REQUIRE_EQ(output_size, 1420 + 2944, "output size should match"); |
444 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 + 3) / 4), 0); |
445 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
446 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
447 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840); |
448 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
449 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
450 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
451 | 1 | ccfree(values); |
452 | 1 | ccv_nnc_tensor_free(tensor); |
453 | 1 | ccv_nnc_tensor_free(g_tensor); |
454 | 1 | ccv_nnc_tensor_free(gv_tensor); |
455 | 1 | ccv_nnc_tensor_free(v_tensor); |
456 | 1 | } |
457 | | |
458 | | TEST_CASE("quantize float to 4-bit and dequantize on GPU losslessly, fast path") |
459 | 1 | { |
460 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
461 | 1 | float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
462 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
463 | 1 | int i; |
464 | 2.84k | for (i = 0; i < 2840; i++) |
465 | 2.84k | values[i] = lut[i % 16]; |
466 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 2 + 3) / 4), 0); |
467 | 1 | uint8_t* compressed = tensor->data.u8; |
468 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 2); |
469 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match"); |
470 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 2 + 3) / 4), 0); |
471 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
472 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
473 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840); |
474 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
475 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
476 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
477 | 1 | ccfree(values); |
478 | 1 | ccv_nnc_tensor_free(tensor); |
479 | 1 | ccv_nnc_tensor_free(g_tensor); |
480 | 1 | ccv_nnc_tensor_free(gv_tensor); |
481 | 1 | ccv_nnc_tensor_free(v_tensor); |
482 | 1 | } |
483 | | |
484 | | TEST_CASE("quantize half-precision to 4-bit and dequantize on GPU losslessly, fast path") |
485 | 1 | { |
486 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
487 | 1 | float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; |
488 | 1 | uint16_t lut[16]; |
489 | 1 | ccv_float_to_half_precision(lut_f32, lut, 16); |
490 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
491 | 1 | int i; |
492 | 2.84k | for (i = 0; i < 2840; i++) |
493 | 2.84k | values[i] = lut[i % 16]; |
494 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1420 + 2944 / 4 + 3) / 4), 0); |
495 | 1 | uint8_t* compressed = tensor->data.u8; |
496 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 4); |
497 | 1 | REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match"); |
498 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1420 + 2944 / 4 + 3) / 4), 0); |
499 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
500 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
501 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 4, 128, gv_tensor->data.u8, 2840); |
502 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
503 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
504 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
505 | 1 | ccfree(values); |
506 | 1 | ccv_nnc_tensor_free(tensor); |
507 | 1 | ccv_nnc_tensor_free(g_tensor); |
508 | 1 | ccv_nnc_tensor_free(gv_tensor); |
509 | 1 | ccv_nnc_tensor_free(v_tensor); |
510 | 1 | } |
511 | | |
512 | | TEST_CASE("quantize double to 5-bit and dequantize on GPU losslessly, fast path") |
513 | 1 | { |
514 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
515 | 1 | double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
516 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
517 | 1 | int i; |
518 | 2.84k | for (i = 0; i < 2840; i++) |
519 | 2.84k | values[i] = lut[i % 32]; |
520 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
521 | 1 | uint8_t* compressed = tensor->data.u8; |
522 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 8); |
523 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match"); |
524 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 8 + 3) / 4), 0); |
525 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
526 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
527 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840); |
528 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
529 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
530 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
531 | 1 | ccfree(values); |
532 | 1 | ccv_nnc_tensor_free(tensor); |
533 | 1 | ccv_nnc_tensor_free(g_tensor); |
534 | 1 | ccv_nnc_tensor_free(gv_tensor); |
535 | 1 | ccv_nnc_tensor_free(v_tensor); |
536 | 1 | } |
537 | | |
538 | | TEST_CASE("quantize float to 5-bit and dequantize on GPU losslessly, fast path") |
539 | 1 | { |
540 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
541 | 1 | float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
542 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
543 | 1 | int i; |
544 | 2.84k | for (i = 0; i < 2840; i++) |
545 | 2.84k | values[i] = lut[i % 32]; |
546 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
547 | 1 | uint8_t* compressed = tensor->data.u8; |
548 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 4); |
549 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match"); |
550 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 4 + 3) / 4), 0); |
551 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
552 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
553 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840); |
554 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
555 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
556 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
557 | 1 | ccfree(values); |
558 | 1 | ccv_nnc_tensor_free(tensor); |
559 | 1 | ccv_nnc_tensor_free(g_tensor); |
560 | 1 | ccv_nnc_tensor_free(gv_tensor); |
561 | 1 | ccv_nnc_tensor_free(v_tensor); |
562 | 1 | } |
563 | | |
564 | | TEST_CASE("quantize half-precision to 5-bit and dequantize on GPU losslessly, fast path") |
565 | 1 | { |
566 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
567 | 1 | float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0}; |
568 | 1 | uint16_t lut[32]; |
569 | 1 | ccv_float_to_half_precision(lut_f32, lut, 32); |
570 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
571 | 1 | int i; |
572 | 2.84k | for (i = 0; i < 2840; i++) |
573 | 2.84k | values[i] = lut[i % 32]; |
574 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
575 | 1 | uint8_t* compressed = tensor->data.u8; |
576 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 2); |
577 | 1 | REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match"); |
578 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (1775 + 23 * 32 * 2 + 3) / 4), 0); |
579 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
580 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
581 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 128, gv_tensor->data.u8, 2840); |
582 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
583 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
584 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
585 | 1 | ccfree(values); |
586 | 1 | ccv_nnc_tensor_free(tensor); |
587 | 1 | ccv_nnc_tensor_free(g_tensor); |
588 | 1 | ccv_nnc_tensor_free(gv_tensor); |
589 | 1 | ccv_nnc_tensor_free(v_tensor); |
590 | 1 | } |
591 | | |
592 | | TEST_CASE("quantize double to 6-bit and dequantize on GPU losslessly, fast path") |
593 | 1 | { |
594 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
595 | 1 | double lut[64]; |
596 | 1 | int i; |
597 | 65 | for (i = 0; i < 64; i++) |
598 | 64 | lut[i] = (double)i; |
599 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
600 | 2.84k | for (i = 0; i < 2840; i++) |
601 | 2.84k | values[i] = lut[i % 64]; |
602 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
603 | 1 | uint8_t* compressed = tensor->data.u8; |
604 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8); |
605 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match"); |
606 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 8 + 3) / 4), 0); |
607 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
608 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
609 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840); |
610 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
611 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
612 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
613 | 1 | ccfree(values); |
614 | 1 | ccv_nnc_tensor_free(tensor); |
615 | 1 | ccv_nnc_tensor_free(g_tensor); |
616 | 1 | ccv_nnc_tensor_free(gv_tensor); |
617 | 1 | ccv_nnc_tensor_free(v_tensor); |
618 | 1 | } |
619 | | |
620 | | TEST_CASE("quantize float to 6-bit and dequantize on GPU losslessly, fast path") |
621 | 1 | { |
622 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
623 | 1 | float lut[64]; |
624 | 1 | int i; |
625 | 65 | for (i = 0; i < 64; i++) |
626 | 64 | lut[i] = (float)i; |
627 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
628 | 2.84k | for (i = 0; i < 2840; i++) |
629 | 2.84k | values[i] = lut[i % 64]; |
630 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
631 | 1 | uint8_t* compressed = tensor->data.u8; |
632 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 4); |
633 | 1 | REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match"); |
634 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2130 + 6 * 64 * 4 + 3) / 4), 0); |
635 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
636 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
637 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 512, gv_tensor->data.u8, 2840); |
638 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
639 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
640 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
641 | 1 | ccfree(values); |
642 | 1 | ccv_nnc_tensor_free(tensor); |
643 | 1 | ccv_nnc_tensor_free(g_tensor); |
644 | 1 | ccv_nnc_tensor_free(gv_tensor); |
645 | 1 | ccv_nnc_tensor_free(v_tensor); |
646 | 1 | } |
647 | | |
648 | | TEST_CASE("quantize half-precision to 6-bit and dequantize on GPU losslessly, fast path") |
649 | 1 | { |
650 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS)); |
651 | 1 | float lut_f32[64]; |
652 | 1 | int i; |
653 | 65 | for (i = 0; i < 64; i++) |
654 | 64 | lut_f32[i] = (float)i; |
655 | 1 | uint16_t lut[64]; |
656 | 1 | ccv_float_to_half_precision(lut_f32, lut, 64); |
657 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 8192); |
658 | 8.19k | for (i = 0; i < 8192; i++) |
659 | 8.19k | values[i] = lut[i % 64]; |
660 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 2 + 3) / 4), 0); |
661 | 1 | uint8_t* compressed = tensor->data.u8; |
662 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, compressed, 6144 + 2 * 64 * 2); |
663 | 1 | REQUIRE_EQ(output_size, 6144 + 2 * 64 * 2, "output size should match"); |
664 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 2 + 3) / 4), 0); |
665 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
666 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 8192), 0); |
667 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, gv_tensor->data.u8, 8192); |
668 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 8192), 0); |
669 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
670 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 8192, "should be lossless"); |
671 | 1 | ccfree(values); |
672 | 1 | ccv_nnc_tensor_free(tensor); |
673 | 1 | ccv_nnc_tensor_free(g_tensor); |
674 | 1 | ccv_nnc_tensor_free(gv_tensor); |
675 | 1 | ccv_nnc_tensor_free(v_tensor); |
676 | 1 | } |
677 | | |
678 | | TEST_CASE("quantize double to 7-bit and dequantize on GPU losslessly, fast path") |
679 | 1 | { |
680 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
681 | 1 | double lut[128]; |
682 | 1 | int i; |
683 | 129 | for (i = 0; i < 128; i++) |
684 | 128 | lut[i] = (double)i; |
685 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
686 | 2.84k | for (i = 0; i < 2840; i++) |
687 | 2.84k | values[i] = lut[i % 128]; |
688 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
689 | 1 | uint8_t* compressed = tensor->data.u8; |
690 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 8); |
691 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match"); |
692 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 8 + 3) / 4), 0); |
693 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
694 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
695 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840); |
696 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
697 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
698 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
699 | 1 | ccfree(values); |
700 | 1 | ccv_nnc_tensor_free(tensor); |
701 | 1 | ccv_nnc_tensor_free(g_tensor); |
702 | 1 | ccv_nnc_tensor_free(gv_tensor); |
703 | 1 | ccv_nnc_tensor_free(v_tensor); |
704 | 1 | } |
705 | | |
706 | | TEST_CASE("quantize float to 7-bit and dequantize on GPU losslessly, fast path") |
707 | 1 | { |
708 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
709 | 1 | float lut[128]; |
710 | 1 | int i; |
711 | 129 | for (i = 0; i < 128; i++) |
712 | 128 | lut[i] = (float)i; |
713 | 1 | float* const values = ccmalloc(sizeof(float) * 2840); |
714 | 2.84k | for (i = 0; i < 2840; i++) |
715 | 2.84k | values[i] = lut[i % 128]; |
716 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
717 | 1 | uint8_t* compressed = tensor->data.u8; |
718 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 4); |
719 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match"); |
720 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 4 + 3) / 4), 0); |
721 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
722 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 2840), 0); |
723 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840); |
724 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2840), 0); |
725 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
726 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 2840, "should be lossless"); |
727 | 1 | ccfree(values); |
728 | 1 | ccv_nnc_tensor_free(tensor); |
729 | 1 | ccv_nnc_tensor_free(g_tensor); |
730 | 1 | ccv_nnc_tensor_free(gv_tensor); |
731 | 1 | ccv_nnc_tensor_free(v_tensor); |
732 | 1 | } |
733 | | |
734 | | TEST_CASE("quantize half-precision to 7-bit and dequantize on GPU losslessly, fast path") |
735 | 1 | { |
736 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
737 | 1 | float lut_f32[128]; |
738 | 1 | int i; |
739 | 129 | for (i = 0; i < 128; i++) |
740 | 128 | lut_f32[i] = (float)i; |
741 | 1 | uint16_t lut[128]; |
742 | 1 | ccv_float_to_half_precision(lut_f32, lut, 128); |
743 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
744 | 2.84k | for (i = 0; i < 2840; i++) |
745 | 2.84k | values[i] = lut[i % 128]; |
746 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
747 | 1 | uint8_t* compressed = tensor->data.u8; |
748 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 2); |
749 | 1 | REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match"); |
750 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2485 + 6 * 128 * 2 + 3) / 4), 0); |
751 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
752 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
753 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 7, 512, gv_tensor->data.u8, 2840); |
754 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
755 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
756 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
757 | 1 | ccfree(values); |
758 | 1 | ccv_nnc_tensor_free(tensor); |
759 | 1 | ccv_nnc_tensor_free(g_tensor); |
760 | 1 | ccv_nnc_tensor_free(gv_tensor); |
761 | 1 | ccv_nnc_tensor_free(v_tensor); |
762 | 1 | } |
763 | | |
764 | | TEST_CASE("quantize double to 8-bit and dequantize on GPU losslessly, fast path") |
765 | 1 | { |
766 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
767 | 1 | double lut[256]; |
768 | 1 | int i; |
769 | 257 | for (i = 0; i < 256; i++) |
770 | 256 | lut[i] = (double)i; |
771 | 1 | double* const values = ccmalloc(sizeof(double) * 2840); |
772 | 2.84k | for (i = 0; i < 2840; i++) |
773 | 2.84k | values[i] = lut[i % 256]; |
774 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 8 + 3) / 4), 0); |
775 | 1 | uint8_t* compressed = tensor->data.u8; |
776 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 8); |
777 | 1 | REQUIRE_EQ(output_size, 2840 + 3 * 256 * 8, "output size should match"); |
778 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 8 + 3) / 4), 0); |
779 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
780 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 64F, 2840), 0); |
781 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_64F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840); |
782 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(64F, 2840), 0); |
783 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
784 | 1 | REQUIRE_ARRAY_EQ(double, values, v_tensor->data.f64, 2840, "should be lossless"); |
785 | 1 | ccfree(values); |
786 | 1 | ccv_nnc_tensor_free(tensor); |
787 | 1 | ccv_nnc_tensor_free(g_tensor); |
788 | 1 | ccv_nnc_tensor_free(gv_tensor); |
789 | 1 | ccv_nnc_tensor_free(v_tensor); |
790 | 1 | } |
791 | | |
792 | | TEST_CASE("quantize float to 8-bit and dequantize on GPU losslessly, fast path") |
793 | 1 | { |
794 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS)); |
795 | 1 | float lut[256]; |
796 | 1 | int i; |
797 | 257 | for (i = 0; i < 256; i++) |
798 | 256 | lut[i] = (float)i; |
799 | 1 | float* const values = ccmalloc(sizeof(float) * 8192); |
800 | 8.19k | for (i = 0; i < 8192; i++) |
801 | 8.19k | values[i] = lut[i % 256]; |
802 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (8192 + 2 * 256 * 4 + 3) / 4), 0); |
803 | 1 | uint8_t* compressed = tensor->data.u8; |
804 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 8, 4096, compressed, 8192 + 2 * 256 * 4); |
805 | 1 | REQUIRE_EQ(output_size, 8192 + 2 * 256 * 4, "output size should match"); |
806 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (8192 + 2 * 256 * 4 + 3) / 4), 0); |
807 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
808 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0); |
809 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 4096, gv_tensor->data.u8, 8192); |
810 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0); |
811 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
812 | 1 | REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "should be lossless"); |
813 | 1 | ccfree(values); |
814 | 1 | ccv_nnc_tensor_free(tensor); |
815 | 1 | ccv_nnc_tensor_free(g_tensor); |
816 | 1 | ccv_nnc_tensor_free(gv_tensor); |
817 | 1 | ccv_nnc_tensor_free(v_tensor); |
818 | 1 | } |
819 | | |
820 | | TEST_CASE("quantize half-precision to 8-bit and dequantize on GPU losslessly, fast path") |
821 | 1 | { |
822 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)); |
823 | 1 | float lut_f32[256]; |
824 | 1 | int i; |
825 | 257 | for (i = 0; i < 256; i++) |
826 | 256 | lut_f32[i] = (float)i; |
827 | 1 | uint16_t lut[256]; |
828 | 1 | ccv_float_to_half_precision(lut_f32, lut, 256); |
829 | 1 | uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840); |
830 | 2.84k | for (i = 0; i < 2840; i++) |
831 | 2.84k | values[i] = lut[i % 256]; |
832 | 1 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2840 + 3 * 256 * 2 + 3) / 4), 0); |
833 | 1 | uint8_t* compressed = tensor->data.u8; |
834 | 1 | const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 2); |
835 | 1 | REQUIRE_EQ(output_size, 2840 + 3 * 256 * 2, "output size should match"); |
836 | 1 | ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2840 + 3 * 256 * 2 + 3) / 4), 0); |
837 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0); |
838 | 1 | ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 2840), 0); |
839 | 1 | ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 1280, gv_tensor->data.u8, 2840); |
840 | 1 | ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 2840), 0); |
841 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0); |
842 | 1 | REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 2840, "should be lossless"); |
843 | 1 | ccfree(values); |
844 | 1 | ccv_nnc_tensor_free(tensor); |
845 | 1 | ccv_nnc_tensor_free(g_tensor); |
846 | 1 | ccv_nnc_tensor_free(gv_tensor); |
847 | 1 | ccv_nnc_tensor_free(v_tensor); |
848 | 1 | } |
849 | | |
850 | | #include "case_main.h" |