/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_palettize.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_internal.h" |
3 | | #ifdef HAVE_CUDA |
4 | | #include "gpu/ccv_nnc_compat.h" |
5 | | #elif defined(HAVE_MPS) |
6 | | #include "mps/ccv_nnc_mps.h" |
7 | | #endif |
8 | | |
9 | | size_t ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) |
10 | 63 | { |
11 | 63 | assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F); |
12 | 63 | assert(memory_type == CCV_TENSOR_CPU_MEMORY); |
13 | 63 | const int num_blocks = (input_length + number_in_blocks - 1) / number_in_blocks; |
14 | 63 | const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); |
15 | 63 | uint8_t* const u8 = (uint8_t*)output; |
16 | 63 | uint8_t* const ui = (uint8_t*)input; |
17 | 63 | assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); |
18 | 63 | if (qbits == 4) |
19 | 14 | { |
20 | 278 | parallel_for14 (i, num_blocks) { |
21 | 278 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
22 | 278 | int* const indices = ccmalloc(sizeof(int) * nI); |
23 | 278 | double centroids[16]; |
24 | 278 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
25 | 278 | ccv_kmeans1d(&a, 16, indices, centroids); |
26 | 278 | uint8_t* u80 = u8 + (16 * element_size + number_in_blocks / 2) * i; |
27 | 278 | int j; |
28 | 278 | if (datatype == CCV_16F) |
29 | 92 | { |
30 | 92 | float* f32 = (float*)centroids; |
31 | 1.56k | for (j = 0; j < 16; j++1.47k ) |
32 | 1.47k | f32[j] = (float)centroids[j]; |
33 | 92 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 16); |
34 | 186 | } else if (datatype == CCV_32F) { |
35 | 94 | float* f32 = (float*)u80; |
36 | 1.59k | for (j = 0; j < 16; j++1.50k ) |
37 | 1.50k | f32[j] = (float)centroids[j]; |
38 | 94 | } else { |
39 | 92 | memcpy(u80, centroids, sizeof(double) * 16); |
40 | 92 | } |
41 | 278 | u80 += 16 * element_size; |
42 | 17.3k | for (j = 0; j < nI; j += 217.0k ) |
43 | 17.0k | { |
44 | 17.0k | const uint8_t i0 = (uint8_t)indices[j]; |
45 | 17.0k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1]17.0k : 06 ; |
46 | 17.0k | *u80 = (i0 << 4) | i1; |
47 | 17.0k | ++u80; |
48 | 17.0k | } |
49 | 278 | ccfree(indices); |
50 | 278 | } parallel_endfor |
51 | 14 | return element_size * num_blocks * 16 + (input_length + 1) / 2; |
52 | 49 | } else if (qbits == 5) { |
53 | 276 | parallel_for12 (i, num_blocks) { |
54 | 276 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
55 | 276 | int* const indices = ccmalloc(sizeof(int) * nI); |
56 | 276 | double centroids[32]; |
57 | 276 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
58 | 276 | ccv_kmeans1d(&a, 32, indices, centroids); |
59 | 276 | uint8_t* u80 = u8 + (32 * element_size + number_in_blocks / 8 * 5) * i; |
60 | 276 | int j; |
61 | 276 | if (datatype == CCV_16F) |
62 | 92 | { |
63 | 92 | float* f32 = (float*)centroids; |
64 | 3.03k | for (j = 0; j < 32; j++2.94k ) |
65 | 2.94k | f32[j] = (float)centroids[j]; |
66 | 92 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 32); |
67 | 184 | } else if (datatype == CCV_32F) { |
68 | 92 | float* f32 = (float*)u80; |
69 | 3.03k | for (j = 0; j < 32; j++2.94k ) |
70 | 2.94k | f32[j] = (float)centroids[j]; |
71 | 92 | } else { |
72 | 92 | memcpy(u80, centroids, sizeof(double) * 32); |
73 | 92 | } |
74 | 276 | u80 += 32 * element_size; |
75 | 4.53k | for (j = 0; j < nI; j += 84.26k ) |
76 | 4.26k | { |
77 | 4.26k | const uint8_t i0 = (uint8_t)indices[j]; |
78 | 4.26k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 00 ; |
79 | 4.26k | const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 00 ; |
80 | 4.26k | const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 00 ; |
81 | 4.26k | const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 00 ; |
82 | 4.26k | const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 00 ; |
83 | 4.26k | const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 00 ; |
84 | 4.26k | const uint8_t i7 = j + 7 < nI ? (uint8_t)indices[j + 7]4.25k : 06 ; |
85 | 4.26k | u80[0] = (i0 << 3) | (i1 >> 2); |
86 | 4.26k | u80[1] = (i1 << 6) | (i2 << 1) | (i3 >> 4); |
87 | 4.26k | u80[2] = (i3 << 4) | (i4 >> 1); |
88 | 4.26k | u80[3] = (i4 << 7) | (i5 << 2) | (i6 >> 3); |
89 | 4.26k | u80[4] = (i6 << 5) | i7; |
90 | 4.26k | u80 += 5; |
91 | 4.26k | } |
92 | 276 | ccfree(indices); |
93 | 276 | } parallel_endfor |
94 | 12 | return element_size * num_blocks * 32 + (input_length + 7) / 8 * 5; |
95 | 37 | } else if (qbits == 6) { |
96 | 80 | parallel_for13 (i, num_blocks) { |
97 | 80 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
98 | 80 | int* const indices = ccmalloc(sizeof(int) * nI); |
99 | 80 | double centroids[64]; |
100 | 80 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
101 | 80 | ccv_kmeans1d(&a, 64, indices, centroids); |
102 | 80 | uint8_t* u80 = u8 + (64 * element_size + number_in_blocks / 4 * 3) * i; |
103 | 80 | int j; |
104 | 80 | if (datatype == CCV_16F) |
105 | 32 | { |
106 | 32 | float* f32 = (float*)centroids; |
107 | 2.08k | for (j = 0; j < 64; j++2.04k ) |
108 | 2.04k | f32[j] = (float)centroids[j]; |
109 | 32 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 64); |
110 | 48 | } else if (datatype == CCV_32F) { |
111 | 24 | float* f32 = (float*)u80; |
112 | 1.56k | for (j = 0; j < 64; j++1.53k ) |
113 | 1.53k | f32[j] = (float)centroids[j]; |
114 | 24 | } else { |
115 | 24 | memcpy(u80, centroids, sizeof(double) * 64); |
116 | 24 | } |
117 | 80 | u80 += 64 * element_size; |
118 | 13.4k | for (j = 0; j < nI; j += 413.3k ) |
119 | 13.3k | { |
120 | 13.3k | const uint8_t i0 = (uint8_t)indices[j]; |
121 | 13.3k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 00 ; |
122 | 13.3k | const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 00 ; |
123 | 13.3k | const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3]13.3k : 06 ; |
124 | 13.3k | u80[0] = (i0 << 2) | (i1 >> 4); |
125 | 13.3k | u80[1] = (i1 << 4) | (i2 >> 2); |
126 | 13.3k | u80[2] = (i2 << 6) | i3; |
127 | 13.3k | u80 += 3; |
128 | 13.3k | } |
129 | 80 | ccfree(indices); |
130 | 80 | } parallel_endfor |
131 | 13 | return element_size * num_blocks * 64 + (input_length + 3) / 4 * 3; |
132 | 24 | } else if (qbits == 7) { |
133 | 72 | parallel_for12 (i, num_blocks) { |
134 | 72 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
135 | 72 | int* const indices = ccmalloc(sizeof(int) * nI); |
136 | 72 | double centroids[128]; |
137 | 72 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
138 | 72 | ccv_kmeans1d(&a, 128, indices, centroids); |
139 | 72 | uint8_t* u80 = u8 + (128 * element_size + number_in_blocks / 8 * 7) * i; |
140 | 72 | int j; |
141 | 72 | if (datatype == CCV_16F) |
142 | 24 | { |
143 | 24 | float* f32 = (float*)centroids; |
144 | 3.09k | for (j = 0; j < 128; j++3.07k ) |
145 | 3.07k | f32[j] = (float)centroids[j]; |
146 | 24 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 128); |
147 | 48 | } else if (datatype == CCV_32F) { |
148 | 24 | float* f32 = (float*)u80; |
149 | 3.09k | for (j = 0; j < 128; j++3.07k ) |
150 | 3.07k | f32[j] = (float)centroids[j]; |
151 | 24 | } else { |
152 | 24 | memcpy(u80, centroids, sizeof(double) * 128); |
153 | 24 | } |
154 | 72 | u80 += 128 * element_size; |
155 | 4.33k | for (j = 0; j < nI; j += 84.26k ) |
156 | 4.26k | { |
157 | 4.26k | const uint8_t i0 = (uint8_t)indices[j]; |
158 | 4.26k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 00 ; |
159 | 4.26k | const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 00 ; |
160 | 4.26k | const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 00 ; |
161 | 4.26k | const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 00 ; |
162 | 4.26k | const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 00 ; |
163 | 4.26k | const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 00 ; |
164 | 4.26k | const uint8_t i7 = j + 7 < nI ? (uint8_t)indices[j + 7]4.25k : 06 ; |
165 | 4.26k | u80[0] = (i0 << 1) | (i1 >> 6); |
166 | 4.26k | u80[1] = (i1 << 2) | (i2 >> 5); |
167 | 4.26k | u80[2] = (i2 << 3) | (i3 >> 4); |
168 | 4.26k | u80[3] = (i3 << 4) | (i4 >> 3); |
169 | 4.26k | u80[4] = (i4 << 5) | (i5 >> 2); |
170 | 4.26k | u80[5] = (i5 << 6) | (i6 >> 1); |
171 | 4.26k | u80[6] = (i6 << 7) | i7; |
172 | 4.26k | u80 += 7; |
173 | 4.26k | } |
174 | 72 | ccfree(indices); |
175 | 72 | } parallel_endfor |
176 | 12 | return element_size * num_blocks * 128 + (input_length + 7) / 8 * 7; |
177 | 12 | } else { |
178 | 35 | parallel_for12 (i, num_blocks) { |
179 | 35 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
180 | 35 | int* const indices = ccmalloc(sizeof(int) * nI); |
181 | 35 | double centroids[256]; |
182 | 35 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
183 | 35 | ccv_kmeans1d(&a, 256, indices, centroids); |
184 | 35 | uint8_t* u80 = u8 + (256 * element_size + number_in_blocks) * i; |
185 | 35 | int j; |
186 | 35 | if (datatype == CCV_16F) |
187 | 12 | { |
188 | 12 | float* f32 = (float*)centroids; |
189 | 3.08k | for (j = 0; j < 256; j++3.07k ) |
190 | 3.07k | f32[j] = (float)centroids[j]; |
191 | 12 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 256); |
192 | 23 | } else if (datatype == CCV_32F) { |
193 | 11 | float* f32 = (float*)u80; |
194 | 2.82k | for (j = 0; j < 256; j++2.81k ) |
195 | 2.81k | f32[j] = (float)centroids[j]; |
196 | 12 | } else { |
197 | 12 | memcpy(u80, centroids, sizeof(double) * 256); |
198 | 12 | } |
199 | 35 | u80 += 256 * element_size; |
200 | 39.4k | for (j = 0; j < nI; j++39.4k ) |
201 | 39.4k | { |
202 | 39.4k | *u80 = (uint8_t)indices[j]; |
203 | 39.4k | ++u80; |
204 | 39.4k | } |
205 | 35 | ccfree(indices); |
206 | 35 | } parallel_endfor |
207 | 12 | return element_size * num_blocks * 256 + input_length; |
208 | 12 | } |
209 | 63 | } |
210 | | |
211 | | static void _ccv_nnc_depalettize(const void* input, const int datatype, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) |
212 | 30 | { |
213 | 30 | assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F); |
214 | 30 | const int num_blocks = (output_length + number_in_blocks - 1) / number_in_blocks; |
215 | 30 | const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); |
216 | 30 | uint8_t* const u8 = (uint8_t*)output; |
217 | 30 | const uint8_t* const ui = (const uint8_t*)input; |
218 | 30 | assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); |
219 | 30 | if (datatype == CCV_16F) |
220 | 10 | { |
221 | 10 | if (qbits == 4) |
222 | 2 | { |
223 | 46 | parallel_for2 (i, num_blocks) { |
224 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
225 | 46 | const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; |
226 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
227 | 46 | const uint16_t* const palette = (uint16_t*)ui0; |
228 | 46 | const uint8_t* ui1 = ui0 + element_size * 16; |
229 | 46 | uint16_t* const f16 = (uint16_t*)u80; |
230 | 46 | int j; |
231 | 46 | if (nI % 2 == 0) |
232 | 45 | { |
233 | 2.87k | for (j = 0; j < nI; j += 22.82k ) |
234 | 2.82k | { |
235 | 2.82k | const uint8_t u0 = *ui1; |
236 | 2.82k | const int i0 = (int)(u0 >> 4); |
237 | 2.82k | const int i1 = (int)(u0 & 15); |
238 | 2.82k | f16[j] = palette[i0]; |
239 | 2.82k | f16[j + 1] = palette[i1]; |
240 | 2.82k | ++ui1; |
241 | 2.82k | } |
242 | 45 | } else { |
243 | 13 | for (j = 0; j < nI; j += 212 ) |
244 | 12 | { |
245 | 12 | const uint8_t u0 = *ui1; |
246 | 12 | const int i0 = (int)(u0 >> 4); |
247 | 12 | const int i1 = (int)(u0 & 15); |
248 | 12 | f16[j] = palette[i0]; |
249 | 12 | if (j + 1 < nI) |
250 | 11 | f16[j + 1] = palette[i1]; |
251 | 12 | ++ui1; |
252 | 12 | } |
253 | 1 | } |
254 | 46 | } parallel_endfor |
255 | 8 | } else if (qbits == 5) { |
256 | 46 | parallel_for2 (i, num_blocks) { |
257 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
258 | 46 | const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; |
259 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
260 | 46 | const uint16_t* const palette = (uint16_t*)ui0; |
261 | 46 | const uint8_t* ui1 = ui0 + element_size * 32; |
262 | 46 | uint16_t* const f16 = (uint16_t*)u80; |
263 | 46 | int j; |
264 | 46 | if (nI % 8 == 0) |
265 | 45 | { |
266 | 752 | for (j = 0; j < nI; j += 8707 ) |
267 | 707 | { |
268 | 707 | const uint8_t u0 = ui1[0]; |
269 | 707 | const uint8_t u1 = ui1[1]; |
270 | 707 | const uint8_t u2 = ui1[2]; |
271 | 707 | const uint8_t u3 = ui1[3]; |
272 | 707 | const uint8_t u4 = ui1[4]; |
273 | 707 | const int i0 = (int)(u0 >> 3); |
274 | 707 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
275 | 707 | const int i2 = (int)((u1 >> 1) & 31); |
276 | 707 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
277 | 707 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
278 | 707 | const int i5 = (int)((u3 >> 2) & 31); |
279 | 707 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
280 | 707 | const int i7 = (int)(u4 & 31); |
281 | 707 | f16[j] = palette[i0]; |
282 | 707 | f16[j + 1] = palette[i1]; |
283 | 707 | f16[j + 2] = palette[i2]; |
284 | 707 | f16[j + 3] = palette[i3]; |
285 | 707 | f16[j + 4] = palette[i4]; |
286 | 707 | f16[j + 5] = palette[i5]; |
287 | 707 | f16[j + 6] = palette[i6]; |
288 | 707 | f16[j + 7] = palette[i7]; |
289 | 707 | ui1 += 5; |
290 | 707 | } |
291 | 45 | } else { |
292 | 4 | for (j = 0; j < nI; j += 83 ) |
293 | 3 | { |
294 | 3 | const uint8_t u0 = ui1[0]; |
295 | 3 | const uint8_t u1 = ui1[1]; |
296 | 3 | const uint8_t u2 = ui1[2]; |
297 | 3 | const uint8_t u3 = ui1[3]; |
298 | 3 | const uint8_t u4 = ui1[4]; |
299 | 3 | const int i0 = (int)(u0 >> 3); |
300 | 3 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
301 | 3 | const int i2 = (int)((u1 >> 1) & 31); |
302 | 3 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
303 | 3 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
304 | 3 | const int i5 = (int)((u3 >> 2) & 31); |
305 | 3 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
306 | 3 | const int i7 = (int)(u4 & 31); |
307 | 3 | f16[j] = palette[i0]; |
308 | 3 | if (j + 1 < nI) |
309 | 3 | f16[j + 1] = palette[i1]; |
310 | 3 | if (j + 2 < nI) |
311 | 3 | f16[j + 2] = palette[i2]; |
312 | 3 | if (j + 3 < nI) |
313 | 3 | f16[j + 3] = palette[i3]; |
314 | 3 | if (j + 4 < nI) |
315 | 3 | f16[j + 4] = palette[i4]; |
316 | 3 | if (j + 5 < nI) |
317 | 3 | f16[j + 5] = palette[i5]; |
318 | 3 | if (j + 6 < nI) |
319 | 3 | f16[j + 6] = palette[i6]; |
320 | 3 | if (j + 7 < nI) |
321 | 2 | f16[j + 7] = palette[i7]; |
322 | 3 | ui1 += 5; |
323 | 3 | } |
324 | 1 | } |
325 | 46 | } parallel_endfor |
326 | 6 | } else if (qbits == 6) { |
327 | 12 | parallel_for2 (i, num_blocks) { |
328 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
329 | 12 | const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; |
330 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
331 | 12 | const uint16_t* const palette = (uint16_t*)ui0; |
332 | 12 | const uint8_t* ui1 = ui0 + element_size * 64; |
333 | 12 | uint16_t* const f16 = (uint16_t*)u80; |
334 | 12 | int j; |
335 | 12 | if (nI % 4 == 0) |
336 | 11 | { |
337 | 1.36k | for (j = 0; j < nI; j += 41.35k ) |
338 | 1.35k | { |
339 | 1.35k | const uint8_t u0 = ui1[0]; |
340 | 1.35k | const uint8_t u1 = ui1[1]; |
341 | 1.35k | const uint8_t u2 = ui1[2]; |
342 | 1.35k | const int i0 = (int)(u0 >> 2); |
343 | 1.35k | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
344 | 1.35k | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
345 | 1.35k | const int i3 = (int)(u2 & 63); |
346 | 1.35k | f16[j] = palette[i0]; |
347 | 1.35k | f16[j + 1] = palette[i1]; |
348 | 1.35k | f16[j + 2] = palette[i2]; |
349 | 1.35k | f16[j + 3] = palette[i3]; |
350 | 1.35k | ui1 += 3; |
351 | 1.35k | } |
352 | 11 | } else { |
353 | 71 | for (j = 0; j < nI; j += 470 ) |
354 | 70 | { |
355 | 70 | const uint8_t u0 = ui1[0]; |
356 | 70 | const uint8_t u1 = ui1[1]; |
357 | 70 | const uint8_t u2 = ui1[2]; |
358 | 70 | const int i0 = (int)(u0 >> 2); |
359 | 70 | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
360 | 70 | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
361 | 70 | const int i3 = (int)(u2 & 63); |
362 | 70 | f16[j] = palette[i0]; |
363 | 70 | if (j + 1 < nI) |
364 | 70 | f16[j + 1] = palette[i1]; |
365 | 70 | if (j + 2 < nI) |
366 | 70 | f16[j + 2] = palette[i2]; |
367 | 70 | if (j + 3 < nI) |
368 | 69 | f16[j + 3] = palette[i3]; |
369 | 70 | ui1 += 3; |
370 | 70 | } |
371 | 1 | } |
372 | 12 | } parallel_endfor |
373 | 4 | } else if (qbits == 7) { |
374 | 12 | parallel_for2 (i, num_blocks) { |
375 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
376 | 12 | const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; |
377 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
378 | 12 | const uint16_t* const palette = (uint16_t*)ui0; |
379 | 12 | const uint8_t* ui1 = ui0 + element_size * 128; |
380 | 12 | uint16_t* const f16 = (uint16_t*)u80; |
381 | 12 | int j; |
382 | 12 | if (nI % 8 == 0) |
383 | 11 | { |
384 | 686 | for (j = 0; j < nI; j += 8675 ) |
385 | 675 | { |
386 | 675 | const uint8_t u0 = ui1[0]; |
387 | 675 | const uint8_t u1 = ui1[1]; |
388 | 675 | const uint8_t u2 = ui1[2]; |
389 | 675 | const uint8_t u3 = ui1[3]; |
390 | 675 | const uint8_t u4 = ui1[4]; |
391 | 675 | const uint8_t u5 = ui1[5]; |
392 | 675 | const uint8_t u6 = ui1[6]; |
393 | 675 | const int i0 = (int)(u0 >> 1); |
394 | 675 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
395 | 675 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
396 | 675 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
397 | 675 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
398 | 675 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
399 | 675 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
400 | 675 | const int i7 = (int)(u6 & 127); |
401 | 675 | f16[j] = palette[i0]; |
402 | 675 | f16[j + 1] = palette[i1]; |
403 | 675 | f16[j + 2] = palette[i2]; |
404 | 675 | f16[j + 3] = palette[i3]; |
405 | 675 | f16[j + 4] = palette[i4]; |
406 | 675 | f16[j + 5] = palette[i5]; |
407 | 675 | f16[j + 6] = palette[i6]; |
408 | 675 | f16[j + 7] = palette[i7]; |
409 | 675 | ui1 += 7; |
410 | 675 | } |
411 | 11 | } else { |
412 | 36 | for (j = 0; j < nI; j += 835 ) |
413 | 35 | { |
414 | 35 | const uint8_t u0 = ui1[0]; |
415 | 35 | const uint8_t u1 = ui1[1]; |
416 | 35 | const uint8_t u2 = ui1[2]; |
417 | 35 | const uint8_t u3 = ui1[3]; |
418 | 35 | const uint8_t u4 = ui1[4]; |
419 | 35 | const uint8_t u5 = ui1[5]; |
420 | 35 | const uint8_t u6 = ui1[6]; |
421 | 35 | const int i0 = (int)(u0 >> 1); |
422 | 35 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
423 | 35 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
424 | 35 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
425 | 35 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
426 | 35 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
427 | 35 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
428 | 35 | const int i7 = (int)(u6 & 127); |
429 | 35 | f16[j] = palette[i0]; |
430 | 35 | if (j + 1 < nI) |
431 | 35 | f16[j + 1] = palette[i1]; |
432 | 35 | if (j + 2 < nI) |
433 | 35 | f16[j + 2] = palette[i2]; |
434 | 35 | if (j + 3 < nI) |
435 | 35 | f16[j + 3] = palette[i3]; |
436 | 35 | if (j + 4 < nI) |
437 | 35 | f16[j + 4] = palette[i4]; |
438 | 35 | if (j + 5 < nI) |
439 | 35 | f16[j + 5] = palette[i5]; |
440 | 35 | if (j + 6 < nI) |
441 | 35 | f16[j + 6] = palette[i6]; |
442 | 35 | if (j + 7 < nI) |
443 | 34 | f16[j + 7] = palette[i7]; |
444 | 35 | ui1 += 7; |
445 | 35 | } |
446 | 1 | } |
447 | 12 | } parallel_endfor |
448 | 2 | } else { |
449 | 6 | parallel_for2 (i, num_blocks) { |
450 | 6 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
451 | 6 | const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; |
452 | 6 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
453 | 6 | const uint16_t* const palette = (uint16_t*)ui0; |
454 | 6 | const uint8_t* ui1 = ui0 + element_size * 256; |
455 | 6 | uint16_t* const f16 = (uint16_t*)u80; |
456 | 6 | int j; |
457 | 5.68k | for (j = 0; j < nI; j++5.67k ) |
458 | 5.67k | { |
459 | 5.67k | const uint8_t u0 = *ui1; |
460 | 5.67k | f16[j] = palette[u0]; |
461 | 5.67k | ++ui1; |
462 | 5.67k | } |
463 | 6 | } parallel_endfor |
464 | 2 | } |
465 | 20 | } else if (datatype == CCV_32F) { |
466 | 10 | if (qbits == 4) |
467 | 2 | { |
468 | 46 | parallel_for2 (i, num_blocks) { |
469 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
470 | 46 | const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; |
471 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
472 | 46 | const float* const palette = (float*)ui0; |
473 | 46 | const uint8_t* ui1 = ui0 + element_size * 16; |
474 | 46 | float* const f32 = (float*)u80; |
475 | 46 | int j; |
476 | 46 | if (nI % 2 == 0) |
477 | 45 | { |
478 | 2.87k | for (j = 0; j < nI; j += 22.82k ) |
479 | 2.82k | { |
480 | 2.82k | const uint8_t u0 = *ui1; |
481 | 2.82k | const int i0 = (int)(u0 >> 4); |
482 | 2.82k | const int i1 = (int)(u0 & 15); |
483 | 2.82k | f32[j] = palette[i0]; |
484 | 2.82k | f32[j + 1] = palette[i1]; |
485 | 2.82k | ++ui1; |
486 | 2.82k | } |
487 | 45 | } else { |
488 | 13 | for (j = 0; j < nI; j += 212 ) |
489 | 12 | { |
490 | 12 | const uint8_t u0 = *ui1; |
491 | 12 | const int i0 = (int)(u0 >> 4); |
492 | 12 | const int i1 = (int)(u0 & 15); |
493 | 12 | f32[j] = palette[i0]; |
494 | 12 | if (j + 1 < nI) |
495 | 11 | f32[j + 1] = palette[i1]; |
496 | 12 | ++ui1; |
497 | 12 | } |
498 | 1 | } |
499 | 46 | } parallel_endfor |
500 | 8 | } else if (qbits == 5) { |
501 | 46 | parallel_for2 (i, num_blocks) { |
502 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
503 | 46 | const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; |
504 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
505 | 46 | const float* const palette = (float*)ui0; |
506 | 46 | const uint8_t* ui1 = ui0 + element_size * 32; |
507 | 46 | float* const f32 = (float*)u80; |
508 | 46 | int j; |
509 | 46 | if (nI % 8 == 0) |
510 | 45 | { |
511 | 752 | for (j = 0; j < nI; j += 8707 ) |
512 | 707 | { |
513 | 707 | const uint8_t u0 = ui1[0]; |
514 | 707 | const uint8_t u1 = ui1[1]; |
515 | 707 | const uint8_t u2 = ui1[2]; |
516 | 707 | const uint8_t u3 = ui1[3]; |
517 | 707 | const uint8_t u4 = ui1[4]; |
518 | 707 | const int i0 = (int)(u0 >> 3); |
519 | 707 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
520 | 707 | const int i2 = (int)((u1 >> 1) & 31); |
521 | 707 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
522 | 707 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
523 | 707 | const int i5 = (int)((u3 >> 2) & 31); |
524 | 707 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
525 | 707 | const int i7 = (int)(u4 & 31); |
526 | 707 | f32[j] = palette[i0]; |
527 | 707 | f32[j + 1] = palette[i1]; |
528 | 707 | f32[j + 2] = palette[i2]; |
529 | 707 | f32[j + 3] = palette[i3]; |
530 | 707 | f32[j + 4] = palette[i4]; |
531 | 707 | f32[j + 5] = palette[i5]; |
532 | 707 | f32[j + 6] = palette[i6]; |
533 | 707 | f32[j + 7] = palette[i7]; |
534 | 707 | ui1 += 5; |
535 | 707 | } |
536 | 45 | } else { |
537 | 4 | for (j = 0; j < nI; j += 83 ) |
538 | 3 | { |
539 | 3 | const uint8_t u0 = ui1[0]; |
540 | 3 | const uint8_t u1 = ui1[1]; |
541 | 3 | const uint8_t u2 = ui1[2]; |
542 | 3 | const uint8_t u3 = ui1[3]; |
543 | 3 | const uint8_t u4 = ui1[4]; |
544 | 3 | const int i0 = (int)(u0 >> 3); |
545 | 3 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
546 | 3 | const int i2 = (int)((u1 >> 1) & 31); |
547 | 3 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
548 | 3 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
549 | 3 | const int i5 = (int)((u3 >> 2) & 31); |
550 | 3 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
551 | 3 | const int i7 = (int)(u4 & 31); |
552 | 3 | f32[j] = palette[i0]; |
553 | 3 | if (j + 1 < nI) |
554 | 3 | f32[j + 1] = palette[i1]; |
555 | 3 | if (j + 2 < nI) |
556 | 3 | f32[j + 2] = palette[i2]; |
557 | 3 | if (j + 3 < nI) |
558 | 3 | f32[j + 3] = palette[i3]; |
559 | 3 | if (j + 4 < nI) |
560 | 3 | f32[j + 4] = palette[i4]; |
561 | 3 | if (j + 5 < nI) |
562 | 3 | f32[j + 5] = palette[i5]; |
563 | 3 | if (j + 6 < nI) |
564 | 3 | f32[j + 6] = palette[i6]; |
565 | 3 | if (j + 7 < nI) |
566 | 2 | f32[j + 7] = palette[i7]; |
567 | 3 | ui1 += 5; |
568 | 3 | } |
569 | 1 | } |
570 | 46 | } parallel_endfor |
571 | 6 | } else if (qbits == 6) { |
572 | 12 | parallel_for2 (i, num_blocks) { |
573 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
574 | 12 | const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; |
575 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
576 | 12 | const float* const palette = (float*)ui0; |
577 | 12 | const uint8_t* ui1 = ui0 + element_size * 64; |
578 | 12 | float* const f32 = (float*)u80; |
579 | 12 | int j; |
580 | 12 | if (nI % 4 == 0) |
581 | 11 | { |
582 | 1.36k | for (j = 0; j < nI; j += 41.35k ) |
583 | 1.35k | { |
584 | 1.35k | const uint8_t u0 = ui1[0]; |
585 | 1.35k | const uint8_t u1 = ui1[1]; |
586 | 1.35k | const uint8_t u2 = ui1[2]; |
587 | 1.35k | const int i0 = (int)(u0 >> 2); |
588 | 1.35k | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
589 | 1.35k | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
590 | 1.35k | const int i3 = (int)(u2 & 63); |
591 | 1.35k | f32[j] = palette[i0]; |
592 | 1.35k | f32[j + 1] = palette[i1]; |
593 | 1.35k | f32[j + 2] = palette[i2]; |
594 | 1.35k | f32[j + 3] = palette[i3]; |
595 | 1.35k | ui1 += 3; |
596 | 1.35k | } |
597 | 11 | } else { |
598 | 71 | for (j = 0; j < nI; j += 470 ) |
599 | 70 | { |
600 | 70 | const uint8_t u0 = ui1[0]; |
601 | 70 | const uint8_t u1 = ui1[1]; |
602 | 70 | const uint8_t u2 = ui1[2]; |
603 | 70 | const int i0 = (int)(u0 >> 2); |
604 | 70 | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
605 | 70 | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
606 | 70 | const int i3 = (int)(u2 & 63); |
607 | 70 | f32[j] = palette[i0]; |
608 | 70 | if (j + 1 < nI) |
609 | 70 | f32[j + 1] = palette[i1]; |
610 | 70 | if (j + 2 < nI) |
611 | 70 | f32[j + 2] = palette[i2]; |
612 | 70 | if (j + 3 < nI) |
613 | 69 | f32[j + 3] = palette[i3]; |
614 | 70 | ui1 += 3; |
615 | 70 | } |
616 | 1 | } |
617 | 12 | } parallel_endfor |
618 | 4 | } else if (qbits == 7) { |
619 | 12 | parallel_for2 (i, num_blocks) { |
620 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
621 | 12 | const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; |
622 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
623 | 12 | const float* const palette = (float*)ui0; |
624 | 12 | const uint8_t* ui1 = ui0 + element_size * 128; |
625 | 12 | float* const f32 = (float*)u80; |
626 | 12 | int j; |
627 | 12 | if (nI % 8 == 0) |
628 | 11 | { |
629 | 686 | for (j = 0; j < nI; j += 8675 ) |
630 | 675 | { |
631 | 675 | const uint8_t u0 = ui1[0]; |
632 | 675 | const uint8_t u1 = ui1[1]; |
633 | 675 | const uint8_t u2 = ui1[2]; |
634 | 675 | const uint8_t u3 = ui1[3]; |
635 | 675 | const uint8_t u4 = ui1[4]; |
636 | 675 | const uint8_t u5 = ui1[5]; |
637 | 675 | const uint8_t u6 = ui1[6]; |
638 | 675 | const int i0 = (int)(u0 >> 1); |
639 | 675 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
640 | 675 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
641 | 675 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
642 | 675 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
643 | 675 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
644 | 675 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
645 | 675 | const int i7 = (int)(u6 & 127); |
646 | 675 | f32[j] = palette[i0]; |
647 | 675 | f32[j + 1] = palette[i1]; |
648 | 675 | f32[j + 2] = palette[i2]; |
649 | 675 | f32[j + 3] = palette[i3]; |
650 | 675 | f32[j + 4] = palette[i4]; |
651 | 675 | f32[j + 5] = palette[i5]; |
652 | 675 | f32[j + 6] = palette[i6]; |
653 | 675 | f32[j + 7] = palette[i7]; |
654 | 675 | ui1 += 7; |
655 | 675 | } |
656 | 11 | } else { |
657 | 36 | for (j = 0; j < nI; j += 835 ) |
658 | 35 | { |
659 | 35 | const uint8_t u0 = ui1[0]; |
660 | 35 | const uint8_t u1 = ui1[1]; |
661 | 35 | const uint8_t u2 = ui1[2]; |
662 | 35 | const uint8_t u3 = ui1[3]; |
663 | 35 | const uint8_t u4 = ui1[4]; |
664 | 35 | const uint8_t u5 = ui1[5]; |
665 | 35 | const uint8_t u6 = ui1[6]; |
666 | 35 | const int i0 = (int)(u0 >> 1); |
667 | 35 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
668 | 35 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
669 | 35 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
670 | 35 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
671 | 35 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
672 | 35 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
673 | 35 | const int i7 = (int)(u6 & 127); |
674 | 35 | f32[j] = palette[i0]; |
675 | 35 | if (j + 1 < nI) |
676 | 35 | f32[j + 1] = palette[i1]; |
677 | 35 | if (j + 2 < nI) |
678 | 35 | f32[j + 2] = palette[i2]; |
679 | 35 | if (j + 3 < nI) |
680 | 35 | f32[j + 3] = palette[i3]; |
681 | 35 | if (j + 4 < nI) |
682 | 35 | f32[j + 4] = palette[i4]; |
683 | 35 | if (j + 5 < nI) |
684 | 35 | f32[j + 5] = palette[i5]; |
685 | 35 | if (j + 6 < nI) |
686 | 35 | f32[j + 6] = palette[i6]; |
687 | 35 | if (j + 7 < nI) |
688 | 34 | f32[j + 7] = palette[i7]; |
689 | 35 | ui1 += 7; |
690 | 35 | } |
691 | 1 | } |
692 | 12 | } parallel_endfor |
693 | 2 | } else { |
694 | 6 | parallel_for2 (i, num_blocks) { |
695 | 6 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
696 | 6 | const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; |
697 | 6 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
698 | 6 | const float* const palette = (float*)ui0; |
699 | 6 | const uint8_t* ui1 = ui0 + element_size * 256; |
700 | 6 | float* const f32 = (float*)u80; |
701 | 6 | int j; |
702 | 5.68k | for (j = 0; j < nI; j++5.67k ) |
703 | 5.67k | { |
704 | 5.67k | const uint8_t u0 = *ui1; |
705 | 5.67k | f32[j] = palette[u0]; |
706 | 5.67k | ++ui1; |
707 | 5.67k | } |
708 | 6 | } parallel_endfor |
709 | 2 | } |
710 | 10 | } else { |
711 | 10 | if (qbits == 4) |
712 | 2 | { |
713 | 46 | parallel_for2 (i, num_blocks) { |
714 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
715 | 46 | const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; |
716 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
717 | 46 | const double* const palette = (double*)ui0; |
718 | 46 | const uint8_t* ui1 = ui0 + element_size * 16; |
719 | 46 | double* const f64 = (double*)u80; |
720 | 46 | int j; |
721 | 46 | if (nI % 2 == 0) |
722 | 45 | { |
723 | 2.87k | for (j = 0; j < nI; j += 22.82k ) |
724 | 2.82k | { |
725 | 2.82k | const uint8_t u0 = *ui1; |
726 | 2.82k | const int i0 = (int)(u0 >> 4); |
727 | 2.82k | const int i1 = (int)(u0 & 15); |
728 | 2.82k | f64[j] = palette[i0]; |
729 | 2.82k | f64[j + 1] = palette[i1]; |
730 | 2.82k | ++ui1; |
731 | 2.82k | } |
732 | 45 | } else { |
733 | 13 | for (j = 0; j < nI; j += 212 ) |
734 | 12 | { |
735 | 12 | const uint8_t u0 = *ui1; |
736 | 12 | const int i0 = (int)(u0 >> 4); |
737 | 12 | const int i1 = (int)(u0 & 15); |
738 | 12 | f64[j] = palette[i0]; |
739 | 12 | if (j + 1 < nI) |
740 | 11 | f64[j + 1] = palette[i1]; |
741 | 12 | ++ui1; |
742 | 12 | } |
743 | 1 | } |
744 | 46 | } parallel_endfor |
745 | 8 | } else if (qbits == 5) { |
746 | 46 | parallel_for2 (i, num_blocks) { |
747 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
748 | 46 | const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; |
749 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
750 | 46 | const double* const palette = (double*)ui0; |
751 | 46 | const uint8_t* ui1 = ui0 + element_size * 32; |
752 | 46 | double* const f64 = (double*)u80; |
753 | 46 | int j; |
754 | 46 | if (nI % 8 == 0) |
755 | 45 | { |
756 | 752 | for (j = 0; j < nI; j += 8707 ) |
757 | 707 | { |
758 | 707 | const uint8_t u0 = ui1[0]; |
759 | 707 | const uint8_t u1 = ui1[1]; |
760 | 707 | const uint8_t u2 = ui1[2]; |
761 | 707 | const uint8_t u3 = ui1[3]; |
762 | 707 | const uint8_t u4 = ui1[4]; |
763 | 707 | const int i0 = (int)(u0 >> 3); |
764 | 707 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
765 | 707 | const int i2 = (int)((u1 >> 1) & 31); |
766 | 707 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
767 | 707 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
768 | 707 | const int i5 = (int)((u3 >> 2) & 31); |
769 | 707 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
770 | 707 | const int i7 = (int)(u4 & 31); |
771 | 707 | f64[j] = palette[i0]; |
772 | 707 | f64[j + 1] = palette[i1]; |
773 | 707 | f64[j + 2] = palette[i2]; |
774 | 707 | f64[j + 3] = palette[i3]; |
775 | 707 | f64[j + 4] = palette[i4]; |
776 | 707 | f64[j + 5] = palette[i5]; |
777 | 707 | f64[j + 6] = palette[i6]; |
778 | 707 | f64[j + 7] = palette[i7]; |
779 | 707 | ui1 += 5; |
780 | 707 | } |
781 | 45 | } else { |
782 | 4 | for (j = 0; j < nI; j += 83 ) |
783 | 3 | { |
784 | 3 | const uint8_t u0 = ui1[0]; |
785 | 3 | const uint8_t u1 = ui1[1]; |
786 | 3 | const uint8_t u2 = ui1[2]; |
787 | 3 | const uint8_t u3 = ui1[3]; |
788 | 3 | const uint8_t u4 = ui1[4]; |
789 | 3 | const int i0 = (int)(u0 >> 3); |
790 | 3 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
791 | 3 | const int i2 = (int)((u1 >> 1) & 31); |
792 | 3 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
793 | 3 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
794 | 3 | const int i5 = (int)((u3 >> 2) & 31); |
795 | 3 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
796 | 3 | const int i7 = (int)(u4 & 31); |
797 | 3 | f64[j] = palette[i0]; |
798 | 3 | if (j + 1 < nI) |
799 | 3 | f64[j + 1] = palette[i1]; |
800 | 3 | if (j + 2 < nI) |
801 | 3 | f64[j + 2] = palette[i2]; |
802 | 3 | if (j + 3 < nI) |
803 | 3 | f64[j + 3] = palette[i3]; |
804 | 3 | if (j + 4 < nI) |
805 | 3 | f64[j + 4] = palette[i4]; |
806 | 3 | if (j + 5 < nI) |
807 | 3 | f64[j + 5] = palette[i5]; |
808 | 3 | if (j + 6 < nI) |
809 | 3 | f64[j + 6] = palette[i6]; |
810 | 3 | if (j + 7 < nI) |
811 | 2 | f64[j + 7] = palette[i7]; |
812 | 3 | ui1 += 5; |
813 | 3 | } |
814 | 1 | } |
815 | 46 | } parallel_endfor |
816 | 6 | } else if (qbits == 6) { |
817 | 12 | parallel_for2 (i, num_blocks) { |
818 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
819 | 12 | const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; |
820 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
821 | 12 | const double* const palette = (double*)ui0; |
822 | 12 | const uint8_t* ui1 = ui0 + element_size * 64; |
823 | 12 | double* const f64 = (double*)u80; |
824 | 12 | int j; |
825 | 12 | if (nI % 4 == 0) |
826 | 11 | { |
827 | 1.36k | for (j = 0; j < nI; j += 41.35k ) |
828 | 1.35k | { |
829 | 1.35k | const uint8_t u0 = ui1[0]; |
830 | 1.35k | const uint8_t u1 = ui1[1]; |
831 | 1.35k | const uint8_t u2 = ui1[2]; |
832 | 1.35k | const int i0 = (int)(u0 >> 2); |
833 | 1.35k | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
834 | 1.35k | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
835 | 1.35k | const int i3 = (int)(u2 & 63); |
836 | 1.35k | f64[j] = palette[i0]; |
837 | 1.35k | f64[j + 1] = palette[i1]; |
838 | 1.35k | f64[j + 2] = palette[i2]; |
839 | 1.35k | f64[j + 3] = palette[i3]; |
840 | 1.35k | ui1 += 3; |
841 | 1.35k | } |
842 | 11 | } else { |
843 | 71 | for (j = 0; j < nI; j += 470 ) |
844 | 70 | { |
845 | 70 | const uint8_t u0 = ui1[0]; |
846 | 70 | const uint8_t u1 = ui1[1]; |
847 | 70 | const uint8_t u2 = ui1[2]; |
848 | 70 | const int i0 = (int)(u0 >> 2); |
849 | 70 | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
850 | 70 | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
851 | 70 | const int i3 = (int)(u2 & 63); |
852 | 70 | f64[j] = palette[i0]; |
853 | 70 | if (j + 1 < nI) |
854 | 70 | f64[j + 1] = palette[i1]; |
855 | 70 | if (j + 2 < nI) |
856 | 70 | f64[j + 2] = palette[i2]; |
857 | 70 | if (j + 3 < nI) |
858 | 69 | f64[j + 3] = palette[i3]; |
859 | 70 | ui1 += 3; |
860 | 70 | } |
861 | 1 | } |
862 | 12 | } parallel_endfor |
863 | 4 | } else if (qbits == 7) { |
864 | 12 | parallel_for2 (i, num_blocks) { |
865 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
866 | 12 | const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; |
867 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
868 | 12 | const double* const palette = (double*)ui0; |
869 | 12 | const uint8_t* ui1 = ui0 + element_size * 128; |
870 | 12 | double* const f64 = (double*)u80; |
871 | 12 | int j; |
872 | 12 | if (nI % 8 == 0) |
873 | 11 | { |
874 | 686 | for (j = 0; j < nI; j += 8675 ) |
875 | 675 | { |
876 | 675 | const uint8_t u0 = ui1[0]; |
877 | 675 | const uint8_t u1 = ui1[1]; |
878 | 675 | const uint8_t u2 = ui1[2]; |
879 | 675 | const uint8_t u3 = ui1[3]; |
880 | 675 | const uint8_t u4 = ui1[4]; |
881 | 675 | const uint8_t u5 = ui1[5]; |
882 | 675 | const uint8_t u6 = ui1[6]; |
883 | 675 | const int i0 = (int)(u0 >> 1); |
884 | 675 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
885 | 675 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
886 | 675 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
887 | 675 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
888 | 675 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
889 | 675 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
890 | 675 | const int i7 = (int)(u6 & 127); |
891 | 675 | f64[j] = palette[i0]; |
892 | 675 | f64[j + 1] = palette[i1]; |
893 | 675 | f64[j + 2] = palette[i2]; |
894 | 675 | f64[j + 3] = palette[i3]; |
895 | 675 | f64[j + 4] = palette[i4]; |
896 | 675 | f64[j + 5] = palette[i5]; |
897 | 675 | f64[j + 6] = palette[i6]; |
898 | 675 | f64[j + 7] = palette[i7]; |
899 | 675 | ui1 += 7; |
900 | 675 | } |
901 | 11 | } else { |
902 | 36 | for (j = 0; j < nI; j += 835 ) |
903 | 35 | { |
904 | 35 | const uint8_t u0 = ui1[0]; |
905 | 35 | const uint8_t u1 = ui1[1]; |
906 | 35 | const uint8_t u2 = ui1[2]; |
907 | 35 | const uint8_t u3 = ui1[3]; |
908 | 35 | const uint8_t u4 = ui1[4]; |
909 | 35 | const uint8_t u5 = ui1[5]; |
910 | 35 | const uint8_t u6 = ui1[6]; |
911 | 35 | const int i0 = (int)(u0 >> 1); |
912 | 35 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
913 | 35 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
914 | 35 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
915 | 35 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
916 | 35 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
917 | 35 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
918 | 35 | const int i7 = (int)(u6 & 127); |
919 | 35 | f64[j] = palette[i0]; |
920 | 35 | if (j + 1 < nI) |
921 | 35 | f64[j + 1] = palette[i1]; |
922 | 35 | if (j + 2 < nI) |
923 | 35 | f64[j + 2] = palette[i2]; |
924 | 35 | if (j + 3 < nI) |
925 | 35 | f64[j + 3] = palette[i3]; |
926 | 35 | if (j + 4 < nI) |
927 | 35 | f64[j + 4] = palette[i4]; |
928 | 35 | if (j + 5 < nI) |
929 | 35 | f64[j + 5] = palette[i5]; |
930 | 35 | if (j + 6 < nI) |
931 | 35 | f64[j + 6] = palette[i6]; |
932 | 35 | if (j + 7 < nI) |
933 | 34 | f64[j + 7] = palette[i7]; |
934 | 35 | ui1 += 7; |
935 | 35 | } |
936 | 1 | } |
937 | 12 | } parallel_endfor |
938 | 2 | } else { |
939 | 6 | parallel_for2 (i, num_blocks) { |
940 | 6 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
941 | 6 | const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; |
942 | 6 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
943 | 6 | const double* const palette = (double*)ui0; |
944 | 6 | const uint8_t* ui1 = ui0 + element_size * 256; |
945 | 6 | double* const f64 = (double*)u80; |
946 | 6 | int j; |
947 | 5.68k | for (j = 0; j < nI; j++5.67k ) |
948 | 5.67k | { |
949 | 5.67k | const uint8_t u0 = *ui1; |
950 | 5.67k | f64[j] = palette[u0]; |
951 | 5.67k | ++ui1; |
952 | 5.67k | } |
953 | 6 | } parallel_endfor |
954 | 2 | } |
955 | 10 | } |
956 | 30 | } |
957 | | |
958 | | void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) |
959 | 60 | { |
960 | 60 | assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY); |
961 | 60 | if (memory_type == CCV_TENSOR_CPU_MEMORY) |
962 | 30 | _ccv_nnc_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length); |
963 | 30 | else { |
964 | 30 | #ifdef HAVE_CUDA |
965 | 30 | ccv_nnc_compat_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0); |
966 | | #elif defined(HAVE_MPS) |
967 | | ccv_nnc_mps_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0); |
968 | | #else |
969 | | assert(memory_type == CCV_TENSOR_CPU_MEMORY); |
970 | | #endif |
971 | 30 | } |
972 | 60 | } |