/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_palettize.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_internal.h" |
3 | | #ifdef HAVE_CUDA |
4 | | #include "gpu/ccv_nnc_compat.h" |
5 | | #elif defined(HAVE_MPS) |
6 | | #include "mps/ccv_nnc_mps.h" |
7 | | #endif |
8 | | |
9 | | size_t ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) |
10 | 63 | { |
11 | 63 | assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F); |
12 | 63 | assert(memory_type == CCV_TENSOR_CPU_MEMORY); |
13 | 63 | const int num_blocks = (input_length + number_in_blocks - 1) / number_in_blocks; |
14 | 63 | const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); |
15 | 63 | uint8_t* const u8 = (uint8_t*)output; |
16 | 63 | uint8_t* const ui = (uint8_t*)input; |
17 | 63 | assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); |
18 | 63 | if (qbits == 4) |
19 | 14 | { |
20 | 278 | parallel_for14 (i, num_blocks) { |
21 | 278 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
22 | 278 | int* const indices = ccmalloc(sizeof(int) * nI); |
23 | 278 | double centroids[16]; |
24 | 278 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
25 | 278 | ccv_kmeans1d(&a, 16, indices, centroids); |
26 | 278 | uint8_t* u80 = u8 + (16 * element_size + number_in_blocks / 2) * i; |
27 | 278 | int j; |
28 | 278 | if (datatype == CCV_16F) |
29 | 92 | { |
30 | 92 | float* f32 = (float*)centroids; |
31 | 1.56k | for (j = 0; j < 16; j++1.47k ) |
32 | 1.47k | f32[j] = (float)centroids[j]; |
33 | 92 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 16); |
34 | 186 | } else if (datatype == CCV_16BF) { |
35 | 0 | float* f32 = (float*)centroids; |
36 | 0 | for (j = 0; j < 16; j++) |
37 | 0 | f32[j] = (float)centroids[j]; |
38 | 0 | ccv_float_to_bfloat(f32, (uint16_t*)u80, 16); |
39 | 186 | } else if (datatype == CCV_32F) { |
40 | 94 | float* f32 = (float*)u80; |
41 | 1.59k | for (j = 0; j < 16; j++1.50k ) |
42 | 1.50k | f32[j] = (float)centroids[j]; |
43 | 94 | } else { |
44 | 92 | memcpy(u80, centroids, sizeof(double) * 16); |
45 | 92 | } |
46 | 278 | u80 += 16 * element_size; |
47 | 17.3k | for (j = 0; j < nI; j += 217.0k ) |
48 | 17.0k | { |
49 | 17.0k | const uint8_t i0 = (uint8_t)indices[j]; |
50 | 17.0k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1]17.0k : 06 ; |
51 | 17.0k | *u80 = (i0 << 4) | i1; |
52 | 17.0k | ++u80; |
53 | 17.0k | } |
54 | 278 | ccfree(indices); |
55 | 278 | } parallel_endfor |
56 | 14 | return element_size * num_blocks * 16 + (input_length + 1) / 2; |
57 | 49 | } else if (qbits == 5) { |
58 | 276 | parallel_for12 (i, num_blocks) { |
59 | 276 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
60 | 276 | int* const indices = ccmalloc(sizeof(int) * nI); |
61 | 276 | double centroids[32]; |
62 | 276 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
63 | 276 | ccv_kmeans1d(&a, 32, indices, centroids); |
64 | 276 | uint8_t* u80 = u8 + (32 * element_size + number_in_blocks / 8 * 5) * i; |
65 | 276 | int j; |
66 | 276 | if (datatype == CCV_16F) |
67 | 92 | { |
68 | 92 | float* f32 = (float*)centroids; |
69 | 3.03k | for (j = 0; j < 32; j++2.94k ) |
70 | 2.94k | f32[j] = (float)centroids[j]; |
71 | 92 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 32); |
72 | 184 | } else if (datatype == CCV_16BF) { |
73 | 0 | float* f32 = (float*)centroids; |
74 | 0 | for (j = 0; j < 32; j++) |
75 | 0 | f32[j] = (float)centroids[j]; |
76 | 0 | ccv_float_to_bfloat(f32, (uint16_t*)u80, 32); |
77 | 184 | } else if (datatype == CCV_32F) { |
78 | 92 | float* f32 = (float*)u80; |
79 | 3.03k | for (j = 0; j < 32; j++2.94k ) |
80 | 2.94k | f32[j] = (float)centroids[j]; |
81 | 92 | } else { |
82 | 92 | memcpy(u80, centroids, sizeof(double) * 32); |
83 | 92 | } |
84 | 276 | u80 += 32 * element_size; |
85 | 4.53k | for (j = 0; j < nI; j += 84.26k ) |
86 | 4.26k | { |
87 | 4.26k | const uint8_t i0 = (uint8_t)indices[j]; |
88 | 4.26k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 00 ; |
89 | 4.26k | const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 00 ; |
90 | 4.26k | const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 00 ; |
91 | 4.26k | const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 00 ; |
92 | 4.26k | const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 00 ; |
93 | 4.26k | const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 00 ; |
94 | 4.26k | const uint8_t i7 = j + 7 < nI ? (uint8_t)indices[j + 7]4.25k : 06 ; |
95 | 4.26k | u80[0] = (i0 << 3) | (i1 >> 2); |
96 | 4.26k | u80[1] = (i1 << 6) | (i2 << 1) | (i3 >> 4); |
97 | 4.26k | u80[2] = (i3 << 4) | (i4 >> 1); |
98 | 4.26k | u80[3] = (i4 << 7) | (i5 << 2) | (i6 >> 3); |
99 | 4.26k | u80[4] = (i6 << 5) | i7; |
100 | 4.26k | u80 += 5; |
101 | 4.26k | } |
102 | 276 | ccfree(indices); |
103 | 276 | } parallel_endfor |
104 | 12 | return element_size * num_blocks * 32 + (input_length + 7) / 8 * 5; |
105 | 37 | } else if (qbits == 6) { |
106 | 80 | parallel_for13 (i, num_blocks) { |
107 | 80 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
108 | 80 | int* const indices = ccmalloc(sizeof(int) * nI); |
109 | 80 | double centroids[64]; |
110 | 80 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
111 | 80 | ccv_kmeans1d(&a, 64, indices, centroids); |
112 | 80 | uint8_t* u80 = u8 + (64 * element_size + number_in_blocks / 4 * 3) * i; |
113 | 80 | int j; |
114 | 80 | if (datatype == CCV_16F) |
115 | 32 | { |
116 | 32 | float* f32 = (float*)centroids; |
117 | 2.08k | for (j = 0; j < 64; j++2.04k ) |
118 | 2.04k | f32[j] = (float)centroids[j]; |
119 | 32 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 64); |
120 | 48 | } else if (datatype == CCV_16BF) { |
121 | 0 | float* f32 = (float*)centroids; |
122 | 0 | for (j = 0; j < 64; j++) |
123 | 0 | f32[j] = (float)centroids[j]; |
124 | 0 | ccv_float_to_bfloat(f32, (uint16_t*)u80, 64); |
125 | 48 | } else if (datatype == CCV_32F) { |
126 | 24 | float* f32 = (float*)u80; |
127 | 1.56k | for (j = 0; j < 64; j++1.53k ) |
128 | 1.53k | f32[j] = (float)centroids[j]; |
129 | 24 | } else { |
130 | 24 | memcpy(u80, centroids, sizeof(double) * 64); |
131 | 24 | } |
132 | 80 | u80 += 64 * element_size; |
133 | 13.4k | for (j = 0; j < nI; j += 413.3k ) |
134 | 13.3k | { |
135 | 13.3k | const uint8_t i0 = (uint8_t)indices[j]; |
136 | 13.3k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 00 ; |
137 | 13.3k | const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 00 ; |
138 | 13.3k | const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3]13.3k : 06 ; |
139 | 13.3k | u80[0] = (i0 << 2) | (i1 >> 4); |
140 | 13.3k | u80[1] = (i1 << 4) | (i2 >> 2); |
141 | 13.3k | u80[2] = (i2 << 6) | i3; |
142 | 13.3k | u80 += 3; |
143 | 13.3k | } |
144 | 80 | ccfree(indices); |
145 | 80 | } parallel_endfor |
146 | 13 | return element_size * num_blocks * 64 + (input_length + 3) / 4 * 3; |
147 | 24 | } else if (qbits == 7) { |
148 | 72 | parallel_for12 (i, num_blocks) { |
149 | 72 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
150 | 72 | int* const indices = ccmalloc(sizeof(int) * nI); |
151 | 72 | double centroids[128]; |
152 | 72 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
153 | 72 | ccv_kmeans1d(&a, 128, indices, centroids); |
154 | 72 | uint8_t* u80 = u8 + (128 * element_size + number_in_blocks / 8 * 7) * i; |
155 | 72 | int j; |
156 | 72 | if (datatype == CCV_16F) |
157 | 24 | { |
158 | 24 | float* f32 = (float*)centroids; |
159 | 3.09k | for (j = 0; j < 128; j++3.07k ) |
160 | 3.07k | f32[j] = (float)centroids[j]; |
161 | 24 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 128); |
162 | 48 | } else if (datatype == CCV_16BF) { |
163 | 0 | float* f32 = (float*)centroids; |
164 | 0 | for (j = 0; j < 128; j++) |
165 | 0 | f32[j] = (float)centroids[j]; |
166 | 0 | ccv_float_to_bfloat(f32, (uint16_t*)u80, 128); |
167 | 48 | } else if (datatype == CCV_32F) { |
168 | 24 | float* f32 = (float*)u80; |
169 | 3.09k | for (j = 0; j < 128; j++3.07k ) |
170 | 3.07k | f32[j] = (float)centroids[j]; |
171 | 24 | } else { |
172 | 24 | memcpy(u80, centroids, sizeof(double) * 128); |
173 | 24 | } |
174 | 72 | u80 += 128 * element_size; |
175 | 4.33k | for (j = 0; j < nI; j += 84.26k ) |
176 | 4.26k | { |
177 | 4.26k | const uint8_t i0 = (uint8_t)indices[j]; |
178 | 4.26k | const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 00 ; |
179 | 4.26k | const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 00 ; |
180 | 4.26k | const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 00 ; |
181 | 4.26k | const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 00 ; |
182 | 4.26k | const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 00 ; |
183 | 4.26k | const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 00 ; |
184 | 4.26k | const uint8_t i7 = j + 7 < nI ? (uint8_t)indices[j + 7]4.25k : 06 ; |
185 | 4.26k | u80[0] = (i0 << 1) | (i1 >> 6); |
186 | 4.26k | u80[1] = (i1 << 2) | (i2 >> 5); |
187 | 4.26k | u80[2] = (i2 << 3) | (i3 >> 4); |
188 | 4.26k | u80[3] = (i3 << 4) | (i4 >> 3); |
189 | 4.26k | u80[4] = (i4 << 5) | (i5 >> 2); |
190 | 4.26k | u80[5] = (i5 << 6) | (i6 >> 1); |
191 | 4.26k | u80[6] = (i6 << 7) | i7; |
192 | 4.26k | u80 += 7; |
193 | 4.26k | } |
194 | 72 | ccfree(indices); |
195 | 72 | } parallel_endfor |
196 | 12 | return element_size * num_blocks * 128 + (input_length + 7) / 8 * 7; |
197 | 12 | } else { |
198 | 35 | parallel_for12 (i, num_blocks) { |
199 | 35 | const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); |
200 | 35 | int* const indices = ccmalloc(sizeof(int) * nI); |
201 | 35 | double centroids[256]; |
202 | 35 | ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); |
203 | 35 | ccv_kmeans1d(&a, 256, indices, centroids); |
204 | 35 | uint8_t* u80 = u8 + (256 * element_size + number_in_blocks) * i; |
205 | 35 | int j; |
206 | 35 | if (datatype == CCV_16F) |
207 | 12 | { |
208 | 12 | float* f32 = (float*)centroids; |
209 | 3.08k | for (j = 0; j < 256; j++3.07k ) |
210 | 3.07k | f32[j] = (float)centroids[j]; |
211 | 12 | ccv_float_to_half_precision(f32, (uint16_t*)u80, 256); |
212 | 23 | } else if (datatype == CCV_16BF) { |
213 | 0 | float* f32 = (float*)centroids; |
214 | 0 | for (j = 0; j < 256; j++) |
215 | 0 | f32[j] = (float)centroids[j]; |
216 | 0 | ccv_float_to_bfloat(f32, (uint16_t*)u80, 256); |
217 | 23 | } else if (datatype == CCV_32F) { |
218 | 11 | float* f32 = (float*)u80; |
219 | 2.82k | for (j = 0; j < 256; j++2.81k ) |
220 | 2.81k | f32[j] = (float)centroids[j]; |
221 | 12 | } else { |
222 | 12 | memcpy(u80, centroids, sizeof(double) * 256); |
223 | 12 | } |
224 | 35 | u80 += 256 * element_size; |
225 | 39.4k | for (j = 0; j < nI; j++39.4k ) |
226 | 39.4k | { |
227 | 39.4k | *u80 = (uint8_t)indices[j]; |
228 | 39.4k | ++u80; |
229 | 39.4k | } |
230 | 35 | ccfree(indices); |
231 | 35 | } parallel_endfor |
232 | 12 | return element_size * num_blocks * 256 + input_length; |
233 | 12 | } |
234 | 63 | } |
235 | | |
236 | | static void _ccv_nnc_depalettize(const void* input, const int datatype, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) |
237 | 30 | { |
238 | 30 | assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F); |
239 | 30 | const int num_blocks = (output_length + number_in_blocks - 1) / number_in_blocks; |
240 | 30 | const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); |
241 | 30 | uint8_t* const u8 = (uint8_t*)output; |
242 | 30 | const uint8_t* const ui = (const uint8_t*)input; |
243 | 30 | assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); |
244 | 30 | if (datatype == CCV_16F || datatype == CCV_16BF20 ) |
245 | 10 | { |
246 | 10 | if (qbits == 4) |
247 | 2 | { |
248 | 46 | parallel_for2 (i, num_blocks) { |
249 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
250 | 46 | const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; |
251 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
252 | 46 | const uint16_t* const palette = (uint16_t*)ui0; |
253 | 46 | const uint8_t* ui1 = ui0 + element_size * 16; |
254 | 46 | uint16_t* const f16 = (uint16_t*)u80; |
255 | 46 | int j; |
256 | 46 | if (nI % 2 == 0) |
257 | 45 | { |
258 | 2.87k | for (j = 0; j < nI; j += 22.82k ) |
259 | 2.82k | { |
260 | 2.82k | const uint8_t u0 = *ui1; |
261 | 2.82k | const int i0 = (int)(u0 >> 4); |
262 | 2.82k | const int i1 = (int)(u0 & 15); |
263 | 2.82k | f16[j] = palette[i0]; |
264 | 2.82k | f16[j + 1] = palette[i1]; |
265 | 2.82k | ++ui1; |
266 | 2.82k | } |
267 | 45 | } else { |
268 | 13 | for (j = 0; j < nI; j += 212 ) |
269 | 12 | { |
270 | 12 | const uint8_t u0 = *ui1; |
271 | 12 | const int i0 = (int)(u0 >> 4); |
272 | 12 | const int i1 = (int)(u0 & 15); |
273 | 12 | f16[j] = palette[i0]; |
274 | 12 | if (j + 1 < nI) |
275 | 11 | f16[j + 1] = palette[i1]; |
276 | 12 | ++ui1; |
277 | 12 | } |
278 | 1 | } |
279 | 46 | } parallel_endfor |
280 | 8 | } else if (qbits == 5) { |
281 | 46 | parallel_for2 (i, num_blocks) { |
282 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
283 | 46 | const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; |
284 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
285 | 46 | const uint16_t* const palette = (uint16_t*)ui0; |
286 | 46 | const uint8_t* ui1 = ui0 + element_size * 32; |
287 | 46 | uint16_t* const f16 = (uint16_t*)u80; |
288 | 46 | int j; |
289 | 46 | if (nI % 8 == 0) |
290 | 45 | { |
291 | 752 | for (j = 0; j < nI; j += 8707 ) |
292 | 707 | { |
293 | 707 | const uint8_t u0 = ui1[0]; |
294 | 707 | const uint8_t u1 = ui1[1]; |
295 | 707 | const uint8_t u2 = ui1[2]; |
296 | 707 | const uint8_t u3 = ui1[3]; |
297 | 707 | const uint8_t u4 = ui1[4]; |
298 | 707 | const int i0 = (int)(u0 >> 3); |
299 | 707 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
300 | 707 | const int i2 = (int)((u1 >> 1) & 31); |
301 | 707 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
302 | 707 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
303 | 707 | const int i5 = (int)((u3 >> 2) & 31); |
304 | 707 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
305 | 707 | const int i7 = (int)(u4 & 31); |
306 | 707 | f16[j] = palette[i0]; |
307 | 707 | f16[j + 1] = palette[i1]; |
308 | 707 | f16[j + 2] = palette[i2]; |
309 | 707 | f16[j + 3] = palette[i3]; |
310 | 707 | f16[j + 4] = palette[i4]; |
311 | 707 | f16[j + 5] = palette[i5]; |
312 | 707 | f16[j + 6] = palette[i6]; |
313 | 707 | f16[j + 7] = palette[i7]; |
314 | 707 | ui1 += 5; |
315 | 707 | } |
316 | 45 | } else { |
317 | 4 | for (j = 0; j < nI; j += 83 ) |
318 | 3 | { |
319 | 3 | const uint8_t u0 = ui1[0]; |
320 | 3 | const uint8_t u1 = ui1[1]; |
321 | 3 | const uint8_t u2 = ui1[2]; |
322 | 3 | const uint8_t u3 = ui1[3]; |
323 | 3 | const uint8_t u4 = ui1[4]; |
324 | 3 | const int i0 = (int)(u0 >> 3); |
325 | 3 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
326 | 3 | const int i2 = (int)((u1 >> 1) & 31); |
327 | 3 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
328 | 3 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
329 | 3 | const int i5 = (int)((u3 >> 2) & 31); |
330 | 3 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
331 | 3 | const int i7 = (int)(u4 & 31); |
332 | 3 | f16[j] = palette[i0]; |
333 | 3 | if (j + 1 < nI) |
334 | 3 | f16[j + 1] = palette[i1]; |
335 | 3 | if (j + 2 < nI) |
336 | 3 | f16[j + 2] = palette[i2]; |
337 | 3 | if (j + 3 < nI) |
338 | 3 | f16[j + 3] = palette[i3]; |
339 | 3 | if (j + 4 < nI) |
340 | 3 | f16[j + 4] = palette[i4]; |
341 | 3 | if (j + 5 < nI) |
342 | 3 | f16[j + 5] = palette[i5]; |
343 | 3 | if (j + 6 < nI) |
344 | 3 | f16[j + 6] = palette[i6]; |
345 | 3 | if (j + 7 < nI) |
346 | 2 | f16[j + 7] = palette[i7]; |
347 | 3 | ui1 += 5; |
348 | 3 | } |
349 | 1 | } |
350 | 46 | } parallel_endfor |
351 | 6 | } else if (qbits == 6) { |
352 | 12 | parallel_for2 (i, num_blocks) { |
353 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
354 | 12 | const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; |
355 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
356 | 12 | const uint16_t* const palette = (uint16_t*)ui0; |
357 | 12 | const uint8_t* ui1 = ui0 + element_size * 64; |
358 | 12 | uint16_t* const f16 = (uint16_t*)u80; |
359 | 12 | int j; |
360 | 12 | if (nI % 4 == 0) |
361 | 11 | { |
362 | 1.36k | for (j = 0; j < nI; j += 41.35k ) |
363 | 1.35k | { |
364 | 1.35k | const uint8_t u0 = ui1[0]; |
365 | 1.35k | const uint8_t u1 = ui1[1]; |
366 | 1.35k | const uint8_t u2 = ui1[2]; |
367 | 1.35k | const int i0 = (int)(u0 >> 2); |
368 | 1.35k | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
369 | 1.35k | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
370 | 1.35k | const int i3 = (int)(u2 & 63); |
371 | 1.35k | f16[j] = palette[i0]; |
372 | 1.35k | f16[j + 1] = palette[i1]; |
373 | 1.35k | f16[j + 2] = palette[i2]; |
374 | 1.35k | f16[j + 3] = palette[i3]; |
375 | 1.35k | ui1 += 3; |
376 | 1.35k | } |
377 | 11 | } else { |
378 | 71 | for (j = 0; j < nI; j += 470 ) |
379 | 70 | { |
380 | 70 | const uint8_t u0 = ui1[0]; |
381 | 70 | const uint8_t u1 = ui1[1]; |
382 | 70 | const uint8_t u2 = ui1[2]; |
383 | 70 | const int i0 = (int)(u0 >> 2); |
384 | 70 | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
385 | 70 | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
386 | 70 | const int i3 = (int)(u2 & 63); |
387 | 70 | f16[j] = palette[i0]; |
388 | 70 | if (j + 1 < nI) |
389 | 70 | f16[j + 1] = palette[i1]; |
390 | 70 | if (j + 2 < nI) |
391 | 70 | f16[j + 2] = palette[i2]; |
392 | 70 | if (j + 3 < nI) |
393 | 69 | f16[j + 3] = palette[i3]; |
394 | 70 | ui1 += 3; |
395 | 70 | } |
396 | 1 | } |
397 | 12 | } parallel_endfor |
398 | 4 | } else if (qbits == 7) { |
399 | 12 | parallel_for2 (i, num_blocks) { |
400 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
401 | 12 | const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; |
402 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
403 | 12 | const uint16_t* const palette = (uint16_t*)ui0; |
404 | 12 | const uint8_t* ui1 = ui0 + element_size * 128; |
405 | 12 | uint16_t* const f16 = (uint16_t*)u80; |
406 | 12 | int j; |
407 | 12 | if (nI % 8 == 0) |
408 | 11 | { |
409 | 686 | for (j = 0; j < nI; j += 8675 ) |
410 | 675 | { |
411 | 675 | const uint8_t u0 = ui1[0]; |
412 | 675 | const uint8_t u1 = ui1[1]; |
413 | 675 | const uint8_t u2 = ui1[2]; |
414 | 675 | const uint8_t u3 = ui1[3]; |
415 | 675 | const uint8_t u4 = ui1[4]; |
416 | 675 | const uint8_t u5 = ui1[5]; |
417 | 675 | const uint8_t u6 = ui1[6]; |
418 | 675 | const int i0 = (int)(u0 >> 1); |
419 | 675 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
420 | 675 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
421 | 675 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
422 | 675 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
423 | 675 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
424 | 675 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
425 | 675 | const int i7 = (int)(u6 & 127); |
426 | 675 | f16[j] = palette[i0]; |
427 | 675 | f16[j + 1] = palette[i1]; |
428 | 675 | f16[j + 2] = palette[i2]; |
429 | 675 | f16[j + 3] = palette[i3]; |
430 | 675 | f16[j + 4] = palette[i4]; |
431 | 675 | f16[j + 5] = palette[i5]; |
432 | 675 | f16[j + 6] = palette[i6]; |
433 | 675 | f16[j + 7] = palette[i7]; |
434 | 675 | ui1 += 7; |
435 | 675 | } |
436 | 11 | } else { |
437 | 36 | for (j = 0; j < nI; j += 835 ) |
438 | 35 | { |
439 | 35 | const uint8_t u0 = ui1[0]; |
440 | 35 | const uint8_t u1 = ui1[1]; |
441 | 35 | const uint8_t u2 = ui1[2]; |
442 | 35 | const uint8_t u3 = ui1[3]; |
443 | 35 | const uint8_t u4 = ui1[4]; |
444 | 35 | const uint8_t u5 = ui1[5]; |
445 | 35 | const uint8_t u6 = ui1[6]; |
446 | 35 | const int i0 = (int)(u0 >> 1); |
447 | 35 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
448 | 35 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
449 | 35 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
450 | 35 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
451 | 35 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
452 | 35 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
453 | 35 | const int i7 = (int)(u6 & 127); |
454 | 35 | f16[j] = palette[i0]; |
455 | 35 | if (j + 1 < nI) |
456 | 35 | f16[j + 1] = palette[i1]; |
457 | 35 | if (j + 2 < nI) |
458 | 35 | f16[j + 2] = palette[i2]; |
459 | 35 | if (j + 3 < nI) |
460 | 35 | f16[j + 3] = palette[i3]; |
461 | 35 | if (j + 4 < nI) |
462 | 35 | f16[j + 4] = palette[i4]; |
463 | 35 | if (j + 5 < nI) |
464 | 35 | f16[j + 5] = palette[i5]; |
465 | 35 | if (j + 6 < nI) |
466 | 35 | f16[j + 6] = palette[i6]; |
467 | 35 | if (j + 7 < nI) |
468 | 34 | f16[j + 7] = palette[i7]; |
469 | 35 | ui1 += 7; |
470 | 35 | } |
471 | 1 | } |
472 | 12 | } parallel_endfor |
473 | 2 | } else { |
474 | 6 | parallel_for2 (i, num_blocks) { |
475 | 6 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
476 | 6 | const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; |
477 | 6 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
478 | 6 | const uint16_t* const palette = (uint16_t*)ui0; |
479 | 6 | const uint8_t* ui1 = ui0 + element_size * 256; |
480 | 6 | uint16_t* const f16 = (uint16_t*)u80; |
481 | 6 | int j; |
482 | 5.68k | for (j = 0; j < nI; j++5.67k ) |
483 | 5.67k | { |
484 | 5.67k | const uint8_t u0 = *ui1; |
485 | 5.67k | f16[j] = palette[u0]; |
486 | 5.67k | ++ui1; |
487 | 5.67k | } |
488 | 6 | } parallel_endfor |
489 | 2 | } |
490 | 20 | } else if (datatype == CCV_32F) { |
491 | 10 | if (qbits == 4) |
492 | 2 | { |
493 | 46 | parallel_for2 (i, num_blocks) { |
494 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
495 | 46 | const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; |
496 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
497 | 46 | const float* const palette = (float*)ui0; |
498 | 46 | const uint8_t* ui1 = ui0 + element_size * 16; |
499 | 46 | float* const f32 = (float*)u80; |
500 | 46 | int j; |
501 | 46 | if (nI % 2 == 0) |
502 | 45 | { |
503 | 2.87k | for (j = 0; j < nI; j += 22.82k ) |
504 | 2.82k | { |
505 | 2.82k | const uint8_t u0 = *ui1; |
506 | 2.82k | const int i0 = (int)(u0 >> 4); |
507 | 2.82k | const int i1 = (int)(u0 & 15); |
508 | 2.82k | f32[j] = palette[i0]; |
509 | 2.82k | f32[j + 1] = palette[i1]; |
510 | 2.82k | ++ui1; |
511 | 2.82k | } |
512 | 45 | } else { |
513 | 13 | for (j = 0; j < nI; j += 212 ) |
514 | 12 | { |
515 | 12 | const uint8_t u0 = *ui1; |
516 | 12 | const int i0 = (int)(u0 >> 4); |
517 | 12 | const int i1 = (int)(u0 & 15); |
518 | 12 | f32[j] = palette[i0]; |
519 | 12 | if (j + 1 < nI) |
520 | 11 | f32[j + 1] = palette[i1]; |
521 | 12 | ++ui1; |
522 | 12 | } |
523 | 1 | } |
524 | 46 | } parallel_endfor |
525 | 8 | } else if (qbits == 5) { |
526 | 46 | parallel_for2 (i, num_blocks) { |
527 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
528 | 46 | const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; |
529 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
530 | 46 | const float* const palette = (float*)ui0; |
531 | 46 | const uint8_t* ui1 = ui0 + element_size * 32; |
532 | 46 | float* const f32 = (float*)u80; |
533 | 46 | int j; |
534 | 46 | if (nI % 8 == 0) |
535 | 45 | { |
536 | 752 | for (j = 0; j < nI; j += 8707 ) |
537 | 707 | { |
538 | 707 | const uint8_t u0 = ui1[0]; |
539 | 707 | const uint8_t u1 = ui1[1]; |
540 | 707 | const uint8_t u2 = ui1[2]; |
541 | 707 | const uint8_t u3 = ui1[3]; |
542 | 707 | const uint8_t u4 = ui1[4]; |
543 | 707 | const int i0 = (int)(u0 >> 3); |
544 | 707 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
545 | 707 | const int i2 = (int)((u1 >> 1) & 31); |
546 | 707 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
547 | 707 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
548 | 707 | const int i5 = (int)((u3 >> 2) & 31); |
549 | 707 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
550 | 707 | const int i7 = (int)(u4 & 31); |
551 | 707 | f32[j] = palette[i0]; |
552 | 707 | f32[j + 1] = palette[i1]; |
553 | 707 | f32[j + 2] = palette[i2]; |
554 | 707 | f32[j + 3] = palette[i3]; |
555 | 707 | f32[j + 4] = palette[i4]; |
556 | 707 | f32[j + 5] = palette[i5]; |
557 | 707 | f32[j + 6] = palette[i6]; |
558 | 707 | f32[j + 7] = palette[i7]; |
559 | 707 | ui1 += 5; |
560 | 707 | } |
561 | 45 | } else { |
562 | 4 | for (j = 0; j < nI; j += 83 ) |
563 | 3 | { |
564 | 3 | const uint8_t u0 = ui1[0]; |
565 | 3 | const uint8_t u1 = ui1[1]; |
566 | 3 | const uint8_t u2 = ui1[2]; |
567 | 3 | const uint8_t u3 = ui1[3]; |
568 | 3 | const uint8_t u4 = ui1[4]; |
569 | 3 | const int i0 = (int)(u0 >> 3); |
570 | 3 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
571 | 3 | const int i2 = (int)((u1 >> 1) & 31); |
572 | 3 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
573 | 3 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
574 | 3 | const int i5 = (int)((u3 >> 2) & 31); |
575 | 3 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
576 | 3 | const int i7 = (int)(u4 & 31); |
577 | 3 | f32[j] = palette[i0]; |
578 | 3 | if (j + 1 < nI) |
579 | 3 | f32[j + 1] = palette[i1]; |
580 | 3 | if (j + 2 < nI) |
581 | 3 | f32[j + 2] = palette[i2]; |
582 | 3 | if (j + 3 < nI) |
583 | 3 | f32[j + 3] = palette[i3]; |
584 | 3 | if (j + 4 < nI) |
585 | 3 | f32[j + 4] = palette[i4]; |
586 | 3 | if (j + 5 < nI) |
587 | 3 | f32[j + 5] = palette[i5]; |
588 | 3 | if (j + 6 < nI) |
589 | 3 | f32[j + 6] = palette[i6]; |
590 | 3 | if (j + 7 < nI) |
591 | 2 | f32[j + 7] = palette[i7]; |
592 | 3 | ui1 += 5; |
593 | 3 | } |
594 | 1 | } |
595 | 46 | } parallel_endfor |
596 | 6 | } else if (qbits == 6) { |
597 | 12 | parallel_for2 (i, num_blocks) { |
598 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
599 | 12 | const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; |
600 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
601 | 12 | const float* const palette = (float*)ui0; |
602 | 12 | const uint8_t* ui1 = ui0 + element_size * 64; |
603 | 12 | float* const f32 = (float*)u80; |
604 | 12 | int j; |
605 | 12 | if (nI % 4 == 0) |
606 | 11 | { |
607 | 1.36k | for (j = 0; j < nI; j += 41.35k ) |
608 | 1.35k | { |
609 | 1.35k | const uint8_t u0 = ui1[0]; |
610 | 1.35k | const uint8_t u1 = ui1[1]; |
611 | 1.35k | const uint8_t u2 = ui1[2]; |
612 | 1.35k | const int i0 = (int)(u0 >> 2); |
613 | 1.35k | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
614 | 1.35k | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
615 | 1.35k | const int i3 = (int)(u2 & 63); |
616 | 1.35k | f32[j] = palette[i0]; |
617 | 1.35k | f32[j + 1] = palette[i1]; |
618 | 1.35k | f32[j + 2] = palette[i2]; |
619 | 1.35k | f32[j + 3] = palette[i3]; |
620 | 1.35k | ui1 += 3; |
621 | 1.35k | } |
622 | 11 | } else { |
623 | 71 | for (j = 0; j < nI; j += 470 ) |
624 | 70 | { |
625 | 70 | const uint8_t u0 = ui1[0]; |
626 | 70 | const uint8_t u1 = ui1[1]; |
627 | 70 | const uint8_t u2 = ui1[2]; |
628 | 70 | const int i0 = (int)(u0 >> 2); |
629 | 70 | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
630 | 70 | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
631 | 70 | const int i3 = (int)(u2 & 63); |
632 | 70 | f32[j] = palette[i0]; |
633 | 70 | if (j + 1 < nI) |
634 | 70 | f32[j + 1] = palette[i1]; |
635 | 70 | if (j + 2 < nI) |
636 | 70 | f32[j + 2] = palette[i2]; |
637 | 70 | if (j + 3 < nI) |
638 | 69 | f32[j + 3] = palette[i3]; |
639 | 70 | ui1 += 3; |
640 | 70 | } |
641 | 1 | } |
642 | 12 | } parallel_endfor |
643 | 4 | } else if (qbits == 7) { |
644 | 12 | parallel_for2 (i, num_blocks) { |
645 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
646 | 12 | const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; |
647 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
648 | 12 | const float* const palette = (float*)ui0; |
649 | 12 | const uint8_t* ui1 = ui0 + element_size * 128; |
650 | 12 | float* const f32 = (float*)u80; |
651 | 12 | int j; |
652 | 12 | if (nI % 8 == 0) |
653 | 11 | { |
654 | 686 | for (j = 0; j < nI; j += 8675 ) |
655 | 675 | { |
656 | 675 | const uint8_t u0 = ui1[0]; |
657 | 675 | const uint8_t u1 = ui1[1]; |
658 | 675 | const uint8_t u2 = ui1[2]; |
659 | 675 | const uint8_t u3 = ui1[3]; |
660 | 675 | const uint8_t u4 = ui1[4]; |
661 | 675 | const uint8_t u5 = ui1[5]; |
662 | 675 | const uint8_t u6 = ui1[6]; |
663 | 675 | const int i0 = (int)(u0 >> 1); |
664 | 675 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
665 | 675 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
666 | 675 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
667 | 675 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
668 | 675 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
669 | 675 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
670 | 675 | const int i7 = (int)(u6 & 127); |
671 | 675 | f32[j] = palette[i0]; |
672 | 675 | f32[j + 1] = palette[i1]; |
673 | 675 | f32[j + 2] = palette[i2]; |
674 | 675 | f32[j + 3] = palette[i3]; |
675 | 675 | f32[j + 4] = palette[i4]; |
676 | 675 | f32[j + 5] = palette[i5]; |
677 | 675 | f32[j + 6] = palette[i6]; |
678 | 675 | f32[j + 7] = palette[i7]; |
679 | 675 | ui1 += 7; |
680 | 675 | } |
681 | 11 | } else { |
682 | 36 | for (j = 0; j < nI; j += 835 ) |
683 | 35 | { |
684 | 35 | const uint8_t u0 = ui1[0]; |
685 | 35 | const uint8_t u1 = ui1[1]; |
686 | 35 | const uint8_t u2 = ui1[2]; |
687 | 35 | const uint8_t u3 = ui1[3]; |
688 | 35 | const uint8_t u4 = ui1[4]; |
689 | 35 | const uint8_t u5 = ui1[5]; |
690 | 35 | const uint8_t u6 = ui1[6]; |
691 | 35 | const int i0 = (int)(u0 >> 1); |
692 | 35 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
693 | 35 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
694 | 35 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
695 | 35 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
696 | 35 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
697 | 35 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
698 | 35 | const int i7 = (int)(u6 & 127); |
699 | 35 | f32[j] = palette[i0]; |
700 | 35 | if (j + 1 < nI) |
701 | 35 | f32[j + 1] = palette[i1]; |
702 | 35 | if (j + 2 < nI) |
703 | 35 | f32[j + 2] = palette[i2]; |
704 | 35 | if (j + 3 < nI) |
705 | 35 | f32[j + 3] = palette[i3]; |
706 | 35 | if (j + 4 < nI) |
707 | 35 | f32[j + 4] = palette[i4]; |
708 | 35 | if (j + 5 < nI) |
709 | 35 | f32[j + 5] = palette[i5]; |
710 | 35 | if (j + 6 < nI) |
711 | 35 | f32[j + 6] = palette[i6]; |
712 | 35 | if (j + 7 < nI) |
713 | 34 | f32[j + 7] = palette[i7]; |
714 | 35 | ui1 += 7; |
715 | 35 | } |
716 | 1 | } |
717 | 12 | } parallel_endfor |
718 | 2 | } else { |
719 | 6 | parallel_for2 (i, num_blocks) { |
720 | 6 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
721 | 6 | const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; |
722 | 6 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
723 | 6 | const float* const palette = (float*)ui0; |
724 | 6 | const uint8_t* ui1 = ui0 + element_size * 256; |
725 | 6 | float* const f32 = (float*)u80; |
726 | 6 | int j; |
727 | 5.68k | for (j = 0; j < nI; j++5.67k ) |
728 | 5.67k | { |
729 | 5.67k | const uint8_t u0 = *ui1; |
730 | 5.67k | f32[j] = palette[u0]; |
731 | 5.67k | ++ui1; |
732 | 5.67k | } |
733 | 6 | } parallel_endfor |
734 | 2 | } |
735 | 10 | } else { |
736 | 10 | if (qbits == 4) |
737 | 2 | { |
738 | 46 | parallel_for2 (i, num_blocks) { |
739 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
740 | 46 | const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; |
741 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
742 | 46 | const double* const palette = (double*)ui0; |
743 | 46 | const uint8_t* ui1 = ui0 + element_size * 16; |
744 | 46 | double* const f64 = (double*)u80; |
745 | 46 | int j; |
746 | 46 | if (nI % 2 == 0) |
747 | 45 | { |
748 | 2.87k | for (j = 0; j < nI; j += 22.82k ) |
749 | 2.82k | { |
750 | 2.82k | const uint8_t u0 = *ui1; |
751 | 2.82k | const int i0 = (int)(u0 >> 4); |
752 | 2.82k | const int i1 = (int)(u0 & 15); |
753 | 2.82k | f64[j] = palette[i0]; |
754 | 2.82k | f64[j + 1] = palette[i1]; |
755 | 2.82k | ++ui1; |
756 | 2.82k | } |
757 | 45 | } else { |
758 | 13 | for (j = 0; j < nI; j += 212 ) |
759 | 12 | { |
760 | 12 | const uint8_t u0 = *ui1; |
761 | 12 | const int i0 = (int)(u0 >> 4); |
762 | 12 | const int i1 = (int)(u0 & 15); |
763 | 12 | f64[j] = palette[i0]; |
764 | 12 | if (j + 1 < nI) |
765 | 11 | f64[j + 1] = palette[i1]; |
766 | 12 | ++ui1; |
767 | 12 | } |
768 | 1 | } |
769 | 46 | } parallel_endfor |
770 | 8 | } else if (qbits == 5) { |
771 | 46 | parallel_for2 (i, num_blocks) { |
772 | 46 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
773 | 46 | const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; |
774 | 46 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
775 | 46 | const double* const palette = (double*)ui0; |
776 | 46 | const uint8_t* ui1 = ui0 + element_size * 32; |
777 | 46 | double* const f64 = (double*)u80; |
778 | 46 | int j; |
779 | 46 | if (nI % 8 == 0) |
780 | 45 | { |
781 | 752 | for (j = 0; j < nI; j += 8707 ) |
782 | 707 | { |
783 | 707 | const uint8_t u0 = ui1[0]; |
784 | 707 | const uint8_t u1 = ui1[1]; |
785 | 707 | const uint8_t u2 = ui1[2]; |
786 | 707 | const uint8_t u3 = ui1[3]; |
787 | 707 | const uint8_t u4 = ui1[4]; |
788 | 707 | const int i0 = (int)(u0 >> 3); |
789 | 707 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
790 | 707 | const int i2 = (int)((u1 >> 1) & 31); |
791 | 707 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
792 | 707 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
793 | 707 | const int i5 = (int)((u3 >> 2) & 31); |
794 | 707 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
795 | 707 | const int i7 = (int)(u4 & 31); |
796 | 707 | f64[j] = palette[i0]; |
797 | 707 | f64[j + 1] = palette[i1]; |
798 | 707 | f64[j + 2] = palette[i2]; |
799 | 707 | f64[j + 3] = palette[i3]; |
800 | 707 | f64[j + 4] = palette[i4]; |
801 | 707 | f64[j + 5] = palette[i5]; |
802 | 707 | f64[j + 6] = palette[i6]; |
803 | 707 | f64[j + 7] = palette[i7]; |
804 | 707 | ui1 += 5; |
805 | 707 | } |
806 | 45 | } else { |
807 | 4 | for (j = 0; j < nI; j += 83 ) |
808 | 3 | { |
809 | 3 | const uint8_t u0 = ui1[0]; |
810 | 3 | const uint8_t u1 = ui1[1]; |
811 | 3 | const uint8_t u2 = ui1[2]; |
812 | 3 | const uint8_t u3 = ui1[3]; |
813 | 3 | const uint8_t u4 = ui1[4]; |
814 | 3 | const int i0 = (int)(u0 >> 3); |
815 | 3 | const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); |
816 | 3 | const int i2 = (int)((u1 >> 1) & 31); |
817 | 3 | const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); |
818 | 3 | const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); |
819 | 3 | const int i5 = (int)((u3 >> 2) & 31); |
820 | 3 | const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); |
821 | 3 | const int i7 = (int)(u4 & 31); |
822 | 3 | f64[j] = palette[i0]; |
823 | 3 | if (j + 1 < nI) |
824 | 3 | f64[j + 1] = palette[i1]; |
825 | 3 | if (j + 2 < nI) |
826 | 3 | f64[j + 2] = palette[i2]; |
827 | 3 | if (j + 3 < nI) |
828 | 3 | f64[j + 3] = palette[i3]; |
829 | 3 | if (j + 4 < nI) |
830 | 3 | f64[j + 4] = palette[i4]; |
831 | 3 | if (j + 5 < nI) |
832 | 3 | f64[j + 5] = palette[i5]; |
833 | 3 | if (j + 6 < nI) |
834 | 3 | f64[j + 6] = palette[i6]; |
835 | 3 | if (j + 7 < nI) |
836 | 2 | f64[j + 7] = palette[i7]; |
837 | 3 | ui1 += 5; |
838 | 3 | } |
839 | 1 | } |
840 | 46 | } parallel_endfor |
841 | 6 | } else if (qbits == 6) { |
842 | 12 | parallel_for2 (i, num_blocks) { |
843 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
844 | 12 | const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; |
845 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
846 | 12 | const double* const palette = (double*)ui0; |
847 | 12 | const uint8_t* ui1 = ui0 + element_size * 64; |
848 | 12 | double* const f64 = (double*)u80; |
849 | 12 | int j; |
850 | 12 | if (nI % 4 == 0) |
851 | 11 | { |
852 | 1.36k | for (j = 0; j < nI; j += 41.35k ) |
853 | 1.35k | { |
854 | 1.35k | const uint8_t u0 = ui1[0]; |
855 | 1.35k | const uint8_t u1 = ui1[1]; |
856 | 1.35k | const uint8_t u2 = ui1[2]; |
857 | 1.35k | const int i0 = (int)(u0 >> 2); |
858 | 1.35k | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
859 | 1.35k | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
860 | 1.35k | const int i3 = (int)(u2 & 63); |
861 | 1.35k | f64[j] = palette[i0]; |
862 | 1.35k | f64[j + 1] = palette[i1]; |
863 | 1.35k | f64[j + 2] = palette[i2]; |
864 | 1.35k | f64[j + 3] = palette[i3]; |
865 | 1.35k | ui1 += 3; |
866 | 1.35k | } |
867 | 11 | } else { |
868 | 71 | for (j = 0; j < nI; j += 470 ) |
869 | 70 | { |
870 | 70 | const uint8_t u0 = ui1[0]; |
871 | 70 | const uint8_t u1 = ui1[1]; |
872 | 70 | const uint8_t u2 = ui1[2]; |
873 | 70 | const int i0 = (int)(u0 >> 2); |
874 | 70 | const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); |
875 | 70 | const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); |
876 | 70 | const int i3 = (int)(u2 & 63); |
877 | 70 | f64[j] = palette[i0]; |
878 | 70 | if (j + 1 < nI) |
879 | 70 | f64[j + 1] = palette[i1]; |
880 | 70 | if (j + 2 < nI) |
881 | 70 | f64[j + 2] = palette[i2]; |
882 | 70 | if (j + 3 < nI) |
883 | 69 | f64[j + 3] = palette[i3]; |
884 | 70 | ui1 += 3; |
885 | 70 | } |
886 | 1 | } |
887 | 12 | } parallel_endfor |
888 | 4 | } else if (qbits == 7) { |
889 | 12 | parallel_for2 (i, num_blocks) { |
890 | 12 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
891 | 12 | const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; |
892 | 12 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
893 | 12 | const double* const palette = (double*)ui0; |
894 | 12 | const uint8_t* ui1 = ui0 + element_size * 128; |
895 | 12 | double* const f64 = (double*)u80; |
896 | 12 | int j; |
897 | 12 | if (nI % 8 == 0) |
898 | 11 | { |
899 | 686 | for (j = 0; j < nI; j += 8675 ) |
900 | 675 | { |
901 | 675 | const uint8_t u0 = ui1[0]; |
902 | 675 | const uint8_t u1 = ui1[1]; |
903 | 675 | const uint8_t u2 = ui1[2]; |
904 | 675 | const uint8_t u3 = ui1[3]; |
905 | 675 | const uint8_t u4 = ui1[4]; |
906 | 675 | const uint8_t u5 = ui1[5]; |
907 | 675 | const uint8_t u6 = ui1[6]; |
908 | 675 | const int i0 = (int)(u0 >> 1); |
909 | 675 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
910 | 675 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
911 | 675 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
912 | 675 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
913 | 675 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
914 | 675 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
915 | 675 | const int i7 = (int)(u6 & 127); |
916 | 675 | f64[j] = palette[i0]; |
917 | 675 | f64[j + 1] = palette[i1]; |
918 | 675 | f64[j + 2] = palette[i2]; |
919 | 675 | f64[j + 3] = palette[i3]; |
920 | 675 | f64[j + 4] = palette[i4]; |
921 | 675 | f64[j + 5] = palette[i5]; |
922 | 675 | f64[j + 6] = palette[i6]; |
923 | 675 | f64[j + 7] = palette[i7]; |
924 | 675 | ui1 += 7; |
925 | 675 | } |
926 | 11 | } else { |
927 | 36 | for (j = 0; j < nI; j += 835 ) |
928 | 35 | { |
929 | 35 | const uint8_t u0 = ui1[0]; |
930 | 35 | const uint8_t u1 = ui1[1]; |
931 | 35 | const uint8_t u2 = ui1[2]; |
932 | 35 | const uint8_t u3 = ui1[3]; |
933 | 35 | const uint8_t u4 = ui1[4]; |
934 | 35 | const uint8_t u5 = ui1[5]; |
935 | 35 | const uint8_t u6 = ui1[6]; |
936 | 35 | const int i0 = (int)(u0 >> 1); |
937 | 35 | const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); |
938 | 35 | const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); |
939 | 35 | const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); |
940 | 35 | const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); |
941 | 35 | const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); |
942 | 35 | const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); |
943 | 35 | const int i7 = (int)(u6 & 127); |
944 | 35 | f64[j] = palette[i0]; |
945 | 35 | if (j + 1 < nI) |
946 | 35 | f64[j + 1] = palette[i1]; |
947 | 35 | if (j + 2 < nI) |
948 | 35 | f64[j + 2] = palette[i2]; |
949 | 35 | if (j + 3 < nI) |
950 | 35 | f64[j + 3] = palette[i3]; |
951 | 35 | if (j + 4 < nI) |
952 | 35 | f64[j + 4] = palette[i4]; |
953 | 35 | if (j + 5 < nI) |
954 | 35 | f64[j + 5] = palette[i5]; |
955 | 35 | if (j + 6 < nI) |
956 | 35 | f64[j + 6] = palette[i6]; |
957 | 35 | if (j + 7 < nI) |
958 | 34 | f64[j + 7] = palette[i7]; |
959 | 35 | ui1 += 7; |
960 | 35 | } |
961 | 1 | } |
962 | 12 | } parallel_endfor |
963 | 2 | } else { |
964 | 6 | parallel_for2 (i, num_blocks) { |
965 | 6 | const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); |
966 | 6 | const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; |
967 | 6 | uint8_t* const u80 = u8 + element_size * number_in_blocks * i; |
968 | 6 | const double* const palette = (double*)ui0; |
969 | 6 | const uint8_t* ui1 = ui0 + element_size * 256; |
970 | 6 | double* const f64 = (double*)u80; |
971 | 6 | int j; |
972 | 5.68k | for (j = 0; j < nI; j++5.67k ) |
973 | 5.67k | { |
974 | 5.67k | const uint8_t u0 = *ui1; |
975 | 5.67k | f64[j] = palette[u0]; |
976 | 5.67k | ++ui1; |
977 | 5.67k | } |
978 | 6 | } parallel_endfor |
979 | 2 | } |
980 | 10 | } |
981 | 30 | } |
982 | | |
983 | | void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) |
984 | 60 | { |
985 | 60 | assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY); |
986 | 60 | if (memory_type == CCV_TENSOR_CPU_MEMORY) |
987 | 30 | _ccv_nnc_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length); |
988 | 30 | else { |
989 | 30 | #ifdef HAVE_CUDA |
990 | 30 | ccv_nnc_compat_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0); |
991 | | #elif defined(HAVE_MPS) |
992 | | ccv_nnc_mps_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0); |
993 | | #else |
994 | | assert(memory_type == CCV_TENSOR_CPU_MEMORY); |
995 | | #endif |
996 | 30 | } |
997 | 60 | } |
998 | | |
999 | | CCV_WARN_UNUSED(size_t) ccv_nnc_quantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, void* output, const size_t output_length) |
1000 | 5 | { |
1001 | 5 | assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F); |
1002 | 5 | assert(memory_type == CCV_TENSOR_CPU_MEMORY); |
1003 | 5 | assert(row_length > 0); |
1004 | 5 | assert(input_length % row_length == 0); |
1005 | 5 | const size_t row_count = input_length / row_length; |
1006 | 5 | const size_t scale_offset = (input_length + 127) & -128; |
1007 | 5 | const size_t scale_size = row_count * CCV_GET_DATA_TYPE_SIZE(datatype); |
1008 | 5 | assert(output_length >= scale_offset + scale_size); |
1009 | 5 | int8_t* const q = (int8_t*)output; |
1010 | 5 | uint8_t* const u8 = (uint8_t*)output; |
1011 | 5 | if (datatype == CCV_16F) |
1012 | 0 | { |
1013 | 0 | const uint16_t* const f16 = (const uint16_t*)input; |
1014 | 0 | uint16_t* const scales = (uint16_t*)(u8 + scale_offset); |
1015 | 0 | parallel_for(i, (int)row_count) { |
1016 | 0 | const size_t row_start = (size_t)i * row_length; |
1017 | 0 | double max_abs = 0; |
1018 | 0 | size_t j; |
1019 | 0 | for (j = 0; j < row_length; j++) |
1020 | 0 | { |
1021 | 0 | float v; |
1022 | 0 | ccv_half_precision_to_float(f16 + row_start + j, &v, 1); |
1023 | 0 | max_abs = ccv_max(max_abs, fabs(v)); |
1024 | 0 | } |
1025 | 0 | const float scale_f = (float)(max_abs / 127.); |
1026 | 0 | ccv_float_to_half_precision(&scale_f, scales + i, 1); |
1027 | 0 | if (scale_f == 0) |
1028 | 0 | memset(q + row_start, 0, row_length); |
1029 | 0 | else { |
1030 | 0 | const double inv_scale = 1. / scale_f; |
1031 | 0 | for (j = 0; j < row_length; j++) |
1032 | 0 | { |
1033 | 0 | float v; |
1034 | 0 | ccv_half_precision_to_float(f16 + row_start + j, &v, 1); |
1035 | 0 | const int iv = (int)lrint(v * inv_scale); |
1036 | 0 | q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127); |
1037 | 0 | } |
1038 | 0 | } |
1039 | 0 | } parallel_endfor |
1040 | 5 | } else if (datatype == CCV_16BF) { |
1041 | 1 | const uint16_t* const bf16 = (const uint16_t*)input; |
1042 | 1 | uint16_t* const scales = (uint16_t*)(u8 + scale_offset); |
1043 | 4 | parallel_for1 (i, (int)row_count) { |
1044 | 4 | const size_t row_start = (size_t)i * row_length; |
1045 | 4 | double max_abs = 0; |
1046 | 4 | size_t j; |
1047 | 36 | for (j = 0; j < row_length; j++32 ) |
1048 | 32 | { |
1049 | 32 | float v; |
1050 | 32 | ccv_bfloat_to_float(bf16 + row_start + j, &v, 1); |
1051 | 32 | max_abs = ccv_max(max_abs, fabs(v)); |
1052 | 32 | } |
1053 | 4 | const float scale_f = (float)(max_abs / 127.); |
1054 | 4 | ccv_float_to_bfloat(&scale_f, scales + i, 1); |
1055 | 4 | if (scale_f == 0) |
1056 | 0 | memset(q + row_start, 0, row_length); |
1057 | 4 | else { |
1058 | 4 | const double inv_scale = 1. / scale_f; |
1059 | 36 | for (j = 0; j < row_length; j++32 ) |
1060 | 32 | { |
1061 | 32 | float v; |
1062 | 32 | ccv_bfloat_to_float(bf16 + row_start + j, &v, 1); |
1063 | 32 | const int iv = (int)lrint(v * inv_scale); |
1064 | 32 | q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127); |
1065 | 32 | } |
1066 | 4 | } |
1067 | 4 | } parallel_endfor |
1068 | 4 | } else if (datatype == CCV_32F) { |
1069 | 4 | const float* const f32 = (const float*)input; |
1070 | 4 | float* const scales = (float*)(u8 + scale_offset); |
1071 | 12 | parallel_for4 (i, (int)row_count) { |
1072 | 12 | const size_t row_start = (size_t)i * row_length; |
1073 | 12 | double max_abs = 0; |
1074 | 12 | size_t j; |
1075 | 88 | for (j = 0; j < row_length; j++76 ) |
1076 | 76 | max_abs = ccv_max(max_abs, fabs(f32[row_start + j])); |
1077 | 12 | scales[i] = (float)(max_abs / 127.); |
1078 | 12 | if (scales[i] == 0) |
1079 | 0 | memset(q + row_start, 0, row_length); |
1080 | 12 | else { |
1081 | 12 | const double inv_scale = 1. / scales[i]; |
1082 | 88 | for (j = 0; j < row_length; j++76 ) |
1083 | 76 | { |
1084 | 76 | const int iv = (int)lrint(f32[row_start + j] * inv_scale); |
1085 | 76 | q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127); |
1086 | 76 | } |
1087 | 12 | } |
1088 | 12 | } parallel_endfor |
1089 | 4 | } else { |
1090 | 0 | assert(datatype == CCV_64F); |
1091 | 0 | const double* const f64 = (const double*)input; |
1092 | 0 | double* const scales = (double*)(u8 + scale_offset); |
1093 | 0 | parallel_for(i, (int)row_count) { |
1094 | 0 | const size_t row_start = (size_t)i * row_length; |
1095 | 0 | double max_abs = 0; |
1096 | 0 | size_t j; |
1097 | 0 | for (j = 0; j < row_length; j++) |
1098 | 0 | max_abs = ccv_max(max_abs, fabs(f64[row_start + j])); |
1099 | 0 | scales[i] = max_abs / 127.; |
1100 | 0 | if (scales[i] == 0) |
1101 | 0 | memset(q + row_start, 0, row_length); |
1102 | 0 | else { |
1103 | 0 | const double inv_scale = 1. / scales[i]; |
1104 | 0 | for (j = 0; j < row_length; j++) |
1105 | 0 | { |
1106 | 0 | const int iv = (int)lrint(f64[row_start + j] * inv_scale); |
1107 | 0 | q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127); |
1108 | 0 | } |
1109 | 0 | } |
1110 | 0 | } parallel_endfor |
1111 | 0 | } |
1112 | 5 | return scale_offset + scale_size; |
1113 | 5 | } |
1114 | | |
1115 | | void ccv_nnc_dequantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, void* output, const size_t output_length) |
1116 | 5 | { |
1117 | 5 | assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F); |
1118 | 5 | assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY); |
1119 | 5 | assert(row_length > 0); |
1120 | 5 | assert(output_length % row_length == 0); |
1121 | 5 | if (memory_type != CCV_TENSOR_CPU_MEMORY) |
1122 | 1 | { |
1123 | 1 | #ifdef HAVE_CUDA |
1124 | 1 | ccv_nnc_compat_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0); |
1125 | | #elif defined(HAVE_MPS) |
1126 | | assert(datatype != CCV_64F); |
1127 | | ccv_nnc_mps_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0); |
1128 | | #else |
1129 | | assert(memory_type == CCV_TENSOR_CPU_MEMORY); |
1130 | | #endif |
1131 | 1 | return; |
1132 | 1 | } |
1133 | 4 | const size_t row_count = output_length / row_length; |
1134 | 4 | const size_t scale_offset = (output_length + 127) & -128; |
1135 | 4 | const size_t scale_size = row_count * CCV_GET_DATA_TYPE_SIZE(datatype); |
1136 | 4 | assert(input_length >= scale_offset + scale_size); |
1137 | 4 | const int8_t* const q = (const int8_t*)input; |
1138 | 4 | const uint8_t* const u8 = (const uint8_t*)input; |
1139 | 4 | if (datatype == CCV_16F) |
1140 | 0 | { |
1141 | 0 | uint16_t* const f16 = (uint16_t*)output; |
1142 | 0 | const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset); |
1143 | 0 | parallel_for(i, (int)row_count) { |
1144 | 0 | const size_t row_start = (size_t)i * row_length; |
1145 | 0 | float scale_f; |
1146 | 0 | ccv_half_precision_to_float(scales + i, &scale_f, 1); |
1147 | 0 | size_t j; |
1148 | 0 | for (j = 0; j < row_length; j++) |
1149 | 0 | { |
1150 | 0 | const float v = q[row_start + j] * scale_f; |
1151 | 0 | ccv_float_to_half_precision(&v, f16 + row_start + j, 1); |
1152 | 0 | } |
1153 | 0 | } parallel_endfor |
1154 | 4 | } else if (datatype == CCV_16BF) { |
1155 | 1 | uint16_t* const bf16 = (uint16_t*)output; |
1156 | 1 | const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset); |
1157 | 4 | parallel_for1 (i, (int)row_count) { |
1158 | 4 | const size_t row_start = (size_t)i * row_length; |
1159 | 4 | float scale_f; |
1160 | 4 | ccv_bfloat_to_float(scales + i, &scale_f, 1); |
1161 | 4 | size_t j; |
1162 | 36 | for (j = 0; j < row_length; j++32 ) |
1163 | 32 | { |
1164 | 32 | const float v = q[row_start + j] * scale_f; |
1165 | 32 | ccv_float_to_bfloat(&v, bf16 + row_start + j, 1); |
1166 | 32 | } |
1167 | 4 | } parallel_endfor |
1168 | 3 | } else if (datatype == CCV_32F) { |
1169 | 3 | float* const f32 = (float*)output; |
1170 | 3 | const float* const scales = (const float*)(u8 + scale_offset); |
1171 | 8 | parallel_for3 (i, (int)row_count) { |
1172 | 8 | const size_t row_start = (size_t)i * row_length; |
1173 | 8 | const float scale = scales[i]; |
1174 | 8 | size_t j; |
1175 | 52 | for (j = 0; j < row_length; j++44 ) |
1176 | 44 | f32[row_start + j] = q[row_start + j] * scale; |
1177 | 8 | } parallel_endfor |
1178 | 3 | } else { |
1179 | 0 | assert(datatype == CCV_64F); |
1180 | 0 | double* const f64 = (double*)output; |
1181 | 0 | const double* const scales = (const double*)(u8 + scale_offset); |
1182 | 0 | parallel_for(i, (int)row_count) { |
1183 | 0 | const size_t row_start = (size_t)i * row_length; |
1184 | 0 | const double scale = scales[i]; |
1185 | 0 | size_t j; |
1186 | 0 | for (j = 0; j < row_length; j++) |
1187 | 0 | f64[row_start + j] = q[row_start + j] * scale; |
1188 | 0 | } parallel_endfor |
1189 | 0 | } |
1190 | 4 | } |