Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_palettize.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#ifdef HAVE_CUDA
4
#include "gpu/ccv_nnc_compat.h"
5
#elif defined(HAVE_MPS)
6
#include "mps/ccv_nnc_mps.h"
7
#endif
8
9
size_t ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length)
10
63
{
11
63
  assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F);
12
63
  assert(memory_type == CCV_TENSOR_CPU_MEMORY);
13
63
  const int num_blocks = (input_length + number_in_blocks - 1) / number_in_blocks;
14
63
  const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype);
15
63
  uint8_t* const u8 = (uint8_t*)output;
16
63
  uint8_t* const ui = (uint8_t*)input;
17
63
  assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8);
18
63
  if (qbits == 4)
19
14
  {
20
278
    
parallel_for14
(i, num_blocks) {
21
278
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
22
278
      int* const indices = ccmalloc(sizeof(int) * nI);
23
278
      double centroids[16];
24
278
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
25
278
      ccv_kmeans1d(&a, 16, indices, centroids);
26
278
      uint8_t* u80 = u8 + (16 * element_size + number_in_blocks / 2) * i;
27
278
      int j;
28
278
      if (datatype == CCV_16F)
29
92
      {
30
92
        float* f32 = (float*)centroids;
31
1.56k
        for (j = 0; j < 16; 
j++1.47k
)
32
1.47k
          f32[j] = (float)centroids[j];
33
92
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 16);
34
186
      } else if (datatype == CCV_32F) {
35
94
        float* f32 = (float*)u80;
36
1.59k
        for (j = 0; j < 16; 
j++1.50k
)
37
1.50k
          f32[j] = (float)centroids[j];
38
94
      } else {
39
92
        memcpy(u80, centroids, sizeof(double) * 16);
40
92
      }
41
278
      u80 += 16 * element_size;
42
17.3k
      for (j = 0; j < nI; 
j += 217.0k
)
43
17.0k
      {
44
17.0k
        const uint8_t i0 = (uint8_t)indices[j];
45
17.0k
        const uint8_t i1 = j + 1 < nI ? 
(uint8_t)indices[j + 1]17.0k
:
06
;
46
17.0k
        *u80 = (i0 << 4) | i1;
47
17.0k
        ++u80;
48
17.0k
      }
49
278
      ccfree(indices);
50
278
    } parallel_endfor
51
14
    return element_size * num_blocks * 16 + (input_length + 1) / 2;
52
49
  } else if (qbits == 5) {
53
276
    
parallel_for12
(i, num_blocks) {
54
276
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
55
276
      int* const indices = ccmalloc(sizeof(int) * nI);
56
276
      double centroids[32];
57
276
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
58
276
      ccv_kmeans1d(&a, 32, indices, centroids);
59
276
      uint8_t* u80 = u8 + (32 * element_size + number_in_blocks / 8 * 5) * i;
60
276
      int j;
61
276
      if (datatype == CCV_16F)
62
92
      {
63
92
        float* f32 = (float*)centroids;
64
3.03k
        for (j = 0; j < 32; 
j++2.94k
)
65
2.94k
          f32[j] = (float)centroids[j];
66
92
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 32);
67
184
      } else if (datatype == CCV_32F) {
68
92
        float* f32 = (float*)u80;
69
3.03k
        for (j = 0; j < 32; 
j++2.94k
)
70
2.94k
          f32[j] = (float)centroids[j];
71
92
      } else {
72
92
        memcpy(u80, centroids, sizeof(double) * 32);
73
92
      }
74
276
      u80 += 32 * element_size;
75
4.53k
      for (j = 0; j < nI; 
j += 84.26k
)
76
4.26k
      {
77
4.26k
        const uint8_t i0 = (uint8_t)indices[j];
78
4.26k
        const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 
00
;
79
4.26k
        const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 
00
;
80
4.26k
        const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 
00
;
81
4.26k
        const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 
00
;
82
4.26k
        const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 
00
;
83
4.26k
        const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 
00
;
84
4.26k
        const uint8_t i7 = j + 7 < nI ? 
(uint8_t)indices[j + 7]4.25k
:
06
;
85
4.26k
        u80[0] = (i0 << 3) | (i1 >> 2);
86
4.26k
        u80[1] = (i1 << 6) | (i2 << 1) | (i3 >> 4);
87
4.26k
        u80[2] = (i3 << 4) | (i4 >> 1);
88
4.26k
        u80[3] = (i4 << 7) | (i5 << 2) | (i6 >> 3);
89
4.26k
        u80[4] = (i6 << 5) | i7;
90
4.26k
        u80 += 5;
91
4.26k
      }
92
276
      ccfree(indices);
93
276
    } parallel_endfor
94
12
    return element_size * num_blocks * 32 + (input_length + 7) / 8 * 5;
95
37
  } else if (qbits == 6) {
96
80
    
parallel_for13
(i, num_blocks) {
97
80
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
98
80
      int* const indices = ccmalloc(sizeof(int) * nI);
99
80
      double centroids[64];
100
80
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
101
80
      ccv_kmeans1d(&a, 64, indices, centroids);
102
80
      uint8_t* u80 = u8 + (64 * element_size + number_in_blocks / 4 * 3) * i;
103
80
      int j;
104
80
      if (datatype == CCV_16F)
105
32
      {
106
32
        float* f32 = (float*)centroids;
107
2.08k
        for (j = 0; j < 64; 
j++2.04k
)
108
2.04k
          f32[j] = (float)centroids[j];
109
32
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 64);
110
48
      } else if (datatype == CCV_32F) {
111
24
        float* f32 = (float*)u80;
112
1.56k
        for (j = 0; j < 64; 
j++1.53k
)
113
1.53k
          f32[j] = (float)centroids[j];
114
24
      } else {
115
24
        memcpy(u80, centroids, sizeof(double) * 64);
116
24
      }
117
80
      u80 += 64 * element_size;
118
13.4k
      for (j = 0; j < nI; 
j += 413.3k
)
119
13.3k
      {
120
13.3k
        const uint8_t i0 = (uint8_t)indices[j];
121
13.3k
        const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 
00
;
122
13.3k
        const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 
00
;
123
13.3k
        const uint8_t i3 = j + 3 < nI ? 
(uint8_t)indices[j + 3]13.3k
:
06
;
124
13.3k
        u80[0] = (i0 << 2) | (i1 >> 4);
125
13.3k
        u80[1] = (i1 << 4) | (i2 >> 2);
126
13.3k
        u80[2] = (i2 << 6) | i3;
127
13.3k
        u80 += 3;
128
13.3k
      }
129
80
      ccfree(indices);
130
80
    } parallel_endfor
131
13
    return element_size * num_blocks * 64 + (input_length + 3) / 4 * 3;
132
24
  } else if (qbits == 7) {
133
72
    
parallel_for12
(i, num_blocks) {
134
72
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
135
72
      int* const indices = ccmalloc(sizeof(int) * nI);
136
72
      double centroids[128];
137
72
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
138
72
      ccv_kmeans1d(&a, 128, indices, centroids);
139
72
      uint8_t* u80 = u8 + (128 * element_size + number_in_blocks / 8 * 7) * i;
140
72
      int j;
141
72
      if (datatype == CCV_16F)
142
24
      {
143
24
        float* f32 = (float*)centroids;
144
3.09k
        for (j = 0; j < 128; 
j++3.07k
)
145
3.07k
          f32[j] = (float)centroids[j];
146
24
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 128);
147
48
      } else if (datatype == CCV_32F) {
148
24
        float* f32 = (float*)u80;
149
3.09k
        for (j = 0; j < 128; 
j++3.07k
)
150
3.07k
          f32[j] = (float)centroids[j];
151
24
      } else {
152
24
        memcpy(u80, centroids, sizeof(double) * 128);
153
24
      }
154
72
      u80 += 128 * element_size;
155
4.33k
      for (j = 0; j < nI; 
j += 84.26k
)
156
4.26k
      {
157
4.26k
        const uint8_t i0 = (uint8_t)indices[j];
158
4.26k
        const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 
00
;
159
4.26k
        const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 
00
;
160
4.26k
        const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 
00
;
161
4.26k
        const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 
00
;
162
4.26k
        const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 
00
;
163
4.26k
        const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 
00
;
164
4.26k
        const uint8_t i7 = j + 7 < nI ? 
(uint8_t)indices[j + 7]4.25k
:
06
;
165
4.26k
        u80[0] = (i0 << 1) | (i1 >> 6);
166
4.26k
        u80[1] = (i1 << 2) | (i2 >> 5);
167
4.26k
        u80[2] = (i2 << 3) | (i3 >> 4);
168
4.26k
        u80[3] = (i3 << 4) | (i4 >> 3);
169
4.26k
        u80[4] = (i4 << 5) | (i5 >> 2);
170
4.26k
        u80[5] = (i5 << 6) | (i6 >> 1);
171
4.26k
        u80[6] = (i6 << 7) | i7;
172
4.26k
        u80 += 7;
173
4.26k
      }
174
72
      ccfree(indices);
175
72
    } parallel_endfor
176
12
    return element_size * num_blocks * 128 + (input_length + 7) / 8 * 7;
177
12
  } else {
178
35
    
parallel_for12
(i, num_blocks) {
179
35
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
180
35
      int* const indices = ccmalloc(sizeof(int) * nI);
181
35
      double centroids[256];
182
35
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
183
35
      ccv_kmeans1d(&a, 256, indices, centroids);
184
35
      uint8_t* u80 = u8 + (256 * element_size + number_in_blocks) * i;
185
35
      int j;
186
35
      if (datatype == CCV_16F)
187
12
      {
188
12
        float* f32 = (float*)centroids;
189
3.08k
        for (j = 0; j < 256; 
j++3.07k
)
190
3.07k
          f32[j] = (float)centroids[j];
191
12
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 256);
192
23
      } else if (datatype == CCV_32F) {
193
11
        float* f32 = (float*)u80;
194
2.82k
        for (j = 0; j < 256; 
j++2.81k
)
195
2.81k
          f32[j] = (float)centroids[j];
196
12
      } else {
197
12
        memcpy(u80, centroids, sizeof(double) * 256);
198
12
      }
199
35
      u80 += 256 * element_size;
200
39.4k
      for (j = 0; j < nI; 
j++39.4k
)
201
39.4k
      {
202
39.4k
        *u80 = (uint8_t)indices[j];
203
39.4k
        ++u80;
204
39.4k
      }
205
35
      ccfree(indices);
206
35
    } parallel_endfor
207
12
    return element_size * num_blocks * 256 + input_length;
208
12
  }
209
63
}
210
211
static void _ccv_nnc_depalettize(const void* input, const int datatype, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length)
212
30
{
213
30
  assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F);
214
30
  const int num_blocks = (output_length + number_in_blocks - 1) / number_in_blocks;
215
30
  const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype);
216
30
  uint8_t* const u8 = (uint8_t*)output;
217
30
  const uint8_t* const ui = (const uint8_t*)input;
218
30
  assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8);
219
30
  if (datatype == CCV_16F)
220
10
  {
221
10
    if (qbits == 4)
222
2
    {
223
46
      
parallel_for2
(i, num_blocks) {
224
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
225
46
        const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i;
226
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
227
46
        const uint16_t* const palette = (uint16_t*)ui0;
228
46
        const uint8_t* ui1 = ui0 + element_size * 16;
229
46
        uint16_t* const f16 = (uint16_t*)u80;
230
46
        int j;
231
46
        if (nI % 2 == 0)
232
45
        {
233
2.87k
          for (j = 0; j < nI; 
j += 22.82k
)
234
2.82k
          {
235
2.82k
            const uint8_t u0 = *ui1;
236
2.82k
            const int i0 = (int)(u0 >> 4);
237
2.82k
            const int i1 = (int)(u0 & 15);
238
2.82k
            f16[j] = palette[i0];
239
2.82k
            f16[j + 1] = palette[i1];
240
2.82k
            ++ui1;
241
2.82k
          }
242
45
        } else {
243
13
          for (j = 0; j < nI; 
j += 212
)
244
12
          {
245
12
            const uint8_t u0 = *ui1;
246
12
            const int i0 = (int)(u0 >> 4);
247
12
            const int i1 = (int)(u0 & 15);
248
12
            f16[j] = palette[i0];
249
12
            if (j + 1 < nI)
250
11
              f16[j + 1] = palette[i1];
251
12
            ++ui1;
252
12
          }
253
1
        }
254
46
      } parallel_endfor
255
8
    } else if (qbits == 5) {
256
46
      
parallel_for2
(i, num_blocks) {
257
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
258
46
        const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i;
259
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
260
46
        const uint16_t* const palette = (uint16_t*)ui0;
261
46
        const uint8_t* ui1 = ui0 + element_size * 32;
262
46
        uint16_t* const f16 = (uint16_t*)u80;
263
46
        int j;
264
46
        if (nI % 8 == 0)
265
45
        {
266
752
          for (j = 0; j < nI; 
j += 8707
)
267
707
          {
268
707
            const uint8_t u0 = ui1[0];
269
707
            const uint8_t u1 = ui1[1];
270
707
            const uint8_t u2 = ui1[2];
271
707
            const uint8_t u3 = ui1[3];
272
707
            const uint8_t u4 = ui1[4];
273
707
            const int i0 = (int)(u0 >> 3);
274
707
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
275
707
            const int i2 = (int)((u1 >> 1) & 31);
276
707
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
277
707
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
278
707
            const int i5 = (int)((u3 >> 2) & 31);
279
707
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
280
707
            const int i7 = (int)(u4 & 31);
281
707
            f16[j] = palette[i0];
282
707
            f16[j + 1] = palette[i1];
283
707
            f16[j + 2] = palette[i2];
284
707
            f16[j + 3] = palette[i3];
285
707
            f16[j + 4] = palette[i4];
286
707
            f16[j + 5] = palette[i5];
287
707
            f16[j + 6] = palette[i6];
288
707
            f16[j + 7] = palette[i7];
289
707
            ui1 += 5;
290
707
          }
291
45
        } else {
292
4
          for (j = 0; j < nI; 
j += 83
)
293
3
          {
294
3
            const uint8_t u0 = ui1[0];
295
3
            const uint8_t u1 = ui1[1];
296
3
            const uint8_t u2 = ui1[2];
297
3
            const uint8_t u3 = ui1[3];
298
3
            const uint8_t u4 = ui1[4];
299
3
            const int i0 = (int)(u0 >> 3);
300
3
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
301
3
            const int i2 = (int)((u1 >> 1) & 31);
302
3
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
303
3
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
304
3
            const int i5 = (int)((u3 >> 2) & 31);
305
3
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
306
3
            const int i7 = (int)(u4 & 31);
307
3
            f16[j] = palette[i0];
308
3
            if (j + 1 < nI)
309
3
              f16[j + 1] = palette[i1];
310
3
            if (j + 2 < nI)
311
3
              f16[j + 2] = palette[i2];
312
3
            if (j + 3 < nI)
313
3
              f16[j + 3] = palette[i3];
314
3
            if (j + 4 < nI)
315
3
              f16[j + 4] = palette[i4];
316
3
            if (j + 5 < nI)
317
3
              f16[j + 5] = palette[i5];
318
3
            if (j + 6 < nI)
319
3
              f16[j + 6] = palette[i6];
320
3
            if (j + 7 < nI)
321
2
              f16[j + 7] = palette[i7];
322
3
            ui1 += 5;
323
3
          }
324
1
        }
325
46
      } parallel_endfor
326
6
    } else if (qbits == 6) {
327
12
      
parallel_for2
(i, num_blocks) {
328
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
329
12
        const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i;
330
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
331
12
        const uint16_t* const palette = (uint16_t*)ui0;
332
12
        const uint8_t* ui1 = ui0 + element_size * 64;
333
12
        uint16_t* const f16 = (uint16_t*)u80;
334
12
        int j;
335
12
        if (nI % 4 == 0)
336
11
        {
337
1.36k
          for (j = 0; j < nI; 
j += 41.35k
)
338
1.35k
          {
339
1.35k
            const uint8_t u0 = ui1[0];
340
1.35k
            const uint8_t u1 = ui1[1];
341
1.35k
            const uint8_t u2 = ui1[2];
342
1.35k
            const int i0 = (int)(u0 >> 2);
343
1.35k
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
344
1.35k
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
345
1.35k
            const int i3 = (int)(u2 & 63);
346
1.35k
            f16[j] = palette[i0];
347
1.35k
            f16[j + 1] = palette[i1];
348
1.35k
            f16[j + 2] = palette[i2];
349
1.35k
            f16[j + 3] = palette[i3];
350
1.35k
            ui1 += 3;
351
1.35k
          }
352
11
        } else {
353
71
          for (j = 0; j < nI; 
j += 470
)
354
70
          {
355
70
            const uint8_t u0 = ui1[0];
356
70
            const uint8_t u1 = ui1[1];
357
70
            const uint8_t u2 = ui1[2];
358
70
            const int i0 = (int)(u0 >> 2);
359
70
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
360
70
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
361
70
            const int i3 = (int)(u2 & 63);
362
70
            f16[j] = palette[i0];
363
70
            if (j + 1 < nI)
364
70
              f16[j + 1] = palette[i1];
365
70
            if (j + 2 < nI)
366
70
              f16[j + 2] = palette[i2];
367
70
            if (j + 3 < nI)
368
69
              f16[j + 3] = palette[i3];
369
70
            ui1 += 3;
370
70
          }
371
1
        }
372
12
      } parallel_endfor
373
4
    } else if (qbits == 7) {
374
12
      
parallel_for2
(i, num_blocks) {
375
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
376
12
        const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i;
377
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
378
12
        const uint16_t* const palette = (uint16_t*)ui0;
379
12
        const uint8_t* ui1 = ui0 + element_size * 128;
380
12
        uint16_t* const f16 = (uint16_t*)u80;
381
12
        int j;
382
12
        if (nI % 8 == 0)
383
11
        {
384
686
          for (j = 0; j < nI; 
j += 8675
)
385
675
          {
386
675
            const uint8_t u0 = ui1[0];
387
675
            const uint8_t u1 = ui1[1];
388
675
            const uint8_t u2 = ui1[2];
389
675
            const uint8_t u3 = ui1[3];
390
675
            const uint8_t u4 = ui1[4];
391
675
            const uint8_t u5 = ui1[5];
392
675
            const uint8_t u6 = ui1[6];
393
675
            const int i0 = (int)(u0 >> 1);
394
675
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
395
675
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
396
675
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
397
675
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
398
675
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
399
675
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
400
675
            const int i7 = (int)(u6 & 127);
401
675
            f16[j] = palette[i0];
402
675
            f16[j + 1] = palette[i1];
403
675
            f16[j + 2] = palette[i2];
404
675
            f16[j + 3] = palette[i3];
405
675
            f16[j + 4] = palette[i4];
406
675
            f16[j + 5] = palette[i5];
407
675
            f16[j + 6] = palette[i6];
408
675
            f16[j + 7] = palette[i7];
409
675
            ui1 += 7;
410
675
          }
411
11
        } else {
412
36
          for (j = 0; j < nI; 
j += 835
)
413
35
          {
414
35
            const uint8_t u0 = ui1[0];
415
35
            const uint8_t u1 = ui1[1];
416
35
            const uint8_t u2 = ui1[2];
417
35
            const uint8_t u3 = ui1[3];
418
35
            const uint8_t u4 = ui1[4];
419
35
            const uint8_t u5 = ui1[5];
420
35
            const uint8_t u6 = ui1[6];
421
35
            const int i0 = (int)(u0 >> 1);
422
35
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
423
35
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
424
35
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
425
35
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
426
35
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
427
35
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
428
35
            const int i7 = (int)(u6 & 127);
429
35
            f16[j] = palette[i0];
430
35
            if (j + 1 < nI)
431
35
              f16[j + 1] = palette[i1];
432
35
            if (j + 2 < nI)
433
35
              f16[j + 2] = palette[i2];
434
35
            if (j + 3 < nI)
435
35
              f16[j + 3] = palette[i3];
436
35
            if (j + 4 < nI)
437
35
              f16[j + 4] = palette[i4];
438
35
            if (j + 5 < nI)
439
35
              f16[j + 5] = palette[i5];
440
35
            if (j + 6 < nI)
441
35
              f16[j + 6] = palette[i6];
442
35
            if (j + 7 < nI)
443
34
              f16[j + 7] = palette[i7];
444
35
            ui1 += 7;
445
35
          }
446
1
        }
447
12
      } parallel_endfor
448
2
    } else {
449
6
      
parallel_for2
(i, num_blocks) {
450
6
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
451
6
        const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i;
452
6
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
453
6
        const uint16_t* const palette = (uint16_t*)ui0;
454
6
        const uint8_t* ui1 = ui0 + element_size * 256;
455
6
        uint16_t* const f16 = (uint16_t*)u80;
456
6
        int j;
457
5.68k
        for (j = 0; j < nI; 
j++5.67k
)
458
5.67k
        {
459
5.67k
          const uint8_t u0 = *ui1;
460
5.67k
          f16[j] = palette[u0];
461
5.67k
          ++ui1;
462
5.67k
        }
463
6
      } parallel_endfor
464
2
    }
465
20
  } else if (datatype == CCV_32F) {
466
10
    if (qbits == 4)
467
2
    {
468
46
      
parallel_for2
(i, num_blocks) {
469
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
470
46
        const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i;
471
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
472
46
        const float* const palette = (float*)ui0;
473
46
        const uint8_t* ui1 = ui0 + element_size * 16;
474
46
        float* const f32 = (float*)u80;
475
46
        int j;
476
46
        if (nI % 2 == 0)
477
45
        {
478
2.87k
          for (j = 0; j < nI; 
j += 22.82k
)
479
2.82k
          {
480
2.82k
            const uint8_t u0 = *ui1;
481
2.82k
            const int i0 = (int)(u0 >> 4);
482
2.82k
            const int i1 = (int)(u0 & 15);
483
2.82k
            f32[j] = palette[i0];
484
2.82k
            f32[j + 1] = palette[i1];
485
2.82k
            ++ui1;
486
2.82k
          }
487
45
        } else {
488
13
          for (j = 0; j < nI; 
j += 212
)
489
12
          {
490
12
            const uint8_t u0 = *ui1;
491
12
            const int i0 = (int)(u0 >> 4);
492
12
            const int i1 = (int)(u0 & 15);
493
12
            f32[j] = palette[i0];
494
12
            if (j + 1 < nI)
495
11
              f32[j + 1] = palette[i1];
496
12
            ++ui1;
497
12
          }
498
1
        }
499
46
      } parallel_endfor
500
8
    } else if (qbits == 5) {
501
46
      
parallel_for2
(i, num_blocks) {
502
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
503
46
        const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i;
504
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
505
46
        const float* const palette = (float*)ui0;
506
46
        const uint8_t* ui1 = ui0 + element_size * 32;
507
46
        float* const f32 = (float*)u80;
508
46
        int j;
509
46
        if (nI % 8 == 0)
510
45
        {
511
752
          for (j = 0; j < nI; 
j += 8707
)
512
707
          {
513
707
            const uint8_t u0 = ui1[0];
514
707
            const uint8_t u1 = ui1[1];
515
707
            const uint8_t u2 = ui1[2];
516
707
            const uint8_t u3 = ui1[3];
517
707
            const uint8_t u4 = ui1[4];
518
707
            const int i0 = (int)(u0 >> 3);
519
707
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
520
707
            const int i2 = (int)((u1 >> 1) & 31);
521
707
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
522
707
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
523
707
            const int i5 = (int)((u3 >> 2) & 31);
524
707
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
525
707
            const int i7 = (int)(u4 & 31);
526
707
            f32[j] = palette[i0];
527
707
            f32[j + 1] = palette[i1];
528
707
            f32[j + 2] = palette[i2];
529
707
            f32[j + 3] = palette[i3];
530
707
            f32[j + 4] = palette[i4];
531
707
            f32[j + 5] = palette[i5];
532
707
            f32[j + 6] = palette[i6];
533
707
            f32[j + 7] = palette[i7];
534
707
            ui1 += 5;
535
707
          }
536
45
        } else {
537
4
          for (j = 0; j < nI; 
j += 83
)
538
3
          {
539
3
            const uint8_t u0 = ui1[0];
540
3
            const uint8_t u1 = ui1[1];
541
3
            const uint8_t u2 = ui1[2];
542
3
            const uint8_t u3 = ui1[3];
543
3
            const uint8_t u4 = ui1[4];
544
3
            const int i0 = (int)(u0 >> 3);
545
3
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
546
3
            const int i2 = (int)((u1 >> 1) & 31);
547
3
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
548
3
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
549
3
            const int i5 = (int)((u3 >> 2) & 31);
550
3
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
551
3
            const int i7 = (int)(u4 & 31);
552
3
            f32[j] = palette[i0];
553
3
            if (j + 1 < nI)
554
3
              f32[j + 1] = palette[i1];
555
3
            if (j + 2 < nI)
556
3
              f32[j + 2] = palette[i2];
557
3
            if (j + 3 < nI)
558
3
              f32[j + 3] = palette[i3];
559
3
            if (j + 4 < nI)
560
3
              f32[j + 4] = palette[i4];
561
3
            if (j + 5 < nI)
562
3
              f32[j + 5] = palette[i5];
563
3
            if (j + 6 < nI)
564
3
              f32[j + 6] = palette[i6];
565
3
            if (j + 7 < nI)
566
2
              f32[j + 7] = palette[i7];
567
3
            ui1 += 5;
568
3
          }
569
1
        }
570
46
      } parallel_endfor
571
6
    } else if (qbits == 6) {
572
12
      
parallel_for2
(i, num_blocks) {
573
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
574
12
        const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i;
575
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
576
12
        const float* const palette = (float*)ui0;
577
12
        const uint8_t* ui1 = ui0 + element_size * 64;
578
12
        float* const f32 = (float*)u80;
579
12
        int j;
580
12
        if (nI % 4 == 0)
581
11
        {
582
1.36k
          for (j = 0; j < nI; 
j += 41.35k
)
583
1.35k
          {
584
1.35k
            const uint8_t u0 = ui1[0];
585
1.35k
            const uint8_t u1 = ui1[1];
586
1.35k
            const uint8_t u2 = ui1[2];
587
1.35k
            const int i0 = (int)(u0 >> 2);
588
1.35k
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
589
1.35k
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
590
1.35k
            const int i3 = (int)(u2 & 63);
591
1.35k
            f32[j] = palette[i0];
592
1.35k
            f32[j + 1] = palette[i1];
593
1.35k
            f32[j + 2] = palette[i2];
594
1.35k
            f32[j + 3] = palette[i3];
595
1.35k
            ui1 += 3;
596
1.35k
          }
597
11
        } else {
598
71
          for (j = 0; j < nI; 
j += 470
)
599
70
          {
600
70
            const uint8_t u0 = ui1[0];
601
70
            const uint8_t u1 = ui1[1];
602
70
            const uint8_t u2 = ui1[2];
603
70
            const int i0 = (int)(u0 >> 2);
604
70
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
605
70
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
606
70
            const int i3 = (int)(u2 & 63);
607
70
            f32[j] = palette[i0];
608
70
            if (j + 1 < nI)
609
70
              f32[j + 1] = palette[i1];
610
70
            if (j + 2 < nI)
611
70
              f32[j + 2] = palette[i2];
612
70
            if (j + 3 < nI)
613
69
              f32[j + 3] = palette[i3];
614
70
            ui1 += 3;
615
70
          }
616
1
        }
617
12
      } parallel_endfor
618
4
    } else if (qbits == 7) {
619
12
      
parallel_for2
(i, num_blocks) {
620
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
621
12
        const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i;
622
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
623
12
        const float* const palette = (float*)ui0;
624
12
        const uint8_t* ui1 = ui0 + element_size * 128;
625
12
        float* const f32 = (float*)u80;
626
12
        int j;
627
12
        if (nI % 8 == 0)
628
11
        {
629
686
          for (j = 0; j < nI; 
j += 8675
)
630
675
          {
631
675
            const uint8_t u0 = ui1[0];
632
675
            const uint8_t u1 = ui1[1];
633
675
            const uint8_t u2 = ui1[2];
634
675
            const uint8_t u3 = ui1[3];
635
675
            const uint8_t u4 = ui1[4];
636
675
            const uint8_t u5 = ui1[5];
637
675
            const uint8_t u6 = ui1[6];
638
675
            const int i0 = (int)(u0 >> 1);
639
675
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
640
675
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
641
675
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
642
675
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
643
675
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
644
675
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
645
675
            const int i7 = (int)(u6 & 127);
646
675
            f32[j] = palette[i0];
647
675
            f32[j + 1] = palette[i1];
648
675
            f32[j + 2] = palette[i2];
649
675
            f32[j + 3] = palette[i3];
650
675
            f32[j + 4] = palette[i4];
651
675
            f32[j + 5] = palette[i5];
652
675
            f32[j + 6] = palette[i6];
653
675
            f32[j + 7] = palette[i7];
654
675
            ui1 += 7;
655
675
          }
656
11
        } else {
657
36
          for (j = 0; j < nI; 
j += 835
)
658
35
          {
659
35
            const uint8_t u0 = ui1[0];
660
35
            const uint8_t u1 = ui1[1];
661
35
            const uint8_t u2 = ui1[2];
662
35
            const uint8_t u3 = ui1[3];
663
35
            const uint8_t u4 = ui1[4];
664
35
            const uint8_t u5 = ui1[5];
665
35
            const uint8_t u6 = ui1[6];
666
35
            const int i0 = (int)(u0 >> 1);
667
35
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
668
35
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
669
35
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
670
35
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
671
35
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
672
35
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
673
35
            const int i7 = (int)(u6 & 127);
674
35
            f32[j] = palette[i0];
675
35
            if (j + 1 < nI)
676
35
              f32[j + 1] = palette[i1];
677
35
            if (j + 2 < nI)
678
35
              f32[j + 2] = palette[i2];
679
35
            if (j + 3 < nI)
680
35
              f32[j + 3] = palette[i3];
681
35
            if (j + 4 < nI)
682
35
              f32[j + 4] = palette[i4];
683
35
            if (j + 5 < nI)
684
35
              f32[j + 5] = palette[i5];
685
35
            if (j + 6 < nI)
686
35
              f32[j + 6] = palette[i6];
687
35
            if (j + 7 < nI)
688
34
              f32[j + 7] = palette[i7];
689
35
            ui1 += 7;
690
35
          }
691
1
        }
692
12
      } parallel_endfor
693
2
    } else {
694
6
      
parallel_for2
(i, num_blocks) {
695
6
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
696
6
        const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i;
697
6
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
698
6
        const float* const palette = (float*)ui0;
699
6
        const uint8_t* ui1 = ui0 + element_size * 256;
700
6
        float* const f32 = (float*)u80;
701
6
        int j;
702
5.68k
        for (j = 0; j < nI; 
j++5.67k
)
703
5.67k
        {
704
5.67k
          const uint8_t u0 = *ui1;
705
5.67k
          f32[j] = palette[u0];
706
5.67k
          ++ui1;
707
5.67k
        }
708
6
      } parallel_endfor
709
2
    }
710
10
  } else {
711
10
    if (qbits == 4)
712
2
    {
713
46
      
parallel_for2
(i, num_blocks) {
714
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
715
46
        const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i;
716
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
717
46
        const double* const palette = (double*)ui0;
718
46
        const uint8_t* ui1 = ui0 + element_size * 16;
719
46
        double* const f64 = (double*)u80;
720
46
        int j;
721
46
        if (nI % 2 == 0)
722
45
        {
723
2.87k
          for (j = 0; j < nI; 
j += 22.82k
)
724
2.82k
          {
725
2.82k
            const uint8_t u0 = *ui1;
726
2.82k
            const int i0 = (int)(u0 >> 4);
727
2.82k
            const int i1 = (int)(u0 & 15);
728
2.82k
            f64[j] = palette[i0];
729
2.82k
            f64[j + 1] = palette[i1];
730
2.82k
            ++ui1;
731
2.82k
          }
732
45
        } else {
733
13
          for (j = 0; j < nI; 
j += 212
)
734
12
          {
735
12
            const uint8_t u0 = *ui1;
736
12
            const int i0 = (int)(u0 >> 4);
737
12
            const int i1 = (int)(u0 & 15);
738
12
            f64[j] = palette[i0];
739
12
            if (j + 1 < nI)
740
11
              f64[j + 1] = palette[i1];
741
12
            ++ui1;
742
12
          }
743
1
        }
744
46
      } parallel_endfor
745
8
    } else if (qbits == 5) {
746
46
      
parallel_for2
(i, num_blocks) {
747
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
748
46
        const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i;
749
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
750
46
        const double* const palette = (double*)ui0;
751
46
        const uint8_t* ui1 = ui0 + element_size * 32;
752
46
        double* const f64 = (double*)u80;
753
46
        int j;
754
46
        if (nI % 8 == 0)
755
45
        {
756
752
          for (j = 0; j < nI; 
j += 8707
)
757
707
          {
758
707
            const uint8_t u0 = ui1[0];
759
707
            const uint8_t u1 = ui1[1];
760
707
            const uint8_t u2 = ui1[2];
761
707
            const uint8_t u3 = ui1[3];
762
707
            const uint8_t u4 = ui1[4];
763
707
            const int i0 = (int)(u0 >> 3);
764
707
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
765
707
            const int i2 = (int)((u1 >> 1) & 31);
766
707
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
767
707
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
768
707
            const int i5 = (int)((u3 >> 2) & 31);
769
707
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
770
707
            const int i7 = (int)(u4 & 31);
771
707
            f64[j] = palette[i0];
772
707
            f64[j + 1] = palette[i1];
773
707
            f64[j + 2] = palette[i2];
774
707
            f64[j + 3] = palette[i3];
775
707
            f64[j + 4] = palette[i4];
776
707
            f64[j + 5] = palette[i5];
777
707
            f64[j + 6] = palette[i6];
778
707
            f64[j + 7] = palette[i7];
779
707
            ui1 += 5;
780
707
          }
781
45
        } else {
782
4
          for (j = 0; j < nI; 
j += 83
)
783
3
          {
784
3
            const uint8_t u0 = ui1[0];
785
3
            const uint8_t u1 = ui1[1];
786
3
            const uint8_t u2 = ui1[2];
787
3
            const uint8_t u3 = ui1[3];
788
3
            const uint8_t u4 = ui1[4];
789
3
            const int i0 = (int)(u0 >> 3);
790
3
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
791
3
            const int i2 = (int)((u1 >> 1) & 31);
792
3
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
793
3
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
794
3
            const int i5 = (int)((u3 >> 2) & 31);
795
3
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
796
3
            const int i7 = (int)(u4 & 31);
797
3
            f64[j] = palette[i0];
798
3
            if (j + 1 < nI)
799
3
              f64[j + 1] = palette[i1];
800
3
            if (j + 2 < nI)
801
3
              f64[j + 2] = palette[i2];
802
3
            if (j + 3 < nI)
803
3
              f64[j + 3] = palette[i3];
804
3
            if (j + 4 < nI)
805
3
              f64[j + 4] = palette[i4];
806
3
            if (j + 5 < nI)
807
3
              f64[j + 5] = palette[i5];
808
3
            if (j + 6 < nI)
809
3
              f64[j + 6] = palette[i6];
810
3
            if (j + 7 < nI)
811
2
              f64[j + 7] = palette[i7];
812
3
            ui1 += 5;
813
3
          }
814
1
        }
815
46
      } parallel_endfor
816
6
    } else if (qbits == 6) {
817
12
      
parallel_for2
(i, num_blocks) {
818
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
819
12
        const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i;
820
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
821
12
        const double* const palette = (double*)ui0;
822
12
        const uint8_t* ui1 = ui0 + element_size * 64;
823
12
        double* const f64 = (double*)u80;
824
12
        int j;
825
12
        if (nI % 4 == 0)
826
11
        {
827
1.36k
          for (j = 0; j < nI; 
j += 41.35k
)
828
1.35k
          {
829
1.35k
            const uint8_t u0 = ui1[0];
830
1.35k
            const uint8_t u1 = ui1[1];
831
1.35k
            const uint8_t u2 = ui1[2];
832
1.35k
            const int i0 = (int)(u0 >> 2);
833
1.35k
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
834
1.35k
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
835
1.35k
            const int i3 = (int)(u2 & 63);
836
1.35k
            f64[j] = palette[i0];
837
1.35k
            f64[j + 1] = palette[i1];
838
1.35k
            f64[j + 2] = palette[i2];
839
1.35k
            f64[j + 3] = palette[i3];
840
1.35k
            ui1 += 3;
841
1.35k
          }
842
11
        } else {
843
71
          for (j = 0; j < nI; 
j += 470
)
844
70
          {
845
70
            const uint8_t u0 = ui1[0];
846
70
            const uint8_t u1 = ui1[1];
847
70
            const uint8_t u2 = ui1[2];
848
70
            const int i0 = (int)(u0 >> 2);
849
70
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
850
70
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
851
70
            const int i3 = (int)(u2 & 63);
852
70
            f64[j] = palette[i0];
853
70
            if (j + 1 < nI)
854
70
              f64[j + 1] = palette[i1];
855
70
            if (j + 2 < nI)
856
70
              f64[j + 2] = palette[i2];
857
70
            if (j + 3 < nI)
858
69
              f64[j + 3] = palette[i3];
859
70
            ui1 += 3;
860
70
          }
861
1
        }
862
12
      } parallel_endfor
863
4
    } else if (qbits == 7) {
864
12
      
parallel_for2
(i, num_blocks) {
865
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
866
12
        const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i;
867
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
868
12
        const double* const palette = (double*)ui0;
869
12
        const uint8_t* ui1 = ui0 + element_size * 128;
870
12
        double* const f64 = (double*)u80;
871
12
        int j;
872
12
        if (nI % 8 == 0)
873
11
        {
874
686
          for (j = 0; j < nI; 
j += 8675
)
875
675
          {
876
675
            const uint8_t u0 = ui1[0];
877
675
            const uint8_t u1 = ui1[1];
878
675
            const uint8_t u2 = ui1[2];
879
675
            const uint8_t u3 = ui1[3];
880
675
            const uint8_t u4 = ui1[4];
881
675
            const uint8_t u5 = ui1[5];
882
675
            const uint8_t u6 = ui1[6];
883
675
            const int i0 = (int)(u0 >> 1);
884
675
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
885
675
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
886
675
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
887
675
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
888
675
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
889
675
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
890
675
            const int i7 = (int)(u6 & 127);
891
675
            f64[j] = palette[i0];
892
675
            f64[j + 1] = palette[i1];
893
675
            f64[j + 2] = palette[i2];
894
675
            f64[j + 3] = palette[i3];
895
675
            f64[j + 4] = palette[i4];
896
675
            f64[j + 5] = palette[i5];
897
675
            f64[j + 6] = palette[i6];
898
675
            f64[j + 7] = palette[i7];
899
675
            ui1 += 7;
900
675
          }
901
11
        } else {
902
36
          for (j = 0; j < nI; 
j += 835
)
903
35
          {
904
35
            const uint8_t u0 = ui1[0];
905
35
            const uint8_t u1 = ui1[1];
906
35
            const uint8_t u2 = ui1[2];
907
35
            const uint8_t u3 = ui1[3];
908
35
            const uint8_t u4 = ui1[4];
909
35
            const uint8_t u5 = ui1[5];
910
35
            const uint8_t u6 = ui1[6];
911
35
            const int i0 = (int)(u0 >> 1);
912
35
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
913
35
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
914
35
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
915
35
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
916
35
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
917
35
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
918
35
            const int i7 = (int)(u6 & 127);
919
35
            f64[j] = palette[i0];
920
35
            if (j + 1 < nI)
921
35
              f64[j + 1] = palette[i1];
922
35
            if (j + 2 < nI)
923
35
              f64[j + 2] = palette[i2];
924
35
            if (j + 3 < nI)
925
35
              f64[j + 3] = palette[i3];
926
35
            if (j + 4 < nI)
927
35
              f64[j + 4] = palette[i4];
928
35
            if (j + 5 < nI)
929
35
              f64[j + 5] = palette[i5];
930
35
            if (j + 6 < nI)
931
35
              f64[j + 6] = palette[i6];
932
35
            if (j + 7 < nI)
933
34
              f64[j + 7] = palette[i7];
934
35
            ui1 += 7;
935
35
          }
936
1
        }
937
12
      } parallel_endfor
938
2
    } else {
939
6
      
parallel_for2
(i, num_blocks) {
940
6
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
941
6
        const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i;
942
6
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
943
6
        const double* const palette = (double*)ui0;
944
6
        const uint8_t* ui1 = ui0 + element_size * 256;
945
6
        double* const f64 = (double*)u80;
946
6
        int j;
947
5.68k
        for (j = 0; j < nI; 
j++5.67k
)
948
5.67k
        {
949
5.67k
          const uint8_t u0 = *ui1;
950
5.67k
          f64[j] = palette[u0];
951
5.67k
          ++ui1;
952
5.67k
        }
953
6
      } parallel_endfor
954
2
    }
955
10
  }
956
30
}
957
958
/**
 * Public entry point for depalettizing: picks the decoder that matches where
 * the buffers live and forwards the arguments to it unchanged.
 *
 * @param input            Palettized source buffer; passed through as-is.
 * @param datatype         Element data type; passed through as-is.
 * @param memory_type      Must be CCV_TENSOR_CPU_MEMORY or CCV_TENSOR_GPU_MEMORY
 *                         (asserted). Only used here to choose the backend; it
 *                         is not forwarded to any of the decoders.
 * @param input_length     Passed through as-is.
 * @param qbits            Passed through as-is.
 * @param number_in_blocks Passed through as-is.
 * @param output           Destination buffer; passed through as-is.
 * @param output_length    Passed through as-is.
 *
 * Backend selection for non-CPU memory happens at compile time: HAVE_CUDA
 * selects ccv_nnc_compat_depalettize, otherwise HAVE_MPS selects
 * ccv_nnc_mps_depalettize. A build with neither macro has no GPU decoder.
 */
void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length)
959
60
{
960
60
  assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY); /* no other memory types are handled below */
961
60
  if (memory_type == CCV_TENSOR_CPU_MEMORY)
962
30
    _ccv_nnc_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length); /* host-side decoder */
963
30
  else {
964
30
#ifdef HAVE_CUDA
965
30
    ccv_nnc_compat_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0); /* NOTE(review): trailing 0 is presumably a stream/context argument - confirm against the CUDA compat header */
966
#elif defined(HAVE_MPS)
967
    ccv_nnc_mps_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0); /* NOTE(review): trailing 0 presumably mirrors the CUDA call's last argument - confirm against the MPS header */
968
#else
969
    assert(memory_type == CCV_TENSOR_CPU_MEMORY); /* memory_type is not CPU on this path, so this always trips when assertions are enabled; with assertions compiled out, output is left untouched */
970
#endif
971
30
  }
972
60
}