Coverage Report

Created: 2026-04-14 20:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_palettize.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#ifdef HAVE_CUDA
4
#include "gpu/ccv_nnc_compat.h"
5
#elif defined(HAVE_MPS)
6
#include "mps/ccv_nnc_mps.h"
7
#endif
8
9
size_t ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length)
10
63
{
11
63
  assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F);
12
63
  assert(memory_type == CCV_TENSOR_CPU_MEMORY);
13
63
  const int num_blocks = (input_length + number_in_blocks - 1) / number_in_blocks;
14
63
  const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype);
15
63
  uint8_t* const u8 = (uint8_t*)output;
16
63
  uint8_t* const ui = (uint8_t*)input;
17
63
  assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8);
18
63
  if (qbits == 4)
19
14
  {
20
278
    
parallel_for14
(i, num_blocks) {
21
278
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
22
278
      int* const indices = ccmalloc(sizeof(int) * nI);
23
278
      double centroids[16];
24
278
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
25
278
      ccv_kmeans1d(&a, 16, indices, centroids);
26
278
      uint8_t* u80 = u8 + (16 * element_size + number_in_blocks / 2) * i;
27
278
      int j;
28
278
      if (datatype == CCV_16F)
29
92
      {
30
92
        float* f32 = (float*)centroids;
31
1.56k
        for (j = 0; j < 16; 
j++1.47k
)
32
1.47k
          f32[j] = (float)centroids[j];
33
92
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 16);
34
186
      } else if (datatype == CCV_16BF) {
35
0
        float* f32 = (float*)centroids;
36
0
        for (j = 0; j < 16; j++)
37
0
          f32[j] = (float)centroids[j];
38
0
        ccv_float_to_bfloat(f32, (uint16_t*)u80, 16);
39
186
      } else if (datatype == CCV_32F) {
40
94
        float* f32 = (float*)u80;
41
1.59k
        for (j = 0; j < 16; 
j++1.50k
)
42
1.50k
          f32[j] = (float)centroids[j];
43
94
      } else {
44
92
        memcpy(u80, centroids, sizeof(double) * 16);
45
92
      }
46
278
      u80 += 16 * element_size;
47
17.3k
      for (j = 0; j < nI; 
j += 217.0k
)
48
17.0k
      {
49
17.0k
        const uint8_t i0 = (uint8_t)indices[j];
50
17.0k
        const uint8_t i1 = j + 1 < nI ? 
(uint8_t)indices[j + 1]17.0k
:
06
;
51
17.0k
        *u80 = (i0 << 4) | i1;
52
17.0k
        ++u80;
53
17.0k
      }
54
278
      ccfree(indices);
55
278
    } parallel_endfor
56
14
    return element_size * num_blocks * 16 + (input_length + 1) / 2;
57
49
  } else if (qbits == 5) {
58
276
    
parallel_for12
(i, num_blocks) {
59
276
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
60
276
      int* const indices = ccmalloc(sizeof(int) * nI);
61
276
      double centroids[32];
62
276
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
63
276
      ccv_kmeans1d(&a, 32, indices, centroids);
64
276
      uint8_t* u80 = u8 + (32 * element_size + number_in_blocks / 8 * 5) * i;
65
276
      int j;
66
276
      if (datatype == CCV_16F)
67
92
      {
68
92
        float* f32 = (float*)centroids;
69
3.03k
        for (j = 0; j < 32; 
j++2.94k
)
70
2.94k
          f32[j] = (float)centroids[j];
71
92
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 32);
72
184
      } else if (datatype == CCV_16BF) {
73
0
        float* f32 = (float*)centroids;
74
0
        for (j = 0; j < 32; j++)
75
0
          f32[j] = (float)centroids[j];
76
0
        ccv_float_to_bfloat(f32, (uint16_t*)u80, 32);
77
184
      } else if (datatype == CCV_32F) {
78
92
        float* f32 = (float*)u80;
79
3.03k
        for (j = 0; j < 32; 
j++2.94k
)
80
2.94k
          f32[j] = (float)centroids[j];
81
92
      } else {
82
92
        memcpy(u80, centroids, sizeof(double) * 32);
83
92
      }
84
276
      u80 += 32 * element_size;
85
4.53k
      for (j = 0; j < nI; 
j += 84.26k
)
86
4.26k
      {
87
4.26k
        const uint8_t i0 = (uint8_t)indices[j];
88
4.26k
        const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 
00
;
89
4.26k
        const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 
00
;
90
4.26k
        const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 
00
;
91
4.26k
        const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 
00
;
92
4.26k
        const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 
00
;
93
4.26k
        const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 
00
;
94
4.26k
        const uint8_t i7 = j + 7 < nI ? 
(uint8_t)indices[j + 7]4.25k
:
06
;
95
4.26k
        u80[0] = (i0 << 3) | (i1 >> 2);
96
4.26k
        u80[1] = (i1 << 6) | (i2 << 1) | (i3 >> 4);
97
4.26k
        u80[2] = (i3 << 4) | (i4 >> 1);
98
4.26k
        u80[3] = (i4 << 7) | (i5 << 2) | (i6 >> 3);
99
4.26k
        u80[4] = (i6 << 5) | i7;
100
4.26k
        u80 += 5;
101
4.26k
      }
102
276
      ccfree(indices);
103
276
    } parallel_endfor
104
12
    return element_size * num_blocks * 32 + (input_length + 7) / 8 * 5;
105
37
  } else if (qbits == 6) {
106
80
    
parallel_for13
(i, num_blocks) {
107
80
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
108
80
      int* const indices = ccmalloc(sizeof(int) * nI);
109
80
      double centroids[64];
110
80
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
111
80
      ccv_kmeans1d(&a, 64, indices, centroids);
112
80
      uint8_t* u80 = u8 + (64 * element_size + number_in_blocks / 4 * 3) * i;
113
80
      int j;
114
80
      if (datatype == CCV_16F)
115
32
      {
116
32
        float* f32 = (float*)centroids;
117
2.08k
        for (j = 0; j < 64; 
j++2.04k
)
118
2.04k
          f32[j] = (float)centroids[j];
119
32
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 64);
120
48
      } else if (datatype == CCV_16BF) {
121
0
        float* f32 = (float*)centroids;
122
0
        for (j = 0; j < 64; j++)
123
0
          f32[j] = (float)centroids[j];
124
0
        ccv_float_to_bfloat(f32, (uint16_t*)u80, 64);
125
48
      } else if (datatype == CCV_32F) {
126
24
        float* f32 = (float*)u80;
127
1.56k
        for (j = 0; j < 64; 
j++1.53k
)
128
1.53k
          f32[j] = (float)centroids[j];
129
24
      } else {
130
24
        memcpy(u80, centroids, sizeof(double) * 64);
131
24
      }
132
80
      u80 += 64 * element_size;
133
13.4k
      for (j = 0; j < nI; 
j += 413.3k
)
134
13.3k
      {
135
13.3k
        const uint8_t i0 = (uint8_t)indices[j];
136
13.3k
        const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 
00
;
137
13.3k
        const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 
00
;
138
13.3k
        const uint8_t i3 = j + 3 < nI ? 
(uint8_t)indices[j + 3]13.3k
:
06
;
139
13.3k
        u80[0] = (i0 << 2) | (i1 >> 4);
140
13.3k
        u80[1] = (i1 << 4) | (i2 >> 2);
141
13.3k
        u80[2] = (i2 << 6) | i3;
142
13.3k
        u80 += 3;
143
13.3k
      }
144
80
      ccfree(indices);
145
80
    } parallel_endfor
146
13
    return element_size * num_blocks * 64 + (input_length + 3) / 4 * 3;
147
24
  } else if (qbits == 7) {
148
72
    
parallel_for12
(i, num_blocks) {
149
72
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
150
72
      int* const indices = ccmalloc(sizeof(int) * nI);
151
72
      double centroids[128];
152
72
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
153
72
      ccv_kmeans1d(&a, 128, indices, centroids);
154
72
      uint8_t* u80 = u8 + (128 * element_size + number_in_blocks / 8 * 7) * i;
155
72
      int j;
156
72
      if (datatype == CCV_16F)
157
24
      {
158
24
        float* f32 = (float*)centroids;
159
3.09k
        for (j = 0; j < 128; 
j++3.07k
)
160
3.07k
          f32[j] = (float)centroids[j];
161
24
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 128);
162
48
      } else if (datatype == CCV_16BF) {
163
0
        float* f32 = (float*)centroids;
164
0
        for (j = 0; j < 128; j++)
165
0
          f32[j] = (float)centroids[j];
166
0
        ccv_float_to_bfloat(f32, (uint16_t*)u80, 128);
167
48
      } else if (datatype == CCV_32F) {
168
24
        float* f32 = (float*)u80;
169
3.09k
        for (j = 0; j < 128; 
j++3.07k
)
170
3.07k
          f32[j] = (float)centroids[j];
171
24
      } else {
172
24
        memcpy(u80, centroids, sizeof(double) * 128);
173
24
      }
174
72
      u80 += 128 * element_size;
175
4.33k
      for (j = 0; j < nI; 
j += 84.26k
)
176
4.26k
      {
177
4.26k
        const uint8_t i0 = (uint8_t)indices[j];
178
4.26k
        const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 
00
;
179
4.26k
        const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 
00
;
180
4.26k
        const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 
00
;
181
4.26k
        const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 
00
;
182
4.26k
        const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 
00
;
183
4.26k
        const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 
00
;
184
4.26k
        const uint8_t i7 = j + 7 < nI ? 
(uint8_t)indices[j + 7]4.25k
:
06
;
185
4.26k
        u80[0] = (i0 << 1) | (i1 >> 6);
186
4.26k
        u80[1] = (i1 << 2) | (i2 >> 5);
187
4.26k
        u80[2] = (i2 << 3) | (i3 >> 4);
188
4.26k
        u80[3] = (i3 << 4) | (i4 >> 3);
189
4.26k
        u80[4] = (i4 << 5) | (i5 >> 2);
190
4.26k
        u80[5] = (i5 << 6) | (i6 >> 1);
191
4.26k
        u80[6] = (i6 << 7) | i7;
192
4.26k
        u80 += 7;
193
4.26k
      }
194
72
      ccfree(indices);
195
72
    } parallel_endfor
196
12
    return element_size * num_blocks * 128 + (input_length + 7) / 8 * 7;
197
12
  } else {
198
35
    
parallel_for12
(i, num_blocks) {
199
35
      const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks);
200
35
      int* const indices = ccmalloc(sizeof(int) * nI);
201
35
      double centroids[256];
202
35
      ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0);
203
35
      ccv_kmeans1d(&a, 256, indices, centroids);
204
35
      uint8_t* u80 = u8 + (256 * element_size + number_in_blocks) * i;
205
35
      int j;
206
35
      if (datatype == CCV_16F)
207
12
      {
208
12
        float* f32 = (float*)centroids;
209
3.08k
        for (j = 0; j < 256; 
j++3.07k
)
210
3.07k
          f32[j] = (float)centroids[j];
211
12
        ccv_float_to_half_precision(f32, (uint16_t*)u80, 256);
212
23
      } else if (datatype == CCV_16BF) {
213
0
        float* f32 = (float*)centroids;
214
0
        for (j = 0; j < 256; j++)
215
0
          f32[j] = (float)centroids[j];
216
0
        ccv_float_to_bfloat(f32, (uint16_t*)u80, 256);
217
23
      } else if (datatype == CCV_32F) {
218
11
        float* f32 = (float*)u80;
219
2.82k
        for (j = 0; j < 256; 
j++2.81k
)
220
2.81k
          f32[j] = (float)centroids[j];
221
12
      } else {
222
12
        memcpy(u80, centroids, sizeof(double) * 256);
223
12
      }
224
35
      u80 += 256 * element_size;
225
39.4k
      for (j = 0; j < nI; 
j++39.4k
)
226
39.4k
      {
227
39.4k
        *u80 = (uint8_t)indices[j];
228
39.4k
        ++u80;
229
39.4k
      }
230
35
      ccfree(indices);
231
35
    } parallel_endfor
232
12
    return element_size * num_blocks * 256 + input_length;
233
12
  }
234
63
}
235
236
static void _ccv_nnc_depalettize(const void* input, const int datatype, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length)
237
30
{
238
30
  assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F);
239
30
  const int num_blocks = (output_length + number_in_blocks - 1) / number_in_blocks;
240
30
  const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype);
241
30
  uint8_t* const u8 = (uint8_t*)output;
242
30
  const uint8_t* const ui = (const uint8_t*)input;
243
30
  assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8);
244
30
  if (datatype == CCV_16F || 
datatype == CCV_16BF20
)
245
10
  {
246
10
    if (qbits == 4)
247
2
    {
248
46
      
parallel_for2
(i, num_blocks) {
249
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
250
46
        const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i;
251
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
252
46
        const uint16_t* const palette = (uint16_t*)ui0;
253
46
        const uint8_t* ui1 = ui0 + element_size * 16;
254
46
        uint16_t* const f16 = (uint16_t*)u80;
255
46
        int j;
256
46
        if (nI % 2 == 0)
257
45
        {
258
2.87k
          for (j = 0; j < nI; 
j += 22.82k
)
259
2.82k
          {
260
2.82k
            const uint8_t u0 = *ui1;
261
2.82k
            const int i0 = (int)(u0 >> 4);
262
2.82k
            const int i1 = (int)(u0 & 15);
263
2.82k
            f16[j] = palette[i0];
264
2.82k
            f16[j + 1] = palette[i1];
265
2.82k
            ++ui1;
266
2.82k
          }
267
45
        } else {
268
13
          for (j = 0; j < nI; 
j += 212
)
269
12
          {
270
12
            const uint8_t u0 = *ui1;
271
12
            const int i0 = (int)(u0 >> 4);
272
12
            const int i1 = (int)(u0 & 15);
273
12
            f16[j] = palette[i0];
274
12
            if (j + 1 < nI)
275
11
              f16[j + 1] = palette[i1];
276
12
            ++ui1;
277
12
          }
278
1
        }
279
46
      } parallel_endfor
280
8
    } else if (qbits == 5) {
281
46
      
parallel_for2
(i, num_blocks) {
282
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
283
46
        const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i;
284
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
285
46
        const uint16_t* const palette = (uint16_t*)ui0;
286
46
        const uint8_t* ui1 = ui0 + element_size * 32;
287
46
        uint16_t* const f16 = (uint16_t*)u80;
288
46
        int j;
289
46
        if (nI % 8 == 0)
290
45
        {
291
752
          for (j = 0; j < nI; 
j += 8707
)
292
707
          {
293
707
            const uint8_t u0 = ui1[0];
294
707
            const uint8_t u1 = ui1[1];
295
707
            const uint8_t u2 = ui1[2];
296
707
            const uint8_t u3 = ui1[3];
297
707
            const uint8_t u4 = ui1[4];
298
707
            const int i0 = (int)(u0 >> 3);
299
707
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
300
707
            const int i2 = (int)((u1 >> 1) & 31);
301
707
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
302
707
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
303
707
            const int i5 = (int)((u3 >> 2) & 31);
304
707
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
305
707
            const int i7 = (int)(u4 & 31);
306
707
            f16[j] = palette[i0];
307
707
            f16[j + 1] = palette[i1];
308
707
            f16[j + 2] = palette[i2];
309
707
            f16[j + 3] = palette[i3];
310
707
            f16[j + 4] = palette[i4];
311
707
            f16[j + 5] = palette[i5];
312
707
            f16[j + 6] = palette[i6];
313
707
            f16[j + 7] = palette[i7];
314
707
            ui1 += 5;
315
707
          }
316
45
        } else {
317
4
          for (j = 0; j < nI; 
j += 83
)
318
3
          {
319
3
            const uint8_t u0 = ui1[0];
320
3
            const uint8_t u1 = ui1[1];
321
3
            const uint8_t u2 = ui1[2];
322
3
            const uint8_t u3 = ui1[3];
323
3
            const uint8_t u4 = ui1[4];
324
3
            const int i0 = (int)(u0 >> 3);
325
3
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
326
3
            const int i2 = (int)((u1 >> 1) & 31);
327
3
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
328
3
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
329
3
            const int i5 = (int)((u3 >> 2) & 31);
330
3
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
331
3
            const int i7 = (int)(u4 & 31);
332
3
            f16[j] = palette[i0];
333
3
            if (j + 1 < nI)
334
3
              f16[j + 1] = palette[i1];
335
3
            if (j + 2 < nI)
336
3
              f16[j + 2] = palette[i2];
337
3
            if (j + 3 < nI)
338
3
              f16[j + 3] = palette[i3];
339
3
            if (j + 4 < nI)
340
3
              f16[j + 4] = palette[i4];
341
3
            if (j + 5 < nI)
342
3
              f16[j + 5] = palette[i5];
343
3
            if (j + 6 < nI)
344
3
              f16[j + 6] = palette[i6];
345
3
            if (j + 7 < nI)
346
2
              f16[j + 7] = palette[i7];
347
3
            ui1 += 5;
348
3
          }
349
1
        }
350
46
      } parallel_endfor
351
6
    } else if (qbits == 6) {
352
12
      
parallel_for2
(i, num_blocks) {
353
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
354
12
        const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i;
355
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
356
12
        const uint16_t* const palette = (uint16_t*)ui0;
357
12
        const uint8_t* ui1 = ui0 + element_size * 64;
358
12
        uint16_t* const f16 = (uint16_t*)u80;
359
12
        int j;
360
12
        if (nI % 4 == 0)
361
11
        {
362
1.36k
          for (j = 0; j < nI; 
j += 41.35k
)
363
1.35k
          {
364
1.35k
            const uint8_t u0 = ui1[0];
365
1.35k
            const uint8_t u1 = ui1[1];
366
1.35k
            const uint8_t u2 = ui1[2];
367
1.35k
            const int i0 = (int)(u0 >> 2);
368
1.35k
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
369
1.35k
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
370
1.35k
            const int i3 = (int)(u2 & 63);
371
1.35k
            f16[j] = palette[i0];
372
1.35k
            f16[j + 1] = palette[i1];
373
1.35k
            f16[j + 2] = palette[i2];
374
1.35k
            f16[j + 3] = palette[i3];
375
1.35k
            ui1 += 3;
376
1.35k
          }
377
11
        } else {
378
71
          for (j = 0; j < nI; 
j += 470
)
379
70
          {
380
70
            const uint8_t u0 = ui1[0];
381
70
            const uint8_t u1 = ui1[1];
382
70
            const uint8_t u2 = ui1[2];
383
70
            const int i0 = (int)(u0 >> 2);
384
70
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
385
70
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
386
70
            const int i3 = (int)(u2 & 63);
387
70
            f16[j] = palette[i0];
388
70
            if (j + 1 < nI)
389
70
              f16[j + 1] = palette[i1];
390
70
            if (j + 2 < nI)
391
70
              f16[j + 2] = palette[i2];
392
70
            if (j + 3 < nI)
393
69
              f16[j + 3] = palette[i3];
394
70
            ui1 += 3;
395
70
          }
396
1
        }
397
12
      } parallel_endfor
398
4
    } else if (qbits == 7) {
399
12
      
parallel_for2
(i, num_blocks) {
400
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
401
12
        const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i;
402
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
403
12
        const uint16_t* const palette = (uint16_t*)ui0;
404
12
        const uint8_t* ui1 = ui0 + element_size * 128;
405
12
        uint16_t* const f16 = (uint16_t*)u80;
406
12
        int j;
407
12
        if (nI % 8 == 0)
408
11
        {
409
686
          for (j = 0; j < nI; 
j += 8675
)
410
675
          {
411
675
            const uint8_t u0 = ui1[0];
412
675
            const uint8_t u1 = ui1[1];
413
675
            const uint8_t u2 = ui1[2];
414
675
            const uint8_t u3 = ui1[3];
415
675
            const uint8_t u4 = ui1[4];
416
675
            const uint8_t u5 = ui1[5];
417
675
            const uint8_t u6 = ui1[6];
418
675
            const int i0 = (int)(u0 >> 1);
419
675
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
420
675
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
421
675
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
422
675
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
423
675
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
424
675
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
425
675
            const int i7 = (int)(u6 & 127);
426
675
            f16[j] = palette[i0];
427
675
            f16[j + 1] = palette[i1];
428
675
            f16[j + 2] = palette[i2];
429
675
            f16[j + 3] = palette[i3];
430
675
            f16[j + 4] = palette[i4];
431
675
            f16[j + 5] = palette[i5];
432
675
            f16[j + 6] = palette[i6];
433
675
            f16[j + 7] = palette[i7];
434
675
            ui1 += 7;
435
675
          }
436
11
        } else {
437
36
          for (j = 0; j < nI; 
j += 835
)
438
35
          {
439
35
            const uint8_t u0 = ui1[0];
440
35
            const uint8_t u1 = ui1[1];
441
35
            const uint8_t u2 = ui1[2];
442
35
            const uint8_t u3 = ui1[3];
443
35
            const uint8_t u4 = ui1[4];
444
35
            const uint8_t u5 = ui1[5];
445
35
            const uint8_t u6 = ui1[6];
446
35
            const int i0 = (int)(u0 >> 1);
447
35
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
448
35
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
449
35
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
450
35
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
451
35
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
452
35
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
453
35
            const int i7 = (int)(u6 & 127);
454
35
            f16[j] = palette[i0];
455
35
            if (j + 1 < nI)
456
35
              f16[j + 1] = palette[i1];
457
35
            if (j + 2 < nI)
458
35
              f16[j + 2] = palette[i2];
459
35
            if (j + 3 < nI)
460
35
              f16[j + 3] = palette[i3];
461
35
            if (j + 4 < nI)
462
35
              f16[j + 4] = palette[i4];
463
35
            if (j + 5 < nI)
464
35
              f16[j + 5] = palette[i5];
465
35
            if (j + 6 < nI)
466
35
              f16[j + 6] = palette[i6];
467
35
            if (j + 7 < nI)
468
34
              f16[j + 7] = palette[i7];
469
35
            ui1 += 7;
470
35
          }
471
1
        }
472
12
      } parallel_endfor
473
2
    } else {
474
6
      
parallel_for2
(i, num_blocks) {
475
6
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
476
6
        const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i;
477
6
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
478
6
        const uint16_t* const palette = (uint16_t*)ui0;
479
6
        const uint8_t* ui1 = ui0 + element_size * 256;
480
6
        uint16_t* const f16 = (uint16_t*)u80;
481
6
        int j;
482
5.68k
        for (j = 0; j < nI; 
j++5.67k
)
483
5.67k
        {
484
5.67k
          const uint8_t u0 = *ui1;
485
5.67k
          f16[j] = palette[u0];
486
5.67k
          ++ui1;
487
5.67k
        }
488
6
      } parallel_endfor
489
2
    }
490
20
  } else if (datatype == CCV_32F) {
491
10
    if (qbits == 4)
492
2
    {
493
46
      
parallel_for2
(i, num_blocks) {
494
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
495
46
        const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i;
496
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
497
46
        const float* const palette = (float*)ui0;
498
46
        const uint8_t* ui1 = ui0 + element_size * 16;
499
46
        float* const f32 = (float*)u80;
500
46
        int j;
501
46
        if (nI % 2 == 0)
502
45
        {
503
2.87k
          for (j = 0; j < nI; 
j += 22.82k
)
504
2.82k
          {
505
2.82k
            const uint8_t u0 = *ui1;
506
2.82k
            const int i0 = (int)(u0 >> 4);
507
2.82k
            const int i1 = (int)(u0 & 15);
508
2.82k
            f32[j] = palette[i0];
509
2.82k
            f32[j + 1] = palette[i1];
510
2.82k
            ++ui1;
511
2.82k
          }
512
45
        } else {
513
13
          for (j = 0; j < nI; 
j += 212
)
514
12
          {
515
12
            const uint8_t u0 = *ui1;
516
12
            const int i0 = (int)(u0 >> 4);
517
12
            const int i1 = (int)(u0 & 15);
518
12
            f32[j] = palette[i0];
519
12
            if (j + 1 < nI)
520
11
              f32[j + 1] = palette[i1];
521
12
            ++ui1;
522
12
          }
523
1
        }
524
46
      } parallel_endfor
525
8
    } else if (qbits == 5) {
526
46
      
parallel_for2
(i, num_blocks) {
527
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
528
46
        const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i;
529
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
530
46
        const float* const palette = (float*)ui0;
531
46
        const uint8_t* ui1 = ui0 + element_size * 32;
532
46
        float* const f32 = (float*)u80;
533
46
        int j;
534
46
        if (nI % 8 == 0)
535
45
        {
536
752
          for (j = 0; j < nI; 
j += 8707
)
537
707
          {
538
707
            const uint8_t u0 = ui1[0];
539
707
            const uint8_t u1 = ui1[1];
540
707
            const uint8_t u2 = ui1[2];
541
707
            const uint8_t u3 = ui1[3];
542
707
            const uint8_t u4 = ui1[4];
543
707
            const int i0 = (int)(u0 >> 3);
544
707
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
545
707
            const int i2 = (int)((u1 >> 1) & 31);
546
707
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
547
707
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
548
707
            const int i5 = (int)((u3 >> 2) & 31);
549
707
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
550
707
            const int i7 = (int)(u4 & 31);
551
707
            f32[j] = palette[i0];
552
707
            f32[j + 1] = palette[i1];
553
707
            f32[j + 2] = palette[i2];
554
707
            f32[j + 3] = palette[i3];
555
707
            f32[j + 4] = palette[i4];
556
707
            f32[j + 5] = palette[i5];
557
707
            f32[j + 6] = palette[i6];
558
707
            f32[j + 7] = palette[i7];
559
707
            ui1 += 5;
560
707
          }
561
45
        } else {
562
4
          for (j = 0; j < nI; 
j += 83
)
563
3
          {
564
3
            const uint8_t u0 = ui1[0];
565
3
            const uint8_t u1 = ui1[1];
566
3
            const uint8_t u2 = ui1[2];
567
3
            const uint8_t u3 = ui1[3];
568
3
            const uint8_t u4 = ui1[4];
569
3
            const int i0 = (int)(u0 >> 3);
570
3
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
571
3
            const int i2 = (int)((u1 >> 1) & 31);
572
3
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
573
3
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
574
3
            const int i5 = (int)((u3 >> 2) & 31);
575
3
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
576
3
            const int i7 = (int)(u4 & 31);
577
3
            f32[j] = palette[i0];
578
3
            if (j + 1 < nI)
579
3
              f32[j + 1] = palette[i1];
580
3
            if (j + 2 < nI)
581
3
              f32[j + 2] = palette[i2];
582
3
            if (j + 3 < nI)
583
3
              f32[j + 3] = palette[i3];
584
3
            if (j + 4 < nI)
585
3
              f32[j + 4] = palette[i4];
586
3
            if (j + 5 < nI)
587
3
              f32[j + 5] = palette[i5];
588
3
            if (j + 6 < nI)
589
3
              f32[j + 6] = palette[i6];
590
3
            if (j + 7 < nI)
591
2
              f32[j + 7] = palette[i7];
592
3
            ui1 += 5;
593
3
          }
594
1
        }
595
46
      } parallel_endfor
596
6
    } else if (qbits == 6) {
597
12
      
parallel_for2
(i, num_blocks) {
598
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
599
12
        const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i;
600
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
601
12
        const float* const palette = (float*)ui0;
602
12
        const uint8_t* ui1 = ui0 + element_size * 64;
603
12
        float* const f32 = (float*)u80;
604
12
        int j;
605
12
        if (nI % 4 == 0)
606
11
        {
607
1.36k
          for (j = 0; j < nI; 
j += 41.35k
)
608
1.35k
          {
609
1.35k
            const uint8_t u0 = ui1[0];
610
1.35k
            const uint8_t u1 = ui1[1];
611
1.35k
            const uint8_t u2 = ui1[2];
612
1.35k
            const int i0 = (int)(u0 >> 2);
613
1.35k
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
614
1.35k
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
615
1.35k
            const int i3 = (int)(u2 & 63);
616
1.35k
            f32[j] = palette[i0];
617
1.35k
            f32[j + 1] = palette[i1];
618
1.35k
            f32[j + 2] = palette[i2];
619
1.35k
            f32[j + 3] = palette[i3];
620
1.35k
            ui1 += 3;
621
1.35k
          }
622
11
        } else {
623
71
          for (j = 0; j < nI; 
j += 470
)
624
70
          {
625
70
            const uint8_t u0 = ui1[0];
626
70
            const uint8_t u1 = ui1[1];
627
70
            const uint8_t u2 = ui1[2];
628
70
            const int i0 = (int)(u0 >> 2);
629
70
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
630
70
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
631
70
            const int i3 = (int)(u2 & 63);
632
70
            f32[j] = palette[i0];
633
70
            if (j + 1 < nI)
634
70
              f32[j + 1] = palette[i1];
635
70
            if (j + 2 < nI)
636
70
              f32[j + 2] = palette[i2];
637
70
            if (j + 3 < nI)
638
69
              f32[j + 3] = palette[i3];
639
70
            ui1 += 3;
640
70
          }
641
1
        }
642
12
      } parallel_endfor
643
4
    } else if (qbits == 7) {
644
12
      
parallel_for2
(i, num_blocks) {
645
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
646
12
        const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i;
647
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
648
12
        const float* const palette = (float*)ui0;
649
12
        const uint8_t* ui1 = ui0 + element_size * 128;
650
12
        float* const f32 = (float*)u80;
651
12
        int j;
652
12
        if (nI % 8 == 0)
653
11
        {
654
686
          for (j = 0; j < nI; 
j += 8675
)
655
675
          {
656
675
            const uint8_t u0 = ui1[0];
657
675
            const uint8_t u1 = ui1[1];
658
675
            const uint8_t u2 = ui1[2];
659
675
            const uint8_t u3 = ui1[3];
660
675
            const uint8_t u4 = ui1[4];
661
675
            const uint8_t u5 = ui1[5];
662
675
            const uint8_t u6 = ui1[6];
663
675
            const int i0 = (int)(u0 >> 1);
664
675
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
665
675
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
666
675
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
667
675
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
668
675
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
669
675
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
670
675
            const int i7 = (int)(u6 & 127);
671
675
            f32[j] = palette[i0];
672
675
            f32[j + 1] = palette[i1];
673
675
            f32[j + 2] = palette[i2];
674
675
            f32[j + 3] = palette[i3];
675
675
            f32[j + 4] = palette[i4];
676
675
            f32[j + 5] = palette[i5];
677
675
            f32[j + 6] = palette[i6];
678
675
            f32[j + 7] = palette[i7];
679
675
            ui1 += 7;
680
675
          }
681
11
        } else {
682
36
          for (j = 0; j < nI; 
j += 835
)
683
35
          {
684
35
            const uint8_t u0 = ui1[0];
685
35
            const uint8_t u1 = ui1[1];
686
35
            const uint8_t u2 = ui1[2];
687
35
            const uint8_t u3 = ui1[3];
688
35
            const uint8_t u4 = ui1[4];
689
35
            const uint8_t u5 = ui1[5];
690
35
            const uint8_t u6 = ui1[6];
691
35
            const int i0 = (int)(u0 >> 1);
692
35
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
693
35
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
694
35
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
695
35
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
696
35
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
697
35
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
698
35
            const int i7 = (int)(u6 & 127);
699
35
            f32[j] = palette[i0];
700
35
            if (j + 1 < nI)
701
35
              f32[j + 1] = palette[i1];
702
35
            if (j + 2 < nI)
703
35
              f32[j + 2] = palette[i2];
704
35
            if (j + 3 < nI)
705
35
              f32[j + 3] = palette[i3];
706
35
            if (j + 4 < nI)
707
35
              f32[j + 4] = palette[i4];
708
35
            if (j + 5 < nI)
709
35
              f32[j + 5] = palette[i5];
710
35
            if (j + 6 < nI)
711
35
              f32[j + 6] = palette[i6];
712
35
            if (j + 7 < nI)
713
34
              f32[j + 7] = palette[i7];
714
35
            ui1 += 7;
715
35
          }
716
1
        }
717
12
      } parallel_endfor
718
2
    } else {
719
6
      
parallel_for2
(i, num_blocks) {
720
6
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
721
6
        const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i;
722
6
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
723
6
        const float* const palette = (float*)ui0;
724
6
        const uint8_t* ui1 = ui0 + element_size * 256;
725
6
        float* const f32 = (float*)u80;
726
6
        int j;
727
5.68k
        for (j = 0; j < nI; 
j++5.67k
)
728
5.67k
        {
729
5.67k
          const uint8_t u0 = *ui1;
730
5.67k
          f32[j] = palette[u0];
731
5.67k
          ++ui1;
732
5.67k
        }
733
6
      } parallel_endfor
734
2
    }
735
10
  } else {
736
10
    if (qbits == 4)
737
2
    {
738
46
      
parallel_for2
(i, num_blocks) {
739
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
740
46
        const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i;
741
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
742
46
        const double* const palette = (double*)ui0;
743
46
        const uint8_t* ui1 = ui0 + element_size * 16;
744
46
        double* const f64 = (double*)u80;
745
46
        int j;
746
46
        if (nI % 2 == 0)
747
45
        {
748
2.87k
          for (j = 0; j < nI; 
j += 22.82k
)
749
2.82k
          {
750
2.82k
            const uint8_t u0 = *ui1;
751
2.82k
            const int i0 = (int)(u0 >> 4);
752
2.82k
            const int i1 = (int)(u0 & 15);
753
2.82k
            f64[j] = palette[i0];
754
2.82k
            f64[j + 1] = palette[i1];
755
2.82k
            ++ui1;
756
2.82k
          }
757
45
        } else {
758
13
          for (j = 0; j < nI; 
j += 212
)
759
12
          {
760
12
            const uint8_t u0 = *ui1;
761
12
            const int i0 = (int)(u0 >> 4);
762
12
            const int i1 = (int)(u0 & 15);
763
12
            f64[j] = palette[i0];
764
12
            if (j + 1 < nI)
765
11
              f64[j + 1] = palette[i1];
766
12
            ++ui1;
767
12
          }
768
1
        }
769
46
      } parallel_endfor
770
8
    } else if (qbits == 5) {
771
46
      
parallel_for2
(i, num_blocks) {
772
46
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
773
46
        const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i;
774
46
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
775
46
        const double* const palette = (double*)ui0;
776
46
        const uint8_t* ui1 = ui0 + element_size * 32;
777
46
        double* const f64 = (double*)u80;
778
46
        int j;
779
46
        if (nI % 8 == 0)
780
45
        {
781
752
          for (j = 0; j < nI; 
j += 8707
)
782
707
          {
783
707
            const uint8_t u0 = ui1[0];
784
707
            const uint8_t u1 = ui1[1];
785
707
            const uint8_t u2 = ui1[2];
786
707
            const uint8_t u3 = ui1[3];
787
707
            const uint8_t u4 = ui1[4];
788
707
            const int i0 = (int)(u0 >> 3);
789
707
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
790
707
            const int i2 = (int)((u1 >> 1) & 31);
791
707
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
792
707
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
793
707
            const int i5 = (int)((u3 >> 2) & 31);
794
707
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
795
707
            const int i7 = (int)(u4 & 31);
796
707
            f64[j] = palette[i0];
797
707
            f64[j + 1] = palette[i1];
798
707
            f64[j + 2] = palette[i2];
799
707
            f64[j + 3] = palette[i3];
800
707
            f64[j + 4] = palette[i4];
801
707
            f64[j + 5] = palette[i5];
802
707
            f64[j + 6] = palette[i6];
803
707
            f64[j + 7] = palette[i7];
804
707
            ui1 += 5;
805
707
          }
806
45
        } else {
807
4
          for (j = 0; j < nI; 
j += 83
)
808
3
          {
809
3
            const uint8_t u0 = ui1[0];
810
3
            const uint8_t u1 = ui1[1];
811
3
            const uint8_t u2 = ui1[2];
812
3
            const uint8_t u3 = ui1[3];
813
3
            const uint8_t u4 = ui1[4];
814
3
            const int i0 = (int)(u0 >> 3);
815
3
            const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6));
816
3
            const int i2 = (int)((u1 >> 1) & 31);
817
3
            const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4));
818
3
            const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7));
819
3
            const int i5 = (int)((u3 >> 2) & 31);
820
3
            const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5));
821
3
            const int i7 = (int)(u4 & 31);
822
3
            f64[j] = palette[i0];
823
3
            if (j + 1 < nI)
824
3
              f64[j + 1] = palette[i1];
825
3
            if (j + 2 < nI)
826
3
              f64[j + 2] = palette[i2];
827
3
            if (j + 3 < nI)
828
3
              f64[j + 3] = palette[i3];
829
3
            if (j + 4 < nI)
830
3
              f64[j + 4] = palette[i4];
831
3
            if (j + 5 < nI)
832
3
              f64[j + 5] = palette[i5];
833
3
            if (j + 6 < nI)
834
3
              f64[j + 6] = palette[i6];
835
3
            if (j + 7 < nI)
836
2
              f64[j + 7] = palette[i7];
837
3
            ui1 += 5;
838
3
          }
839
1
        }
840
46
      } parallel_endfor
841
6
    } else if (qbits == 6) {
842
12
      
parallel_for2
(i, num_blocks) {
843
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
844
12
        const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i;
845
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
846
12
        const double* const palette = (double*)ui0;
847
12
        const uint8_t* ui1 = ui0 + element_size * 64;
848
12
        double* const f64 = (double*)u80;
849
12
        int j;
850
12
        if (nI % 4 == 0)
851
11
        {
852
1.36k
          for (j = 0; j < nI; 
j += 41.35k
)
853
1.35k
          {
854
1.35k
            const uint8_t u0 = ui1[0];
855
1.35k
            const uint8_t u1 = ui1[1];
856
1.35k
            const uint8_t u2 = ui1[2];
857
1.35k
            const int i0 = (int)(u0 >> 2);
858
1.35k
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
859
1.35k
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
860
1.35k
            const int i3 = (int)(u2 & 63);
861
1.35k
            f64[j] = palette[i0];
862
1.35k
            f64[j + 1] = palette[i1];
863
1.35k
            f64[j + 2] = palette[i2];
864
1.35k
            f64[j + 3] = palette[i3];
865
1.35k
            ui1 += 3;
866
1.35k
          }
867
11
        } else {
868
71
          for (j = 0; j < nI; 
j += 470
)
869
70
          {
870
70
            const uint8_t u0 = ui1[0];
871
70
            const uint8_t u1 = ui1[1];
872
70
            const uint8_t u2 = ui1[2];
873
70
            const int i0 = (int)(u0 >> 2);
874
70
            const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4));
875
70
            const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6));
876
70
            const int i3 = (int)(u2 & 63);
877
70
            f64[j] = palette[i0];
878
70
            if (j + 1 < nI)
879
70
              f64[j + 1] = palette[i1];
880
70
            if (j + 2 < nI)
881
70
              f64[j + 2] = palette[i2];
882
70
            if (j + 3 < nI)
883
69
              f64[j + 3] = palette[i3];
884
70
            ui1 += 3;
885
70
          }
886
1
        }
887
12
      } parallel_endfor
888
4
    } else if (qbits == 7) {
889
12
      
parallel_for2
(i, num_blocks) {
890
12
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
891
12
        const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i;
892
12
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
893
12
        const double* const palette = (double*)ui0;
894
12
        const uint8_t* ui1 = ui0 + element_size * 128;
895
12
        double* const f64 = (double*)u80;
896
12
        int j;
897
12
        if (nI % 8 == 0)
898
11
        {
899
686
          for (j = 0; j < nI; 
j += 8675
)
900
675
          {
901
675
            const uint8_t u0 = ui1[0];
902
675
            const uint8_t u1 = ui1[1];
903
675
            const uint8_t u2 = ui1[2];
904
675
            const uint8_t u3 = ui1[3];
905
675
            const uint8_t u4 = ui1[4];
906
675
            const uint8_t u5 = ui1[5];
907
675
            const uint8_t u6 = ui1[6];
908
675
            const int i0 = (int)(u0 >> 1);
909
675
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
910
675
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
911
675
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
912
675
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
913
675
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
914
675
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
915
675
            const int i7 = (int)(u6 & 127);
916
675
            f64[j] = palette[i0];
917
675
            f64[j + 1] = palette[i1];
918
675
            f64[j + 2] = palette[i2];
919
675
            f64[j + 3] = palette[i3];
920
675
            f64[j + 4] = palette[i4];
921
675
            f64[j + 5] = palette[i5];
922
675
            f64[j + 6] = palette[i6];
923
675
            f64[j + 7] = palette[i7];
924
675
            ui1 += 7;
925
675
          }
926
11
        } else {
927
36
          for (j = 0; j < nI; 
j += 835
)
928
35
          {
929
35
            const uint8_t u0 = ui1[0];
930
35
            const uint8_t u1 = ui1[1];
931
35
            const uint8_t u2 = ui1[2];
932
35
            const uint8_t u3 = ui1[3];
933
35
            const uint8_t u4 = ui1[4];
934
35
            const uint8_t u5 = ui1[5];
935
35
            const uint8_t u6 = ui1[6];
936
35
            const int i0 = (int)(u0 >> 1);
937
35
            const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2));
938
35
            const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3));
939
35
            const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4));
940
35
            const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5));
941
35
            const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6));
942
35
            const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7));
943
35
            const int i7 = (int)(u6 & 127);
944
35
            f64[j] = palette[i0];
945
35
            if (j + 1 < nI)
946
35
              f64[j + 1] = palette[i1];
947
35
            if (j + 2 < nI)
948
35
              f64[j + 2] = palette[i2];
949
35
            if (j + 3 < nI)
950
35
              f64[j + 3] = palette[i3];
951
35
            if (j + 4 < nI)
952
35
              f64[j + 4] = palette[i4];
953
35
            if (j + 5 < nI)
954
35
              f64[j + 5] = palette[i5];
955
35
            if (j + 6 < nI)
956
35
              f64[j + 6] = palette[i6];
957
35
            if (j + 7 < nI)
958
34
              f64[j + 7] = palette[i7];
959
35
            ui1 += 7;
960
35
          }
961
1
        }
962
12
      } parallel_endfor
963
2
    } else {
964
6
      
parallel_for2
(i, num_blocks) {
965
6
        const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks);
966
6
        const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i;
967
6
        uint8_t* const u80 = u8 + element_size * number_in_blocks * i;
968
6
        const double* const palette = (double*)ui0;
969
6
        const uint8_t* ui1 = ui0 + element_size * 256;
970
6
        double* const f64 = (double*)u80;
971
6
        int j;
972
5.68k
        for (j = 0; j < nI; 
j++5.67k
)
973
5.67k
        {
974
5.67k
          const uint8_t u0 = *ui1;
975
5.67k
          f64[j] = palette[u0];
976
5.67k
          ++ui1;
977
5.67k
        }
978
6
      } parallel_endfor
979
2
    }
980
10
  }
981
30
}
982
983
void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length)
984
60
{
985
60
  assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY);
986
60
  if (memory_type == CCV_TENSOR_CPU_MEMORY)
987
30
    _ccv_nnc_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length);
988
30
  else {
989
30
#ifdef HAVE_CUDA
990
30
    ccv_nnc_compat_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0);
991
#elif defined(HAVE_MPS)
992
    ccv_nnc_mps_depalettize(input, datatype, input_length, qbits, number_in_blocks, output, output_length, 0);
993
#else
994
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
995
#endif
996
30
  }
997
60
}
998
999
/**
 * Quantize a floating-point buffer to signed 8-bit integers, one scale per row.
 *
 * Output layout, as written by this function:
 *   [0, input_length)                          one int8 per input element;
 *   [input_length, scale_offset)               zero padding, where scale_offset is
 *                                              input_length rounded up to a multiple of 128;
 *   [scale_offset, scale_offset + scale_size)  one scale per row, stored in `datatype`.
 *
 * For every row the scale is max(|v|) / 127 and each element is stored as
 * lrint(v / scale) clamped to [-127, 127]. A row whose scale is zero is stored
 * as all zeros.
 *
 * @param input         Source values; input_length elements of `datatype`.
 * @param datatype      CCV_16F, CCV_16BF, CCV_32F or CCV_64F (asserted).
 * @param memory_type   Must be CCV_TENSOR_CPU_MEMORY (asserted).
 * @param input_length  Number of source elements; must be a multiple of row_length (asserted).
 * @param row_length    Elements per row; must be positive (asserted).
 * @param output        Destination buffer.
 * @param output_length Capacity of output in bytes; must be at least the returned size (asserted).
 * @return Total number of bytes occupied in output: scale_offset + scale_size.
 */
CCV_WARN_UNUSED(size_t) ccv_nnc_quantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, void* output, const size_t output_length)
{
  assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F);
  assert(memory_type == CCV_TENSOR_CPU_MEMORY);
  assert(row_length > 0);
  assert(input_length % row_length == 0);
  const size_t row_count = input_length / row_length;
  // The scale table starts at the first multiple of 128 bytes at or after the int8 payload.
  const size_t scale_offset = (input_length + 127) & -128;
  const size_t scale_size = row_count * CCV_GET_DATA_TYPE_SIZE(datatype);
  assert(output_length >= scale_offset + scale_size);
  int8_t* const q = (int8_t*)output;
  uint8_t* const u8 = (uint8_t*)output;
  // The returned length spans the alignment gap between the payload and the
  // scales. Nothing below writes to it, so clear it here: every byte reported
  // as produced is then deterministic rather than whatever the buffer held.
  if (scale_offset > input_length)
    memset(u8 + input_length, 0, scale_offset - input_length);
  if (datatype == CCV_16F)
  {
    const uint16_t* const f16 = (const uint16_t*)input;
    uint16_t* const scales = (uint16_t*)(u8 + scale_offset);
    parallel_for(i, (int)row_count) {
      const size_t row_start = (size_t)i * row_length;
      // Pass 1: largest magnitude in the row, accumulated in double.
      double max_abs = 0;
      size_t j;
      for (j = 0; j < row_length; j++)
      {
        float v;
        ccv_half_precision_to_float(f16 + row_start + j, &v, 1);
        max_abs = ccv_max(max_abs, fabs(v));
      }
      // The scale is stored as half precision, while the division below uses
      // the float value from before that conversion.
      const float scale_f = (float)(max_abs / 127.);
      ccv_float_to_half_precision(&scale_f, scales + i, 1);
      if (scale_f == 0)
        memset(q + row_start, 0, row_length);
      else {
        // Pass 2: scale, round to nearest integer and clamp to the symmetric int8 range.
        const double inv_scale = 1. / scale_f;
        for (j = 0; j < row_length; j++)
        {
          float v;
          ccv_half_precision_to_float(f16 + row_start + j, &v, 1);
          const int iv = (int)lrint(v * inv_scale);
          q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127);
        }
      }
    } parallel_endfor
  } else if (datatype == CCV_16BF) {
    const uint16_t* const bf16 = (const uint16_t*)input;
    uint16_t* const scales = (uint16_t*)(u8 + scale_offset);
    parallel_for(i, (int)row_count) {
      const size_t row_start = (size_t)i * row_length;
      double max_abs = 0;
      size_t j;
      for (j = 0; j < row_length; j++)
      {
        float v;
        ccv_bfloat_to_float(bf16 + row_start + j, &v, 1);
        max_abs = ccv_max(max_abs, fabs(v));
      }
      // Same pattern as the half-precision branch: the stored scale is the
      // bfloat conversion of scale_f, the divisor is scale_f itself.
      const float scale_f = (float)(max_abs / 127.);
      ccv_float_to_bfloat(&scale_f, scales + i, 1);
      if (scale_f == 0)
        memset(q + row_start, 0, row_length);
      else {
        const double inv_scale = 1. / scale_f;
        for (j = 0; j < row_length; j++)
        {
          float v;
          ccv_bfloat_to_float(bf16 + row_start + j, &v, 1);
          const int iv = (int)lrint(v * inv_scale);
          q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127);
        }
      }
    } parallel_endfor
  } else if (datatype == CCV_32F) {
    const float* const f32 = (const float*)input;
    float* const scales = (float*)(u8 + scale_offset);
    parallel_for(i, (int)row_count) {
      const size_t row_start = (size_t)i * row_length;
      double max_abs = 0;
      size_t j;
      for (j = 0; j < row_length; j++)
        max_abs = ccv_max(max_abs, fabs(f32[row_start + j]));
      scales[i] = (float)(max_abs / 127.);
      if (scales[i] == 0)
        memset(q + row_start, 0, row_length);
      else {
        // Divide by the scale exactly as stored so that it matches what a reader of the table sees.
        const double inv_scale = 1. / scales[i];
        for (j = 0; j < row_length; j++)
        {
          const int iv = (int)lrint(f32[row_start + j] * inv_scale);
          q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127);
        }
      }
    } parallel_endfor
  } else {
    assert(datatype == CCV_64F);
    const double* const f64 = (const double*)input;
    double* const scales = (double*)(u8 + scale_offset);
    parallel_for(i, (int)row_count) {
      const size_t row_start = (size_t)i * row_length;
      double max_abs = 0;
      size_t j;
      for (j = 0; j < row_length; j++)
        max_abs = ccv_max(max_abs, fabs(f64[row_start + j]));
      scales[i] = max_abs / 127.;
      if (scales[i] == 0)
        memset(q + row_start, 0, row_length);
      else {
        const double inv_scale = 1. / scales[i];
        for (j = 0; j < row_length; j++)
        {
          const int iv = (int)lrint(f64[row_start + j] * inv_scale);
          q[row_start + j] = (int8_t)ccv_clamp(iv, -127, 127);
        }
      }
    } parallel_endfor
  }
  return scale_offset + scale_size;
}
1114
1115
void ccv_nnc_dequantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, void* output, const size_t output_length)
1116
5
{
1117
5
  assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F);
1118
5
  assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY);
1119
5
  assert(row_length > 0);
1120
5
  assert(output_length % row_length == 0);
1121
5
  if (memory_type != CCV_TENSOR_CPU_MEMORY)
1122
1
  {
1123
1
#ifdef HAVE_CUDA
1124
1
    ccv_nnc_compat_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0);
1125
#elif defined(HAVE_MPS)
1126
    assert(datatype != CCV_64F);
1127
    ccv_nnc_mps_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0);
1128
#else
1129
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1130
#endif
1131
1
    return;
1132
1
  }
1133
4
  const size_t row_count = output_length / row_length;
1134
4
  const size_t scale_offset = (output_length + 127) & -128;
1135
4
  const size_t scale_size = row_count * CCV_GET_DATA_TYPE_SIZE(datatype);
1136
4
  assert(input_length >= scale_offset + scale_size);
1137
4
  const int8_t* const q = (const int8_t*)input;
1138
4
  const uint8_t* const u8 = (const uint8_t*)input;
1139
4
  if (datatype == CCV_16F)
1140
0
  {
1141
0
    uint16_t* const f16 = (uint16_t*)output;
1142
0
    const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset);
1143
0
    parallel_for(i, (int)row_count) {
1144
0
      const size_t row_start = (size_t)i * row_length;
1145
0
      float scale_f;
1146
0
      ccv_half_precision_to_float(scales + i, &scale_f, 1);
1147
0
      size_t j;
1148
0
      for (j = 0; j < row_length; j++)
1149
0
      {
1150
0
        const float v = q[row_start + j] * scale_f;
1151
0
        ccv_float_to_half_precision(&v, f16 + row_start + j, 1);
1152
0
      }
1153
0
    } parallel_endfor
1154
4
  } else if (datatype == CCV_16BF) {
1155
1
    uint16_t* const bf16 = (uint16_t*)output;
1156
1
    const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset);
1157
4
    
parallel_for1
(i, (int)row_count) {
1158
4
      const size_t row_start = (size_t)i * row_length;
1159
4
      float scale_f;
1160
4
      ccv_bfloat_to_float(scales + i, &scale_f, 1);
1161
4
      size_t j;
1162
36
      for (j = 0; j < row_length; 
j++32
)
1163
32
      {
1164
32
        const float v = q[row_start + j] * scale_f;
1165
32
        ccv_float_to_bfloat(&v, bf16 + row_start + j, 1);
1166
32
      }
1167
4
    } parallel_endfor
1168
3
  } else if (datatype == CCV_32F) {
1169
3
    float* const f32 = (float*)output;
1170
3
    const float* const scales = (const float*)(u8 + scale_offset);
1171
8
    
parallel_for3
(i, (int)row_count) {
1172
8
      const size_t row_start = (size_t)i * row_length;
1173
8
      const float scale = scales[i];
1174
8
      size_t j;
1175
52
      for (j = 0; j < row_length; 
j++44
)
1176
44
        f32[row_start + j] = q[row_start + j] * scale;
1177
8
    } parallel_endfor
1178
3
  } else {
1179
0
    assert(datatype == CCV_64F);
1180
0
    double* const f64 = (double*)output;
1181
0
    const double* const scales = (const double*)(u8 + scale_offset);
1182
0
    parallel_for(i, (int)row_count) {
1183
0
      const size_t row_start = (size_t)i * row_length;
1184
0
      const double scale = scales[i];
1185
0
      size_t j;
1186
0
      for (j = 0; j < row_length; j++)
1187
0
        f64[row_start + j] = q[row_start + j] * scale;
1188
0
    } parallel_endfor
1189
0
  }
1190
4
}