Bug Summary

File: nnc/ccv_nnc_8i_rowwise.c
Warning: line 1231, column 38
The left operand of '/' is a garbage value

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_8i_rowwise.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-06-03-154855-3528769-1 -x 
c ccv_nnc_8i_rowwise.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include <float.h>
4#include "ccv_nnc_8i_rowwise_packed_grids.inc"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10
11static int _ccv_nnc_8i_rowwise_x_group_size(const int format)
12{
13 switch (format)
14 {
15 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
16 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
17 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
18 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
19 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
20 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
21 return 16;
22 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
23 return 32;
24 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
25 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
26 return 8;
27 default:
28 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 28, __extension__ __PRETTY_FUNCTION__
); }))
;
29 return 0;
30 }
31}
32
33static int _ccv_nnc_8i_rowwise_x_group_bits(const int format)
34{
35 switch (format)
36 {
37 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
38 return 88;
39 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
40 return 72;
41 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
42 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
43 return 56;
44 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
45 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
46 return 42;
47 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
48 return 21;
49 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
50 return 28;
51 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
52 return 64;
53 default:
54 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 54, __extension__ __PRETTY_FUNCTION__
); }))
;
55 return 0;
56 }
57}
58
/**
 * Byte offset at which the per-row scales start, i.e. the size of the packed
 * payload rounded up to a multiple of 128 bytes.
 * @param format Row-wise 8-bit format; determines group size and bits per group.
 * @param input_length Total number of elements; must be a multiple of row_length.
 * @param row_length Number of elements per row; must be non-zero.
 * @return Payload size in bytes, rounded up to the next multiple of 128.
 */
static size_t _ccv_nnc_8i_rowwise_packed_scale_offset(const int format, const size_t input_length, const size_t row_length)
{
	assert(row_length > 0);
	assert(input_length % row_length == 0);
	const size_t row_count = input_length / row_length;
	const size_t group_size = _ccv_nnc_8i_rowwise_x_group_size(format);
	// A partially filled trailing group still occupies a whole group.
	const size_t groups_per_row = (row_length + group_size - 1) / group_size;
	const size_t group_bits = _ccv_nnc_8i_rowwise_x_group_bits(format);
	// Bits to bytes, rounding up.
	const size_t payload_size = (row_count * groups_per_row * group_bits + 7) / 8;
	// Same mask as the original "& -128" once converted to size_t.
	return (payload_size + 127) & ~(size_t)127;
}
70
71CCV_WARN_UNUSED(size_t)size_t __attribute__((warn_unused_result)) ccv_nnc_8i_rowwise_x_data_size(const int format, const int datatype, const size_t input_length, const size_t row_length)
72{
73 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 73, __extension__ __PRETTY_FUNCTION__
); }))
;
74 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 74, __extension__ __PRETTY_FUNCTION__
); }))
;
75 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 75,
__extension__ __PRETTY_FUNCTION__); }))
;
76 const size_t row_count = input_length / row_length;
77 const size_t scale_offset = _ccv_nnc_8i_rowwise_packed_scale_offset(format, input_length, row_length);
78 return scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]
;
79}
80
/**
 * OR the low `bits` bits of `value` into a byte buffer, least significant bit
 * first, starting at absolute bit position `bit_offset`.
 * Bits are only ever set, never cleared, so the destination range is expected
 * to be zero beforehand if the exact value has to be read back.
 * @param data Destination byte buffer.
 * @param bit_offset Absolute bit index of the first bit to write (bit 0 is the LSB of data[0]).
 * @param value Source bits; anything above bit `bits - 1` is ignored.
 * @param bits Number of bits to write. The shift on a 32-bit value means this must not exceed 32.
 */
static void _ccv_nnc_8i_rowwise_packed_write_bits(uint8_t* const data, const size_t bit_offset, const uint32_t value, const int bits)
{
	int i;
	for (i = 0; i < bits; i++)
		if (value & (1u << i))
			data[(bit_offset + i) >> 3] |= (uint8_t)(1u << ((bit_offset + i) & 7));
}
88
/**
 * Read `bits` bits from a byte buffer, least significant bit first, starting
 * at absolute bit position `bit_offset`.
 * @param data Source byte buffer.
 * @param bit_offset Absolute bit index of the first bit to read (bit 0 is the LSB of data[0]).
 * @param bits Number of bits to read. The shift on a 32-bit value means this must not exceed 32.
 * @return The bits assembled into the low end of a uint32_t; 0 when bits <= 0.
 */
static uint32_t _ccv_nnc_8i_rowwise_packed_read_bits(const uint8_t* const data, const size_t bit_offset, const int bits)
{
	uint32_t value = 0;
	int i;
	for (i = 0; i < bits; i++)
		if (data[(bit_offset + i) >> 3] & (uint8_t)(1u << ((bit_offset + i) & 7)))
			value |= (1u << i);
	return value;
}
98
99static double _ccv_nnc_8i_rowwise_packed_stored_scale(const double scale, const int datatype)
100{
101 if (datatype == CCV_16F)
102 {
103 const float scale_f = (float)scale;
104 uint16_t scale_h;
105 float stored_scale;
106 ccv_float_to_half_precision(&scale_f, &scale_h, 1);
107 ccv_half_precision_to_float(&scale_h, &stored_scale, 1);
108 return stored_scale;
109 } else if (datatype == CCV_16BF) {
110 const float scale_f = (float)scale;
111 uint16_t scale_bf;
112 float stored_scale;
113 ccv_float_to_bfloat(&scale_f, &scale_bf, 1);
114 ccv_bfloat_to_float(&scale_bf, &stored_scale, 1);
115 return stored_scale;
116 } else if (datatype == CCV_32F)
117 return (float)scale;
118 return scale;
119}
120
121static void _ccv_nnc_8i_rowwise_packed_store_scale(uint8_t* const scales, const int datatype, const size_t i, const double scale)
122{
123 if (datatype == CCV_16F)
124 {
125 const float scale_f = (float)scale;
126 ccv_float_to_half_precision(&scale_f, (uint16_t*)scales + i, 1);
127 } else if (datatype == CCV_16BF) {
128 const float scale_f = (float)scale;
129 ccv_float_to_bfloat(&scale_f, (uint16_t*)scales + i, 1);
130 } else if (datatype == CCV_32F)
131 ((float*)scales)[i] = (float)scale;
132 else
133 ((double*)scales)[i] = scale;
134}
135
136static double _ccv_nnc_8i_rowwise_packed_load_scale(const uint8_t* const scales, const int datatype, const size_t i)
137{
138 if (datatype == CCV_16F)
139 {
140 float scale_f;
141 ccv_half_precision_to_float((const uint16_t*)scales + i, &scale_f, 1);
142 return scale_f;
143 } else if (datatype == CCV_16BF) {
144 float scale_f;
145 ccv_bfloat_to_float((const uint16_t*)scales + i, &scale_f, 1);
146 return scale_f;
147 } else if (datatype == CCV_32F)
148 return ((const float*)scales)[i];
149 return ((const double*)scales)[i];
150}
151
152static void _ccv_nnc_8i_rowwise_packed_read_row(const void* const input, const int datatype, const size_t row_start, const size_t row_length, const size_t padded_row_length, double* const row)
153{
154 size_t j;
155 if (datatype
24.1
'datatype' is not equal to CCV_16F
== CCV_16F)
25
Taking false branch
156 {
157 const uint16_t* const f16 = (const uint16_t*)input + row_start;
158 for (j = 0; j < row_length; j++)
159 {
160 float v;
161 ccv_half_precision_to_float(f16 + j, &v, 1);
162 row[j] = v;
163 }
164 } else if (datatype
25.1
'datatype' is not equal to CCV_16BF
== CCV_16BF) {
26
Taking false branch
165 const uint16_t* const bf16 = (const uint16_t*)input + row_start;
166 for (j = 0; j < row_length; j++)
167 {
168 float v;
169 ccv_bfloat_to_float(bf16 + j, &v, 1);
170 row[j] = v;
171 }
172 } else if (datatype
26.1
'datatype' is not equal to CCV_32F
== CCV_32F) {
27
Taking false branch
173 const float* const f32 = (const float*)input + row_start;
174 for (j = 0; j < row_length; j++)
175 row[j] = f32[j];
176 } else {
177 const double* const f64 = (const double*)input + row_start;
178 for (j = 0; j
27.1
'j' is < 'row_length'
< row_length
; j++)
28
Loop condition is true. Entering loop body
29
Assuming 'j' is >= 'row_length'
30
Loop condition is false. Execution continues on line 181
179 row[j] = f64[j];
180 }
181 for (; j < padded_row_length; j++)
31
Assuming 'j' is >= 'padded_row_length'
32
Loop condition is false. Execution continues on line 181
182 row[j] = 0;
183}
184
185static void _ccv_nnc_8i_rowwise_packed_write_value(void* const output, const int datatype, const size_t j, const double v)
186{
187 if (datatype == CCV_16F)
188 {
189 const float v_f = (float)v;
190 ccv_float_to_half_precision(&v_f, (uint16_t*)output + j, 1);
191 } else if (datatype == CCV_16BF) {
192 const float v_f = (float)v;
193 ccv_float_to_bfloat(&v_f, (uint16_t*)output + j, 1);
194 } else if (datatype == CCV_32F)
195 ((float*)output)[j] = (float)v;
196 else
197 ((double*)output)[j] = v;
198}
199
/**
 * Error weight for element j.
 * @param imatrix Optional per-element weights; may be NULL.
 * @param j Element index into imatrix.
 * @return 1 when imatrix is NULL, otherwise imatrix[j] with negatives mapped to 0.
 */
static inline double _ccv_nnc_8i_rowwise_weight(const float* const imatrix, const size_t j)
{
	if (!imatrix)
		return 1.;
	const double v = (double)imatrix[j];
	// Same comparison as ccv_max(v, 0.): anything not strictly greater than 0 yields 0.
	return (v > 0.) ? v : 0.;
}
204
/**
 * Check that an optional importance matrix has a shape usable with the rows.
 * @param imatrix Optional weights; NULL is always valid.
 * @param imatrix_length Number of floats in imatrix.
 * @param row_length Number of elements per row.
 * @param row_count Number of rows being quantized.
 * @return 1 when imatrix is NULL, or when imatrix_length is a non-zero multiple
 *         of row_length and the resulting slice count divides row_count evenly;
 *         0 otherwise.
 */
static inline int _ccv_nnc_8i_rowwise_imatrix_is_valid(const float* const imatrix, const size_t imatrix_length, const size_t row_length, const size_t row_count)
{
	if (!imatrix)
		return 1;
	// Guard the modulo and division below: a zero row length can never match.
	if (row_length == 0)
		return 0;
	if (imatrix_length < row_length || imatrix_length % row_length != 0)
		return 0;
	// imatrix_length >= row_length > 0 here, so there is at least one slice.
	const size_t imatrix_slices = imatrix_length / row_length;
	return row_count % imatrix_slices == 0;
}
214
/**
 * Select the importance-matrix slice that applies to a given row.
 * The matrix holds imatrix_length / row_length slices of row_length floats;
 * consecutive runs of row_count / slices rows share one slice.
 * @param imatrix Optional weights; may be NULL.
 * @param imatrix_length Number of floats in imatrix.
 * @param row_length Number of elements per row (must be non-zero when imatrix is set).
 * @param row_count Total number of rows.
 * @param row_idx Index of the row being processed.
 * @return NULL when imatrix is NULL, imatrix itself when there is a single
 *         slice, otherwise a pointer to the start of the matching slice.
 */
static inline const float* _ccv_nnc_8i_rowwise_imatrix_for_row(const float* const imatrix, const size_t imatrix_length, const size_t row_length, const size_t row_count, const size_t row_idx)
{
	if (!imatrix)
		return 0;
	const size_t imatrix_slices = imatrix_length / row_length;
	if (imatrix_slices == 1)
		return imatrix;
	const size_t rows_per_slice = row_count / imatrix_slices;
	return imatrix + (row_idx / rows_per_slice) * row_length;
}
225
/**
 * Result of quantizing one group. Each quantizer fills only the fields its
 * format uses and leaves the others untouched.
 */
typedef struct {
	int q[32]; // Per-element codes; written by the Q5/Q4/Q3/Q2 searches (first 16 entries).
	int q8[32]; // Per-element reconstructed integer values; written by every quantizer.
	int m; // Step chosen by the Q5/Q4/Q3/Q2 searches.
	int b; // Offset chosen by the Q5/Q4/Q3 searches.
	int z; // Zero offset chosen by the Q2 search.
	int scale; // Scale, or scale code, chosen by the IQ* searches.
	int grid[4]; // Grid indices chosen by the IQ* searches, one per sub-group.
	uint32_t signs; // Sign bits chosen by the IQ* searches.
} ccv_nnc_8i_rowwise_packed_group_t;
236
237static void _ccv_nnc_8i_rowwise_packed_quant_q5(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
238{
239 double best_sse = DBL_MAX1.7976931348623157e+308;
240 int best_q[16] = {0};
241 int best_q8[16] = {0};
242 int best_m = 1, best_b = 0;
243 int m, b, j;
244 for (m = 1; m <= 8; m++)
245 for (b = -16; b <= 15; b++)
246 {
247 if (-16 * m + b < -127 || 15 * m + b > 127)
248 continue;
249 double sse = 0;
250 int q[16];
251 int q8[16];
252 for (j = 0; j < 16; j++)
253 {
254 q[j] = ccv_clamp((int)lrint((y[j] - b) / m), -16, 15)({ typeof (-16) _a = (-16); typeof (15) _b = (15); typeof ((int
)lrint((y[j] - b) / m)) _x = ((int)lrint((y[j] - b) / m)); (_x
< _a) ? _a : ((_x > _b) ? _b : _x); })
;
255 q8[j] = q[j] * m + b;
256 const double d = q8[j] - y[j];
257 sse += w[j] * d * d;
258 }
259 if (sse < best_sse)
260 {
261 best_sse = sse;
262 best_m = m;
263 best_b = b;
264 memcpy(best_q, q, sizeof(best_q));
265 memcpy(best_q8, q8, sizeof(best_q8));
266 }
267 }
268 group->m = best_m;
269 group->b = best_b;
270 memcpy(group->q, best_q, sizeof(best_q));
271 memcpy(group->q8, best_q8, sizeof(best_q8));
272}
273
274static void _ccv_nnc_8i_rowwise_packed_quant_q4(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
275{
276 double best_sse = DBL_MAX1.7976931348623157e+308;
277 int best_q[16] = {0};
278 int best_q8[16] = {0};
279 int best_m = 1, best_b = 0;
280 int m, b, j;
281 for (m = 1; m <= 16; m++)
282 for (b = -8; b <= 7; b++)
283 {
284 if (-8 * m + b < -127 || 7 * m + b > 127)
285 continue;
286 double sse = 0;
287 int q[16];
288 int q8[16];
289 for (j = 0; j < 16; j++)
290 {
291 q[j] = ccv_clamp((int)lrint((y[j] - b) / m), -8, 7)({ typeof (-8) _a = (-8); typeof (7) _b = (7); typeof ((int)lrint
((y[j] - b) / m)) _x = ((int)lrint((y[j] - b) / m)); (_x <
_a) ? _a : ((_x > _b) ? _b : _x); })
;
292 q8[j] = q[j] * m + b;
293 const double d = q8[j] - y[j];
294 sse += w[j] * d * d;
295 }
296 if (sse < best_sse)
297 {
298 best_sse = sse;
299 best_m = m;
300 best_b = b;
301 memcpy(best_q, q, sizeof(best_q));
302 memcpy(best_q8, q8, sizeof(best_q8));
303 }
304 }
305 group->m = best_m;
306 group->b = best_b;
307 memcpy(group->q, best_q, sizeof(best_q));
308 memcpy(group->q8, best_q8, sizeof(best_q8));
309}
310
311static void _ccv_nnc_8i_rowwise_packed_quant_q3(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
312{
313 double best_sse = DBL_MAX1.7976931348623157e+308;
314 int best_q[16] = {0};
315 int best_q8[16] = {0};
316 int best_m = 1, best_b = 0;
317 int m, b, j;
318 for (m = 1; m <= 32; m++)
319 for (b = -8; b <= 6; b += 2)
320 {
321 if (-4 * m + b < -127 || 3 * m + b > 127)
322 continue;
323 double sse = 0;
324 int q[16];
325 int q8[16];
326 for (j = 0; j < 16; j++)
327 {
328 q[j] = ccv_clamp((int)lrint((y[j] - b) / m), -4, 3)({ typeof (-4) _a = (-4); typeof (3) _b = (3); typeof ((int)lrint
((y[j] - b) / m)) _x = ((int)lrint((y[j] - b) / m)); (_x <
_a) ? _a : ((_x > _b) ? _b : _x); })
;
329 q8[j] = q[j] * m + b;
330 const double d = q8[j] - y[j];
331 sse += w[j] * d * d;
332 }
333 if (sse < best_sse)
334 {
335 best_sse = sse;
336 best_m = m;
337 best_b = b;
338 memcpy(best_q, q, sizeof(best_q));
339 memcpy(best_q8, q8, sizeof(best_q8));
340 }
341 }
342 group->m = best_m;
343 group->b = best_b;
344 memcpy(group->q, best_q, sizeof(best_q));
345 memcpy(group->q8, best_q8, sizeof(best_q8));
346}
347
348static void _ccv_nnc_8i_rowwise_packed_quant_q2(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
349{
350 double best_sse = DBL_MAX1.7976931348623157e+308;
351 int best_q[16] = {0};
352 int best_q8[16] = {0};
353 int best_m = 1, best_z = 0;
354 int m, z, j;
355 for (m = 1; m <= 64; m++)
356 for (z = 0; z <= 120; z += 8)
357 {
358 if (3 * m - z > 127)
359 continue;
360 double sse = 0;
361 int q[16];
362 int q8[16];
363 for (j = 0; j < 16; j++)
364 {
365 q[j] = ccv_clamp((int)lrint((y[j] + z) / m), 0, 3)({ typeof (0) _a = (0); typeof (3) _b = (3); typeof ((int)lrint
((y[j] + z) / m)) _x = ((int)lrint((y[j] + z) / m)); (_x <
_a) ? _a : ((_x > _b) ? _b : _x); })
;
366 q8[j] = q[j] * m - z;
367 const double d = q8[j] - y[j];
368 sse += w[j] * d * d;
369 }
370 if (sse < best_sse)
371 {
372 best_sse = sse;
373 best_m = m;
374 best_z = z;
375 memcpy(best_q, q, sizeof(best_q));
376 memcpy(best_q8, q8, sizeof(best_q8));
377 }
378 }
379 group->m = best_m;
380 group->z = best_z;
381 memcpy(group->q, best_q, sizeof(best_q));
382 memcpy(group->q8, best_q8, sizeof(best_q8));
383}
384
/**
 * Decode one lane of an IQ2 grid entry into its odd magnitude.
 * Each 64-bit grid entry holds eight byte lanes; the byte values 8, 25 and 43
 * map to 1, 3 and 5.
 * @param grid Grid table of 64-bit entries.
 * @param index Entry to read.
 * @param lane Byte lane within the entry, 0 (least significant) to 7.
 * @return 1, 3 or 5. Any byte other than 8, 25 or 43 trips the assertion;
 *         when assertions are compiled out such a byte returns 5.
 */
static int _ccv_nnc_8i_rowwise_packed_iq2_value(const uint64_t* const grid, const int index, const int lane)
{
	const int v = (int)((grid[index] >> (lane * 8)) & 0xff);
	if (v == 8)
		return 1;
	if (v == 25)
		return 3;
	assert(v == 43);
	return 5;
}
395
396static int _ccv_nnc_8i_rowwise_packed_iq2xxs_value(const int index, const int lane)
397{
398 const int v = (int)((ccv_nnc_8i_rowwise_packed_iq2xxs_grid[index] >> (lane * 2)) & 3);
399 assert(v < 3)((void) sizeof ((v < 3) ? 1 : 0), __extension__ ({ if (v <
3) ; else __assert_fail ("v < 3", "ccv_nnc_8i_rowwise.c",
399, __extension__ __PRETTY_FUNCTION__); }))
;
400 return 1 + v * 2;
401}
402
// Number of entries in the IQ2_XXS and IQ2_S grids.
enum {
	CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE = 256,
	CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE = 1024,
};

// Integer scale selected by each of the 16 IQ2_XXS scale codes.
static const int ccv_nnc_8i_rowwise_packed_iq2xxs_scales[16] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32};
// Set to 1 once the table below has been filled; a plain int with no synchronization.
static int ccv_nnc_8i_rowwise_packed_iq2xxs_initialized = 0;
// Lazily filled table indexed as [scale][grid index][lane]; row 0 of the scale axis is never written.
static uint8_t ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[33][CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE][8];
411
412static void _ccv_nnc_8i_rowwise_packed_iq2xxs_init(void)
413{
414 if (ccv_nnc_8i_rowwise_packed_iq2xxs_initialized)
415 return;
416 int index, j, scale;
417 for (scale = 1; scale <= 32; scale++)
418 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE; index++)
419 for (j = 0; j < 8; j++)
420 ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[scale][index][j] = (uint8_t)ccv_min(_ccv_nnc_8i_rowwise_packed_iq2xxs_value(index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(index, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(index, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
421 ccv_nnc_8i_rowwise_packed_iq2xxs_initialized = 1;
422}
423
424static double _ccv_nnc_8i_rowwise_packed_iq2xxs_sse(const double* const y, const double* const w, const int lane, const int scale, const int index, const int sign_index)
425{
426 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[scale][index];
427 const uint8_t signs = ccv_nnc_8i_rowwise_packed_iq2xxs_ksigns[sign_index];
428 int j;
429 double sse = 0;
430 for (j = 0; j < 8; j++)
431 {
432 const int q8 = (signs & (1u << j)) ? -(int)mag[j] : (int)mag[j];
433 const double d = (double)q8 - y[lane + j];
434 sse += w[lane + j] * d * d;
435 }
436 return sse;
437}
438
439static double _ccv_nnc_8i_rowwise_packed_iq2xxs_best_sign_sse(const double* const y, const double* const w, const int lane, const int scale, const int index, int* const sign_index)
440{
441 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[scale][index];
442 uint8_t signs = 0;
443 int negative_count = 0;
444 int j;
445 for (j = 0; j < 8; j++)
446 if (y[lane + j] < 0)
447 {
448 signs |= (uint8_t)(1u << j);
449 negative_count++;
450 }
451 if (negative_count & 1)
452 {
453 int best_flip = 0;
454 double best_cost = DBL_MAX1.7976931348623157e+308;
455 for (j = 0; j < 8; j++)
456 {
457 const double cost = w[lane + j] * (double)mag[j] * fabs(y[lane + j]);
458 if (cost < best_cost)
459 {
460 best_cost = cost;
461 best_flip = j;
462 }
463 }
464 signs ^= (uint8_t)(1u << best_flip);
465 }
466 *sign_index = signs & 0x7f;
467 return _ccv_nnc_8i_rowwise_packed_iq2xxs_sse(y, w, lane, scale, index, *sign_index);
468}
469
470static int ccv_nnc_8i_rowwise_packed_iq2s_initialized = 0;
471static uint8_t ccv_nnc_8i_rowwise_packed_iq2s_level[CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE][8];
472static uint8_t ccv_nnc_8i_rowwise_packed_iq2s_scale_level[65][3];
473static uint16_t ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[65][3];
474static uint8_t ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[65][CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE][8];
475
476static void _ccv_nnc_8i_rowwise_packed_iq2s_init(void)
477{
478 if (ccv_nnc_8i_rowwise_packed_iq2s_initialized)
479 return;
480 int index, j, scale;
481 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE; index++)
482 for (j = 0; j < 8; j++)
483 {
484 const int v = _ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid, index, j);
485 ccv_nnc_8i_rowwise_packed_iq2s_level[index][j] = (uint8_t)((v - 1) / 2);
486 }
487 for (scale = 1; scale <= 64; scale++)
488 {
489 for (j = 0; j < 3; j++)
490 {
491 const int v = ccv_min((1 + j * 2) * scale, 127)({ typeof ((1 + j * 2) * scale) _a = ((1 + j * 2) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
492 ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][j] = (uint8_t)v;
493 ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][j] = (uint16_t)(v * v);
494 }
495 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE; index++)
496 for (j = 0; j < 8; j++)
497 ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[scale][index][j] = ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][ccv_nnc_8i_rowwise_packed_iq2s_level[index][j]];
498 }
499 ccv_nnc_8i_rowwise_packed_iq2s_initialized = 1;
500}
501
502static double _ccv_nnc_8i_rowwise_packed_iq2s_sse(const double* const ay, const double* const w, const int lane, const int scale, const int index)
503{
504 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[scale][index];
505 double d = (double)mag[0] - ay[lane];
506 double sse = w[lane] * d * d;
507 d = (double)mag[1] - ay[lane + 1];
508 sse += w[lane + 1] * d * d;
509 d = (double)mag[2] - ay[lane + 2];
510 sse += w[lane + 2] * d * d;
511 d = (double)mag[3] - ay[lane + 3];
512 sse += w[lane + 3] * d * d;
513 d = (double)mag[4] - ay[lane + 4];
514 sse += w[lane + 4] * d * d;
515 d = (double)mag[5] - ay[lane + 5];
516 sse += w[lane + 5] * d * d;
517 d = (double)mag[6] - ay[lane + 6];
518 sse += w[lane + 6] * d * d;
519 d = (double)mag[7] - ay[lane + 7];
520 sse += w[lane + 7] * d * d;
521 return sse;
522}
523
524static int _ccv_nnc_8i_rowwise_packed_iq3xxs_value(const int index, const int lane)
525{
526 const int v = (int)((ccv_nnc_8i_rowwise_packed_iq3xxs_grid[index] >> (lane * 8)) & 0xff);
527 switch (v)
528 {
529 case 4: return 1;
530 case 12: return 3;
531 case 20: return 5;
532 case 28: return 7;
533 case 36: return 9;
534 case 44: return 11;
535 case 52: return 13;
536 default:
537 assert(v == 62)((void) sizeof ((v == 62) ? 1 : 0), __extension__ ({ if (v ==
62) ; else __assert_fail ("v == 62", "ccv_nnc_8i_rowwise.c",
537, __extension__ __PRETTY_FUNCTION__); }))
;
538 return 15;
539 }
540}
541
542static int _ccv_nnc_8i_rowwise_packed_iq3s_value(const int index, const int lane)
543{
544 return (int)((ccv_nnc_8i_rowwise_packed_iq3s_grid[index] >> (lane * 8)) & 0xff);
545}
546
// Number of entries in the IQ3_S and IQ3_XXS grids.
#define CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE (512)
#define CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE (256)

// Set to 1 once the IQ3_S table has been filled; a plain int with no synchronization.
static int ccv_nnc_8i_rowwise_packed_iq3s_initialized = 0;
// Lazily filled table indexed as [scale][grid index][lane]; row 0 of the scale axis is never written.
static uint8_t ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[17][CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE][4];

// Set to 1 once the IQ3_XXS table has been filled; a plain int with no synchronization.
static int ccv_nnc_8i_rowwise_packed_iq3xxs_initialized = 0;
// Lazily filled table indexed as [scale][grid index][lane]; row 0 of the scale axis is never written.
static uint8_t ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[17][CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE][4];
555
556static void _ccv_nnc_8i_rowwise_packed_iq3s_init(void)
557{
558 if (ccv_nnc_8i_rowwise_packed_iq3s_initialized)
559 return;
560 int index, j, scale;
561 for (scale = 1; scale <= 16; scale++)
562 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE(512); index++)
563 for (j = 0; j < 4; j++)
564 ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[scale][index][j] = (uint8_t)ccv_min(_ccv_nnc_8i_rowwise_packed_iq3s_value(index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3s_value(index, j) * scale
) _a = (_ccv_nnc_8i_rowwise_packed_iq3s_value(index, j) * scale
); typeof (127) _b = (127); (_a < _b) ? _a : _b; })
;
565 ccv_nnc_8i_rowwise_packed_iq3s_initialized = 1;
566}
567
568static void _ccv_nnc_8i_rowwise_packed_iq3xxs_init(void)
569{
570 if (ccv_nnc_8i_rowwise_packed_iq3xxs_initialized)
571 return;
572 int index, j, scale;
573 for (scale = 1; scale <= 16; scale++)
574 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE(256); index++)
575 for (j = 0; j < 4; j++)
576 ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[scale][index][j] = (uint8_t)ccv_min(_ccv_nnc_8i_rowwise_packed_iq3xxs_value(index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(index, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(index, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
577 ccv_nnc_8i_rowwise_packed_iq3xxs_initialized = 1;
578}
579
580static double _ccv_nnc_8i_rowwise_packed_iq3s_sse(const double* const ay, const double* const w, const int lane, const int scale, const int index)
581{
582 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[scale][index];
583 double d = (double)mag[0] - ay[lane];
584 double sse = w[lane] * d * d;
585 d = (double)mag[1] - ay[lane + 1];
586 sse += w[lane + 1] * d * d;
587 d = (double)mag[2] - ay[lane + 2];
588 sse += w[lane + 2] * d * d;
589 d = (double)mag[3] - ay[lane + 3];
590 sse += w[lane + 3] * d * d;
591 return sse;
592}
593
594static double _ccv_nnc_8i_rowwise_packed_iq3xxs_sse(const double* const ay, const double* const w, const int lane, const int scale, const int index)
595{
596 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[scale][index];
597 double d = (double)mag[0] - ay[lane];
598 double sse = w[lane] * d * d;
599 d = (double)mag[1] - ay[lane + 1];
600 sse += w[lane + 1] * d * d;
601 d = (double)mag[2] - ay[lane + 2];
602 sse += w[lane + 2] * d * d;
603 d = (double)mag[3] - ay[lane + 3];
604 sse += w[lane + 3] * d * d;
605 return sse;
606}
607
608static void _ccv_nnc_8i_rowwise_packed_quant_iq2_xxs(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
609{
610 assert(ccv_nnc_8i_rowwise_packed_iq2xxs_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq2xxs_initialized
) ? 1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq2xxs_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq2xxs_initialized"
, "ccv_nnc_8i_rowwise.c", 610, __extension__ __PRETTY_FUNCTION__
); }))
;
611 double best_sse = DBL_MAX1.7976931348623157e+308;
612 int best_scale_code = 0;
613 int best_grid[4] = {0};
614 int best_sign[4] = {0};
615 int scale_code;
616 for (scale_code = 0; scale_code < 16; scale_code++)
617 {
618 const int scale = ccv_nnc_8i_rowwise_packed_iq2xxs_scales[scale_code];
619 double group_sse = 0;
620 int group_grid[4] = {0};
621 int group_sign[4] = {0};
622 int sg;
623 for (sg = 0; sg < 4; sg++)
624 {
625 double best_sub_sse = DBL_MAX1.7976931348623157e+308;
626 int best_sub_grid = 0;
627 int best_sub_sign = 0;
628 const int lane = sg * 8;
629 int index;
630 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE; index++)
631 {
632 int sign;
633 const double sse = _ccv_nnc_8i_rowwise_packed_iq2xxs_best_sign_sse(y, w, lane, scale, index, &sign);
634 if (sse < best_sub_sse)
635 {
636 best_sub_sse = sse;
637 best_sub_grid = index;
638 best_sub_sign = sign;
639 }
640 }
641 group_sse += best_sub_sse;
642 group_grid[sg] = best_sub_grid;
643 group_sign[sg] = best_sub_sign;
644 }
645 if (group_sse < best_sse)
646 {
647 best_sse = group_sse;
648 best_scale_code = scale_code;
649 memcpy(best_grid, group_grid, sizeof(best_grid));
650 memcpy(best_sign, group_sign, sizeof(best_sign));
651 }
652 }
653 group->scale = best_scale_code;
654 group->signs = 0;
655 memcpy(group->grid, best_grid, sizeof(best_grid));
656 int j;
657 for (j = 0; j < 4; j++)
658 group->signs |= (uint32_t)best_sign[j] << (j * 7);
659 for (j = 0; j < 32; j++)
660 {
661 const int sg = j >> 3;
662 const int lane = j & 7;
663 const uint8_t signs = ccv_nnc_8i_rowwise_packed_iq2xxs_ksigns[best_sign[sg]];
664 const int mag = ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[ccv_nnc_8i_rowwise_packed_iq2xxs_scales[best_scale_code]][best_grid[sg]][lane];
665 group->q8[j] = (signs & (1u << lane)) ? -mag : mag;
666 }
667}
668
669static void _ccv_nnc_8i_rowwise_packed_quant_iq2_s(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
670{
671 assert(ccv_nnc_8i_rowwise_packed_iq2s_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq2s_initialized) ?
1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq2s_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq2s_initialized"
, "ccv_nnc_8i_rowwise.c", 671, __extension__ __PRETTY_FUNCTION__
); }))
;
672 double best_sse = DBL_MAX1.7976931348623157e+308;
673 int best_scale = 1;
674 int best_grid[2] = {0};
675 double ay[16];
676 double wy[16];
677 uint32_t signs = 0;
678 int j;
679 for (j = 0; j < 16; j++)
680 {
681 ay[j] = fabs(y[j]);
682 wy[j] = w[j] * ay[j];
683 if (y[j] < 0)
684 signs |= (1u << j);
685 }
686 double sub_sse[2][65];
687 int sub_grid[2][65];
688 int sg, scale;
689 for (sg = 0; sg < 2; sg++)
690 for (scale = 1; scale <= 64; scale++)
691 {
692 sub_sse[sg][scale] = DBL_MAX1.7976931348623157e+308;
693 sub_grid[sg][scale] = 0;
694 }
695 for (sg = 0; sg < 2; sg++)
696 {
697 const int lane = sg * 8;
698 double sum_y2 = 0;
699 for (j = 0; j < 8; j++)
700 sum_y2 += w[lane + j] * ay[lane + j] * ay[lane + j];
701 int index;
702 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE; index++)
703 {
704 double sw[3] = {0};
705 double swy[3] = {0};
706 for (j = 0; j < 8; j++)
707 {
708 const int level = ccv_nnc_8i_rowwise_packed_iq2s_level[index][j];
709 sw[level] += w[lane + j];
710 swy[level] += wy[lane + j];
711 }
712 for (scale = 1; scale <= 64; scale++)
713 {
714 const double sse = sum_y2 +
715 sw[0] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][0] - 2 * swy[0] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][0] +
716 sw[1] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][1] - 2 * swy[1] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][1] +
717 sw[2] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][2] - 2 * swy[2] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][2];
718 if (sub_sse[sg][scale] == DBL_MAX1.7976931348623157e+308 || sse <= sub_sse[sg][scale] + ccv_max(1., fabs(sub_sse[sg][scale]))({ typeof (1.) _a = (1.); typeof (fabs(sub_sse[sg][scale])) _b
= (fabs(sub_sse[sg][scale])); (_a > _b) ? _a : _b; })
* 1e-9)
719 {
720 const double exact_sse = _ccv_nnc_8i_rowwise_packed_iq2s_sse(ay, w, lane, scale, index);
721 if (exact_sse < sub_sse[sg][scale])
722 {
723 sub_sse[sg][scale] = exact_sse;
724 sub_grid[sg][scale] = index;
725 }
726 }
727 }
728 }
729 }
730 for (scale = 1; scale <= 64; scale++)
731 {
732 const double group_sse = sub_sse[0][scale] + sub_sse[1][scale];
733 if (group_sse < best_sse)
734 {
735 best_sse = group_sse;
736 best_scale = scale;
737 best_grid[0] = sub_grid[0][scale];
738 best_grid[1] = sub_grid[1][scale];
739 }
740 }
741 group->scale = best_scale;
742 group->signs = signs;
743 memcpy(group->grid, best_grid, sizeof(best_grid));
744 for (j = 0; j < 16; j++)
745 {
746 const int sg = j >> 3;
747 const int lane = j & 7;
748 const int mag = ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[best_scale][best_grid[sg]][lane];
749 group->q8[j] = (signs & (1u << j)) ? -mag : mag;
750 }
751}
752
753static void _ccv_nnc_8i_rowwise_packed_quant_iq2_xs(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
754{
755 static const int scales[16] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32};
756 double best_sse = DBL_MAX1.7976931348623157e+308;
757 int best_scale_code = 0;
758 int best_grid = 0;
759 int best_q8[16] = {0};
760 uint32_t signs = 0;
761 int j;
762 for (j = 0; j < 8; j++)
763 if (y[j] < 0)
764 signs |= (1u << j);
765 int scale_code;
766 for (scale_code = 0; scale_code < 16; scale_code++)
767 {
768 const int scale = scales[scale_code];
769 int index;
770 for (index = 0; index < 512; index++)
771 {
772 double sse = 0;
773 int q8[16] = {0};
774 for (j = 0; j < 8; j++)
775 {
776 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid, index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid
, index, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2xs_grid, index, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
777 q8[j] = (signs & (1u << j)) ? -mag : mag;
778 const double d = q8[j] - y[j];
779 sse += w[j] * d * d;
780 }
781 if (sse < best_sse)
782 {
783 best_sse = sse;
784 best_scale_code = scale_code;
785 best_grid = index;
786 memcpy(best_q8, q8, sizeof(best_q8));
787 }
788 }
789 }
790 group->scale = best_scale_code;
791 group->grid[0] = best_grid;
792 group->signs = signs;
793 memcpy(group->q8, best_q8, sizeof(best_q8));
794}
795
796static void _ccv_nnc_8i_rowwise_packed_quant_iq3_s(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
797{
798 assert(ccv_nnc_8i_rowwise_packed_iq3s_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq3s_initialized) ?
1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq3s_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq3s_initialized"
, "ccv_nnc_8i_rowwise.c", 798, __extension__ __PRETTY_FUNCTION__
); }))
;
799 double best_sse = DBL_MAX1.7976931348623157e+308;
800 int best_scale = 1;
801 int best_grid[4] = {0};
802 double ay[16];
803 uint32_t signs = 0;
804 int j;
805 for (j = 0; j < 16; j++)
806 {
807 ay[j] = fabs(y[j]);
808 if (y[j] < 0)
809 signs |= (1u << j);
810 }
811 int scale;
812 for (scale = 1; scale <= 16; scale++)
813 {
814 double group_sse = 0;
815 int group_grid[4] = {0};
816 int sg;
817 for (sg = 0; sg < 4; sg++)
818 {
819 double best_sub_sse = DBL_MAX1.7976931348623157e+308;
820 int best_sub_grid = 0;
821 const int lane = sg * 4;
822 int index;
823 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE(512); index++)
824 {
825 const double sse = _ccv_nnc_8i_rowwise_packed_iq3s_sse(ay, w, lane, scale, index);
826 if (sse < best_sub_sse)
827 {
828 best_sub_sse = sse;
829 best_sub_grid = index;
830 }
831 }
832 group_sse += best_sub_sse;
833 group_grid[sg] = best_sub_grid;
834 }
835 if (group_sse < best_sse)
836 {
837 best_sse = group_sse;
838 best_scale = scale;
839 memcpy(best_grid, group_grid, sizeof(best_grid));
840 }
841 }
842 group->scale = best_scale;
843 group->signs = signs;
844 memcpy(group->grid, best_grid, sizeof(best_grid));
845 for (j = 0; j < 16; j++)
846 {
847 const int sg = j >> 2;
848 const int lane = j & 3;
849 const int mag = ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[best_scale][best_grid[sg]][lane];
850 group->q8[j] = (signs & (1u << j)) ? -mag : mag;
851 }
852}
853
854static void _ccv_nnc_8i_rowwise_packed_quant_iq3_xxs(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
855{
856 assert(ccv_nnc_8i_rowwise_packed_iq3xxs_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq3xxs_initialized
) ? 1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq3xxs_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq3xxs_initialized"
, "ccv_nnc_8i_rowwise.c", 856, __extension__ __PRETTY_FUNCTION__
); }))
;
857 double best_sse = DBL_MAX1.7976931348623157e+308;
858 int best_scale = 1;
859 int best_grid[2] = {0};
860 double ay[8];
861 uint32_t signs = 0;
862 int j;
863 for (j = 0; j < 8; j++)
864 {
865 ay[j] = fabs(y[j]);
866 if (y[j] < 0)
867 signs |= (1u << j);
868 }
869 int scale;
870 for (scale = 1; scale <= 16; scale++)
871 {
872 double group_sse = 0;
873 int group_grid[2] = {0};
874 int sg;
875 for (sg = 0; sg < 2; sg++)
876 {
877 double best_sub_sse = DBL_MAX1.7976931348623157e+308;
878 int best_sub_grid = 0;
879 const int lane = sg * 4;
880 int index;
881 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE(256); index++)
882 {
883 const double sse = _ccv_nnc_8i_rowwise_packed_iq3xxs_sse(ay, w, lane, scale, index);
884 if (sse < best_sub_sse)
885 {
886 best_sub_sse = sse;
887 best_sub_grid = index;
888 }
889 }
890 group_sse += best_sub_sse;
891 group_grid[sg] = best_sub_grid;
892 }
893 if (group_sse < best_sse)
894 {
895 best_sse = group_sse;
896 best_scale = scale;
897 memcpy(best_grid, group_grid, sizeof(best_grid));
898 }
899 }
900 group->scale = best_scale;
901 group->signs = signs;
902 memcpy(group->grid, best_grid, sizeof(best_grid));
903 memset(group->q8, 0, sizeof(group->q8));
904 for (j = 0; j < 8; j++)
905 {
906 const int sg = j >> 2;
907 const int lane = j & 3;
908 const int mag = ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[best_scale][best_grid[sg]][lane];
909 group->q8[j] = (signs & (1u << j)) ? -mag : mag;
910 }
911}
912
913static void _ccv_nnc_8i_rowwise_packed_quant_group(const int format, const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
914{
915 switch (format)
916 {
917 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
918 _ccv_nnc_8i_rowwise_packed_quant_q5(y, w, group);
919 break;
920 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
921 _ccv_nnc_8i_rowwise_packed_quant_q4(y, w, group);
922 break;
923 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
924 _ccv_nnc_8i_rowwise_packed_quant_q3(y, w, group);
925 break;
926 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
927 _ccv_nnc_8i_rowwise_packed_quant_q2(y, w, group);
928 break;
929 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
930 _ccv_nnc_8i_rowwise_packed_quant_iq2_xxs(y, w, group);
931 break;
932 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
933 _ccv_nnc_8i_rowwise_packed_quant_iq2_s(y, w, group);
934 break;
935 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
936 _ccv_nnc_8i_rowwise_packed_quant_iq2_xs(y, w, group);
937 break;
938 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
939 _ccv_nnc_8i_rowwise_packed_quant_iq3_s(y, w, group);
940 break;
941 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
942 _ccv_nnc_8i_rowwise_packed_quant_iq3_xxs(y, w, group);
943 break;
944 default:
945 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 945, __extension__ __PRETTY_FUNCTION__
); }))
;
946 }
947}
948
949static void _ccv_nnc_8i_rowwise_packed_pack_group(uint8_t* const output, const size_t group_index, const int format, const ccv_nnc_8i_rowwise_packed_group_t* const group)
950{
951 const size_t bit_offset = group_index * _ccv_nnc_8i_rowwise_x_group_bits(format);
952 size_t bit = bit_offset;
953 int j;
954 switch (format)
955 {
956 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
957 for (j = 0; j < 16; j++, bit += 5)
958 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->q[j] + 16), 5);
959 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 3);
960 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 3, (uint32_t)(group->b + 16), 5);
961 break;
962 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
963 for (j = 0; j < 16; j++, bit += 4)
964 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->q[j] + 8), 4);
965 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 4);
966 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 4, (uint32_t)(group->b + 8), 4);
967 break;
968 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
969 for (j = 0; j < 16; j++, bit += 3)
970 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->q[j] + 4), 3);
971 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 5);
972 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 5, (uint32_t)(group->b / 2 + 4), 3);
973 break;
974 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
975 for (j = 0; j < 16; j++, bit += 2)
976 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->q[j], 2);
977 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 6);
978 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 6, (uint32_t)(group->z >> 3), 4);
979 break;
980 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
981 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->grid[0], 10);
982 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 10, (uint32_t)group->grid[1], 10);
983 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 20, group->signs, 16);
984 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 36, (uint32_t)(group->scale - 1), 6);
985 break;
986 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
987 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->grid[0], 9);
988 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 9, group->signs, 8);
989 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 17, (uint32_t)group->scale, 4);
990 break;
991 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
992 for (j = 0; j < 4; j++)
993 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + j * 8, (uint32_t)group->grid[j], 8);
994 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 32, group->signs, 28);
995 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 60, (uint32_t)group->scale, 4);
996 break;
997 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
998 for (j = 0; j < 4; j++)
999 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + j * 9, (uint32_t)group->grid[j], 9);
1000 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 36, group->signs, 16);
1001 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 52, (uint32_t)(group->scale - 1), 4);
1002 break;
1003 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
1004 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->grid[0], 8);
1005 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 8, (uint32_t)group->grid[1], 8);
1006 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 16, group->signs, 8);
1007 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 24, (uint32_t)(group->scale - 1), 4);
1008 break;
1009 default:
1010 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 1010, __extension__ __PRETTY_FUNCTION__
); }))
;
1011 }
1012}
1013
1014static void _ccv_nnc_8i_rowwise_packed_decode_group(const uint8_t* const input, const size_t group_index, const int format, int* const q8)
1015{
1016 static const int q2_xs_scales[16] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32};
1017 const size_t bit_offset = group_index * _ccv_nnc_8i_rowwise_x_group_bits(format);
1018 size_t bit = bit_offset;
1019 int j;
1020 switch (format)
1021 {
1022 case CCV_NNC_QX_8I_ROWWISE_Q5_K: {
1023 int q[16];
1024 for (j = 0; j < 16; j++, bit += 5)
1025 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 5) - 16;
1026 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 3) + 1;
1027 const int b = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 3, 5) - 16;
1028 for (j = 0; j < 16; j++)
1029 q8[j] = q[j] * m + b;
1030 break;
1031 }
1032 case CCV_NNC_QX_8I_ROWWISE_Q4_K: {
1033 int q[16];
1034 for (j = 0; j < 16; j++, bit += 4)
1035 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 4) - 8;
1036 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 4) + 1;
1037 const int b = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 4, 4) - 8;
1038 for (j = 0; j < 16; j++)
1039 q8[j] = q[j] * m + b;
1040 break;
1041 }
1042 case CCV_NNC_QX_8I_ROWWISE_Q3_K: {
1043 int q[16];
1044 for (j = 0; j < 16; j++, bit += 3)
1045 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 3) - 4;
1046 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 5) + 1;
1047 const int b = ((int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 5, 3) - 4) << 1;
1048 for (j = 0; j < 16; j++)
1049 q8[j] = q[j] * m + b;
1050 break;
1051 }
1052 case CCV_NNC_QX_8I_ROWWISE_Q2_K: {
1053 int q[16];
1054 for (j = 0; j < 16; j++, bit += 2)
1055 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 2);
1056 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 6) + 1;
1057 const int z = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 6, 4) << 3;
1058 for (j = 0; j < 16; j++)
1059 q8[j] = q[j] * m - z;
1060 break;
1061 }
1062 case CCV_NNC_QX_8I_ROWWISE_IQ2_S: {
1063 const int grid0 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 10);
1064 const int grid1 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 10, 10);
1065 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 20, 16);
1066 const int scale = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 36, 6) + 1;
1067 for (j = 0; j < 8; j++)
1068 {
1069 const int mag0 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid0, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid
, grid0, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid0, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
1070 const int mag1 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid1, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid
, grid1, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid1, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
1071 q8[j] = (signs & (1u << j)) ? -mag0 : mag0;
1072 q8[8 + j] = (signs & (1u << (8 + j))) ? -mag1 : mag1;
1073 }
1074 break;
1075 }
1076 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS: {
1077 const int grid0 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 9);
1078 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 9, 8);
1079 const int scale = q2_xs_scales[_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 17, 4)];
1080 for (j = 0; j < 8; j++)
1081 {
1082 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid, grid0, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid
, grid0, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2xs_grid, grid0, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
1083 q8[j] = (signs & (1u << j)) ? -mag : mag;
1084 }
1085 break;
1086 }
1087 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS: {
1088 int grid[4];
1089 for (j = 0; j < 4; j++)
1090 grid[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + j * 8, 8);
1091 const uint32_t sign_codes = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 32, 28);
1092 const int scale = ccv_nnc_8i_rowwise_packed_iq2xxs_scales[_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 60, 4)];
1093 int sg;
1094 for (sg = 0; sg < 4; sg++)
1095 {
1096 const uint8_t signs = ccv_nnc_8i_rowwise_packed_iq2xxs_ksigns[(sign_codes >> (sg * 7)) & 0x7f];
1097 for (j = 0; j < 8; j++)
1098 {
1099 const int lane = sg * 8 + j;
1100 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2xxs_value(grid[sg], j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(grid[sg], j
) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(grid
[sg], j) * scale); typeof (127) _b = (127); (_a < _b) ? _a
: _b; })
;
1101 q8[lane] = (signs & (1u << j)) ? -mag : mag;
1102 }
1103 }
1104 break;
1105 }
1106 case CCV_NNC_QX_8I_ROWWISE_IQ3_S: {
1107 int grid[4];
1108 for (j = 0; j < 4; j++)
1109 grid[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + j * 9, 9);
1110 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 36, 16);
1111 const int scale = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 52, 4) + 1;
1112 int sg;
1113 for (sg = 0; sg < 4; sg++)
1114 for (j = 0; j < 4; j++)
1115 {
1116 const int lane = sg * 4 + j;
1117 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq3s_value(grid[sg], j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3s_value(grid[sg], j)
* scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3s_value(grid[sg
], j) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b
; })
;
1118 q8[lane] = (signs & (1u << lane)) ? -mag : mag;
1119 }
1120 break;
1121 }
1122 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS: {
1123 const int grid0 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 8);
1124 const int grid1 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 8, 8);
1125 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 16, 8);
1126 const int scale = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 24, 4) + 1;
1127 for (j = 0; j < 4; j++)
1128 {
1129 const int mag0 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid0, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid0, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid0, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
1130 const int mag1 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid1, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid1, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid1, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
1131 q8[j] = (signs & (1u << j)) ? -mag0 : mag0;
1132 q8[4 + j] = (signs & (1u << (4 + j))) ? -mag1 : mag1;
1133 }
1134 break;
1135 }
1136 default:
1137 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 1137, __extension__ __PRETTY_FUNCTION__
); }))
;
1138 }
1139}
1140
1141CCV_WARN_UNUSED(size_t)size_t __attribute__((warn_unused_result)) ccv_nnc_quantize_8i_rowwise_x(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, const int format, const float* const imatrix, const size_t imatrix_length, void* output, const size_t output_length)
1142{
1143 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1143, __extension__ __PRETTY_FUNCTION__
); }))
;
1
Assuming 'datatype' is not equal to CCV_16F
2
Assuming 'datatype' is not equal to CCV_16BF
3
Assuming 'datatype' is not equal to CCV_32F
4
Assuming 'datatype' is equal to CCV_64F
5
Taking true branch
1144 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1144, __extension__ __PRETTY_FUNCTION__
); }))
;
6
Assuming 'memory_type' is equal to CCV_TENSOR_CPU_MEMORY
7
Taking true branch
1145 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1145, __extension__ __PRETTY_FUNCTION__
); }))
;
8
Assuming 'row_length' is > 0
9
Taking true branch
1146 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1146
, __extension__ __PRETTY_FUNCTION__); }))
;
10
Assuming the condition is true
11
Taking true branch
1147 const size_t row_count = input_length / row_length;
1148 if (!_ccv_nnc_8i_rowwise_imatrix_is_valid(imatrix, imatrix_length, row_length, row_count))
12
Taking false branch
1149 return 0;
1150 const size_t group_size = _ccv_nnc_8i_rowwise_x_group_size(format);
1151 const int group_bits = _ccv_nnc_8i_rowwise_x_group_bits(format);
1152 const size_t groups_per_row = (row_length + group_size - 1) / group_size;
1153 const size_t padded_row_length = groups_per_row * group_size;
1154 const size_t scale_offset = _ccv_nnc_8i_rowwise_packed_scale_offset(format, input_length, row_length);
1155 const size_t output_size = scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]
;
1156 assert(output_length >= output_size)((void) sizeof ((output_length >= output_size) ? 1 : 0), __extension__
({ if (output_length >= output_size) ; else __assert_fail
("output_length >= output_size", "ccv_nnc_8i_rowwise.c", 1156
, __extension__ __PRETTY_FUNCTION__); }))
;
13
Assuming 'output_length' is >= 'output_size'
14
Taking true branch
1157 switch (format)
15
'Default' branch taken. Execution continues on line 1172
1158 {
1159 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
1160 _ccv_nnc_8i_rowwise_packed_iq2xxs_init();
1161 break;
1162 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
1163 _ccv_nnc_8i_rowwise_packed_iq2s_init();
1164 break;
1165 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
1166 _ccv_nnc_8i_rowwise_packed_iq3s_init();
1167 break;
1168 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
1169 _ccv_nnc_8i_rowwise_packed_iq3xxs_init();
1170 break;
1171 }
1172 uint8_t* const u8 = (uint8_t*)output;
1173 uint8_t* const scales = u8 + scale_offset;
1174 memset(u8, 0, scale_offset);
1175 const size_t row_bits = groups_per_row * group_bits;
1176 size_t rows_per_chunk;
1177 switch (row_bits & 7)
16
Control jumps to the 'default' case at line 1189
1178 {
1179 case 0:
1180 rows_per_chunk = 1;
1181 break;
1182 case 4:
1183 rows_per_chunk = 2;
1184 break;
1185 case 2:
1186 case 6:
1187 rows_per_chunk = 4;
1188 break;
1189 default:
1190 rows_per_chunk = 8;
1191 break;
17
Execution continues on line 1193
1192 }
1193 const size_t row_chunks = (row_count + rows_per_chunk - 1) / rows_per_chunk;
1194 parallel_for(chunk_idx, (int)row_chunks){ int chunk_idx; for ((chunk_idx) = 0; (chunk_idx) < ((int
)row_chunks); (chunk_idx)++) {
{
18
Assuming 'chunk_idx' is < 'row_chunks'
19
Loop condition is true. Entering loop body
1195 const size_t chunk_begin = (size_t)chunk_idx * rows_per_chunk;
1196 const size_t chunk_end = ccv_min(chunk_begin + rows_per_chunk, row_count)({ typeof (chunk_begin + rows_per_chunk) _a = (chunk_begin + rows_per_chunk
); typeof (row_count) _b = (row_count); (_a < _b) ? _a : _b
; })
;
20
Assuming '_a' is < '_b'
21
'?' condition is true
1197 size_t i;
1198 for (i = chunk_begin; i < chunk_end; i++)
22
Loop condition is true. Entering loop body
1199 {
1200 double* const row = (double*)ccmallocmalloc(sizeof(double) * padded_row_length);
23
Storing uninitialized value
1201 double* const weights = (double*)ccmallocmalloc(sizeof(double) * padded_row_length);
1202 int* const q8 = (int*)ccmallocmalloc(sizeof(int) * padded_row_length);
1203 const size_t row_start = i * row_length;
1204 const float* const row_imatrix = _ccv_nnc_8i_rowwise_imatrix_for_row(imatrix, imatrix_length, row_length, row_count, i);
1205 _ccv_nnc_8i_rowwise_packed_read_row(input, datatype, row_start, row_length, padded_row_length, row);
24
Calling '_ccv_nnc_8i_rowwise_packed_read_row'
33
Returning from '_ccv_nnc_8i_rowwise_packed_read_row'
1206 double max_abs = 0;
1207 size_t j;
1208 for (j = 0; j < row_length; j++)
34
Loop condition is true. Entering loop body
37
Loop condition is false. Execution continues on line 1213
1209 {
1210 max_abs = ccv_max(max_abs, fabs(row[j]))({ typeof (max_abs) _a = (max_abs); typeof (fabs(row[j])) _b =
(fabs(row[j])); (_a > _b) ? _a : _b; })
;
35
Assuming '_a' is <= '_b'
36
'?' condition is false
1211 weights[j] = _ccv_nnc_8i_rowwise_weight(row_imatrix, j);
1212 }
1213 for (; j
37.1
'j' is >= 'padded_row_length'
< padded_row_length; j++)
38
Loop condition is false. Execution continues on line 1215
1214 weights[j] = 0;
1215 double scale = max_abs / 127.;
1216 double best_scale = 0;
1217 double best_sse = DBL_MAX1.7976931348623157e+308;
1218 int k;
1219 for (k = 0; k < 8; k++)
39
Loop condition is true. Entering loop body
1220 {
1221 const double stored_scale = _ccv_nnc_8i_rowwise_packed_stored_scale(scale, datatype);
1222 if (!(stored_scale > 0))
40
Assuming 'stored_scale' is > 0
41
Taking false branch
1223 break;
1224 size_t g;
1225 for (g = 0; g < groups_per_row; g++)
42
Assuming 'g' is < 'groups_per_row'
43
Loop condition is true. Entering loop body
1226 {
1227 double y[32] = {0};
1228 double w[32] = {0};
1229 for (j = 0; j < group_size; j++)
44
Assuming 'j' is < 'group_size'
45
Loop condition is true. Entering loop body
46
Assuming 'j' is < 'group_size'
47
Loop condition is true. Entering loop body
1230 {
1231 y[j] = row[g * group_size + j] / stored_scale;
48
The left operand of '/' is a garbage value
1232 w[j] = weights[g * group_size + j];
1233 }
1234 ccv_nnc_8i_rowwise_packed_group_t group;
1235 _ccv_nnc_8i_rowwise_packed_quant_group(format, y, w, &group);
1236 memcpy(q8 + g * group_size, group.q8, sizeof(int) * group_size);
1237 }
1238 double sse = 0;
1239 double sum_qx = 0;
1240 double sum_qq = 0;
1241 for (j = 0; j < row_length; j++)
1242 {
1243 const double d = row[j] - stored_scale * q8[j];
1244 sse += weights[j] * d * d;
1245 sum_qx += weights[j] * q8[j] * row[j];
1246 sum_qq += weights[j] * q8[j] * q8[j];
1247 }
1248 if (sse < best_sse)
1249 {
1250 best_sse = sse;
1251 best_scale = stored_scale;
1252 }
1253 if (!(sum_qq > 0) || !(sum_qx > 0))
1254 break;
1255 const double next_scale = sum_qx / sum_qq;
1256 if (_ccv_nnc_8i_rowwise_packed_stored_scale(next_scale, datatype) == stored_scale)
1257 break;
1258 scale = next_scale;
1259 }
1260 _ccv_nnc_8i_rowwise_packed_store_scale(scales, datatype, i, best_scale);
1261 const double final_scale = best_scale > 0 ? best_scale : 1;
1262 size_t g;
1263 for (g = 0; g < groups_per_row; g++)
1264 {
1265 double y[32] = {0};
1266 double w[32] = {0};
1267 for (j = 0; j < group_size; j++)
1268 {
1269 y[j] = row[g * group_size + j] / final_scale;
1270 w[j] = weights[g * group_size + j];
1271 }
1272 ccv_nnc_8i_rowwise_packed_group_t group;
1273 _ccv_nnc_8i_rowwise_packed_quant_group(format, y, w, &group);
1274 _ccv_nnc_8i_rowwise_packed_pack_group(u8, i * groups_per_row + g, format, &group);
1275 }
1276 ccfreefree(q8);
1277 ccfreefree(weights);
1278 ccfreefree(row);
1279 }
1280 } parallel_endfor} }
1281 return output_size;
1282}
1283
1284void ccv_nnc_dequantize_8i_rowwise_x(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, const int format, void* output, const size_t output_length)
1285{
1286 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1286, __extension__ __PRETTY_FUNCTION__
); }))
;
1287 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1287, __extension__ __PRETTY_FUNCTION__
); }))
;
1288 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1288, __extension__ __PRETTY_FUNCTION__
); }))
;
1289 assert(output_length % row_length == 0)((void) sizeof ((output_length % row_length == 0) ? 1 : 0), __extension__
({ if (output_length % row_length == 0) ; else __assert_fail
("output_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1289
, __extension__ __PRETTY_FUNCTION__); }))
;
1290 const size_t row_count = output_length / row_length;
1291 const size_t group_size = _ccv_nnc_8i_rowwise_x_group_size(format);
1292 const size_t groups_per_row = (row_length + group_size - 1) / group_size;
1293 const size_t scale_offset = _ccv_nnc_8i_rowwise_packed_scale_offset(format, output_length, row_length);
1294 assert(input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype))((void) sizeof ((input_length >= scale_offset + row_count *
_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]) ? 1 : 0), __extension__ ({ if (input_length >= scale_offset
+ row_count * _ccv_get_data_type_size[((datatype) & 0xFF000
) >> 12]) ; else __assert_fail ("input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)"
, "ccv_nnc_8i_rowwise.c", 1294, __extension__ __PRETTY_FUNCTION__
); }))
;
1295 const uint8_t* const u8 = (const uint8_t*)input;
1296 const uint8_t* const scales = u8 + scale_offset;
1297 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1298 const double scale = _ccv_nnc_8i_rowwise_packed_load_scale(scales, datatype, i);
1299 size_t g;
1300 for (g = 0; g < groups_per_row; g++)
1301 {
1302 int q8[32] = {0};
1303 _ccv_nnc_8i_rowwise_packed_decode_group(u8, (size_t)i * groups_per_row + g, format, q8);
1304 size_t j;
1305 for (j = 0; j < group_size; j++)
1306 {
1307 const size_t col = g * group_size + j;
1308 if (col < row_length)
1309 _ccv_nnc_8i_rowwise_packed_write_value(output, datatype, (size_t)i * row_length + col, scale * q8[j]);
1310 }
1311 }
1312 } parallel_endfor} }
1313}
1314
/**
 * Scale a value, round it with lrint (current rounding mode) and clamp the
 * result to the symmetric range [-127, 127].
 *
 * @param v         Value to quantize.
 * @param inv_scale Reciprocal of the quantization scale.
 * @return Quantized integer in [-127, 127].
 */
static inline int _ccv_nnc_8i_rowwise_quantize(const double v, const double inv_scale)
{
	const double scaled = v * inv_scale;
	const int rounded = (int)lrint(scaled);
	return ccv_clamp(rounded, -127, 127);
}
1320
/**
 * Quantize one half-precision row to int8 with a single half-precision scale.
 *
 * Starting from max|x| / 127, the scale is refined for up to 8 rounds: round
 * it through half precision, quantize, measure the weighted squared error, and
 * refit by least squares. The stored scale with the lowest error is used for
 * the final quantization.
 *
 * @param row        row_length half-precision values.
 * @param row_length Number of elements in the row.
 * @param imatrix    Importance weights handed to _ccv_nnc_8i_rowwise_weight.
 * @param q          Receives row_length quantized values in [-127, 127].
 * @return The chosen scale, or 0 (with q zeroed) when no positive scale exists.
 */
static float _ccv_nnc_quantize_8i_rowwise_16f(const uint16_t* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
{
	size_t idx;
	double peak = 0;
	for (idx = 0; idx < row_length; idx++)
	{
		float value;
		ccv_half_precision_to_float(row + idx, &value, 1);
		peak = ccv_max(peak, fabs(value));
	}
	if (peak == 0)
	{
		memset(q, 0, row_length);
		return 0;
	}
	double scale = peak / 127.;
	float chosen_scale = 0;
	double lowest_sse = DBL_MAX;
	int round;
	for (round = 0; round < 8; round++)
	{
		// Round with the scale that will actually be stored, then refit scale by least squares.
		const float scale_f = (float)scale;
		uint16_t scale_h;
		float stored_scale;
		ccv_float_to_half_precision(&scale_f, &scale_h, 1);
		ccv_half_precision_to_float(&scale_h, &stored_scale, 1);
		if (!(stored_scale > 0))
			break;
		const double inv_scale = 1. / stored_scale;
		double sse = 0;
		double sum_qx = 0;
		double sum_qq = 0;
		for (idx = 0; idx < row_length; idx++)
		{
			const double w = _ccv_nnc_8i_rowwise_weight(imatrix, idx);
			float value_f;
			ccv_half_precision_to_float(row + idx, &value_f, 1);
			const double value = value_f;
			const int code = _ccv_nnc_8i_rowwise_quantize(value, inv_scale);
			const double diff = value - stored_scale * code;
			sse += w * diff * diff;
			sum_qx += w * code * value;
			sum_qq += w * code * code;
		}
		if (sse < lowest_sse)
		{
			lowest_sse = sse;
			chosen_scale = stored_scale;
		}
		if (!(sum_qq > 0) || !(sum_qx > 0))
			break;
		const double refit = sum_qx / sum_qq;
		// Converged once the refit rounds to the same half-precision scale.
		const float refit_f = (float)refit;
		uint16_t refit_h;
		float refit_stored;
		ccv_float_to_half_precision(&refit_f, &refit_h, 1);
		ccv_half_precision_to_float(&refit_h, &refit_stored, 1);
		if (refit_stored == stored_scale)
			break;
		scale = refit;
	}
	if (!(chosen_scale > 0))
	{
		memset(q, 0, row_length);
		return 0;
	}
	const double inv_chosen = 1. / chosen_scale;
	for (idx = 0; idx < row_length; idx++)
	{
		float value;
		ccv_half_precision_to_float(row + idx, &value, 1);
		q[idx] = (int8_t)_ccv_nnc_8i_rowwise_quantize(value, inv_chosen);
	}
	return chosen_scale;
}
1397
// Quantize one row of bfloat16 values (stored as uint16_t) to int8 using a single per-row scale.
// Same procedure as the half-precision variant, except that values and candidate scales are
// converted with ccv_bfloat_to_float / ccv_float_to_bfloat. Writes row_length bytes to q and returns
// the chosen scale (already round-tripped through bfloat16), or 0 with q zero-filled when the
// largest magnitude found is 0 or no positive stored scale was recorded.
// imatrix is only forwarded to _ccv_nnc_8i_rowwise_weight to obtain the per-element weight w.
1398static float _ccv_nnc_quantize_8i_rowwise_16bf(const uint16_t* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
1399{
1400 size_t j;
1401 double max_abs = 0;
// Pass 1: find the largest magnitude in the row.
1402 for (j = 0; j < row_length; j++)
1403 {
1404 float v;
1405 ccv_bfloat_to_float(row + j, &v, 1);
1406 max_abs = ccv_max(max_abs, fabs(v))({ typeof (max_abs) _a = (max_abs); typeof (fabs(v)) _b = (fabs
(v)); (_a > _b) ? _a : _b; })
;
1407 }
1408 if (max_abs == 0)
1409 {
1410 memset(q, 0, row_length);
1411 return 0;
1412 }
// Initial guess: map the largest magnitude onto 127.
1413 double scale = max_abs / 127.;
1414 float best_scale = 0;
1415 double best_sse = DBL_MAX1.7976931348623157e+308;
1416 int k;
// At most 8 refinement rounds; best_scale tracks the stored scale with the lowest weighted squared error so far.
1417 for (k = 0; k < 8; k++)
1418 {
// Evaluate the scale as it will be stored, i.e. after rounding to bfloat16 and back.
1419 const float scale_f = (float)scale;
1420 uint16_t scale_bf;
1421 float stored_scale;
1422 ccv_float_to_bfloat(&scale_f, &scale_bf, 1);
1423 ccv_bfloat_to_float(&scale_bf, &stored_scale, 1);
// The negated comparison is also true for NaN, so a NaN or non-positive stored scale ends the refinement.
1424 if (!(stored_scale > 0))
1425 break;
1426 const double inv_scale = 1. / stored_scale;
1427 double sum_qx = 0;
1428 double sum_qq = 0;
1429 double sse = 0;
// Accumulate the weighted squared error and the two sums needed for the least-squares refit.
1430 for (j = 0; j < row_length; j++)
1431 {
1432 const double w = _ccv_nnc_8i_rowwise_weight(imatrix, j);
1433 float v_f;
1434 ccv_bfloat_to_float(row + j, &v_f, 1);
1435 const double v = v_f;
1436 const int qj = _ccv_nnc_8i_rowwise_quantize(v, inv_scale);
1437 const double d = v - stored_scale * qj;
1438 sse += w * d * d;
1439 sum_qx += w * qj * v;
1440 sum_qq += w * qj * qj;
1441 }
1442 if (sse < best_sse)
1443 {
1444 best_sse = sse;
1445 best_scale = stored_scale;
1446 }
// Without positive sums the refit below would not give a usable positive scale, so stop here.
1447 if (!(sum_qq > 0) || !(sum_qx > 0))
1448 break;
// Refit for the current integer assignment: scale = sum(w*q*v) / sum(w*q*q).
1449 const double next_scale = sum_qx / sum_qq;
1450 const float next_scale_f = (float)next_scale;
1451 uint16_t next_scale_bf;
1452 float next_stored_scale;
1453 ccv_float_to_bfloat(&next_scale_f, &next_scale_bf, 1);
1454 ccv_bfloat_to_float(&next_scale_bf, &next_stored_scale, 1);
// Converged: the refit scale rounds to the same bfloat16 value that was just evaluated.
1455 if (next_stored_scale == stored_scale)
1456 break;
1457 scale = next_scale;
1458 }
1459 if (!(best_scale > 0))
1460 {
1461 memset(q, 0, row_length);
1462 return 0;
1463 }
// Final pass: emit the int8 values using the best stored scale.
1464 const double inv_scale = 1. / best_scale;
1465 for (j = 0; j < row_length; j++)
1466 {
1467 float v;
1468 ccv_bfloat_to_float(row + j, &v, 1);
1469 q[j] = (int8_t)_ccv_nnc_8i_rowwise_quantize(v, inv_scale);
1470 }
1471 return best_scale;
1472}
1473
// Quantize one row of float values to int8 using a single per-row scale.
// Writes row_length bytes to q and returns the chosen scale. Candidate scales are evaluated after
// narrowing to float, since that is the precision of the returned value. Returns 0 with q zero-filled
// when the largest magnitude found is 0 or no positive scale was recorded.
// imatrix is only forwarded to _ccv_nnc_8i_rowwise_weight to obtain the per-element weight w.
1474static float _ccv_nnc_quantize_8i_rowwise_32f(const float* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
1475{
1476 size_t j;
1477 double max_abs = 0;
// Pass 1: find the largest magnitude in the row.
1478 for (j = 0; j < row_length; j++)
1479 max_abs = ccv_max(max_abs, fabs(row[j]))({ typeof (max_abs) _a = (max_abs); typeof (fabs(row[j])) _b =
(fabs(row[j])); (_a > _b) ? _a : _b; })
;
1480 if (max_abs == 0)
1481 {
1482 memset(q, 0, row_length);
1483 return 0;
1484 }
// Initial guess: map the largest magnitude onto 127.
1485 double scale = max_abs / 127.;
1486 float best_scale = 0;
1487 double best_sse = DBL_MAX1.7976931348623157e+308;
1488 int k;
// At most 8 refinement rounds; best_scale tracks the scale with the lowest weighted squared error so far.
1489 for (k = 0; k < 8; k++)
1490 {
1491 const float stored_scale = (float)scale;
// The negated comparison is also true for NaN, so a NaN or non-positive scale ends the refinement.
1492 if (!(stored_scale > 0))
1493 break;
1494 const double inv_scale = 1. / stored_scale;
1495 double sum_qx = 0;
1496 double sum_qq = 0;
1497 double sse = 0;
// Accumulate the weighted squared error and the two sums needed for the least-squares refit.
1498 for (j = 0; j < row_length; j++)
1499 {
1500 const double w = _ccv_nnc_8i_rowwise_weight(imatrix, j);
1501 const double v = row[j];
1502 const int qj = _ccv_nnc_8i_rowwise_quantize(v, inv_scale);
1503 const double d = v - stored_scale * qj;
1504 sse += w * d * d;
1505 sum_qx += w * qj * v;
1506 sum_qq += w * qj * qj;
1507 }
1508 if (sse < best_sse)
1509 {
1510 best_sse = sse;
1511 best_scale = stored_scale;
1512 }
// Without positive sums the refit below would not give a usable positive scale, so stop here.
1513 if (!(sum_qq > 0) || !(sum_qx > 0))
1514 break;
// Refit for the current integer assignment: scale = sum(w*q*v) / sum(w*q*q).
1515 const double next_scale = sum_qx / sum_qq;
// Converged: the refit scale narrows to the same float that was just evaluated.
1516 if ((float)next_scale == stored_scale)
1517 break;
1518 scale = next_scale;
1519 }
1520 if (!(best_scale > 0))
1521 {
1522 memset(q, 0, row_length);
1523 return 0;
1524 }
// Final pass: emit the int8 values using the best scale.
1525 const double inv_scale = 1. / best_scale;
1526 for (j = 0; j < row_length; j++)
1527 q[j] = (int8_t)_ccv_nnc_8i_rowwise_quantize(row[j], inv_scale);
1528 return best_scale;
1529}
1530
// Quantize one row of double values to int8 using a single per-row scale.
// Writes row_length bytes to q and returns the chosen scale as a double; unlike the narrower
// variants, the candidate scale is used as-is with no rounding to a storage type. Returns 0 with q
// zero-filled when the largest magnitude found is 0 or no positive scale was recorded.
// imatrix is only forwarded to _ccv_nnc_8i_rowwise_weight to obtain the per-element weight w.
1531static double _ccv_nnc_quantize_8i_rowwise_64f(const double* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
1532{
1533 size_t j;
1534 double max_abs = 0;
// Pass 1: find the largest magnitude in the row.
1535 for (j = 0; j < row_length; j++)
1536 max_abs = ccv_max(max_abs, fabs(row[j]))({ typeof (max_abs) _a = (max_abs); typeof (fabs(row[j])) _b =
(fabs(row[j])); (_a > _b) ? _a : _b; })
;
1537 if (max_abs == 0)
1538 {
1539 memset(q, 0, row_length);
1540 return 0;
1541 }
// Initial guess: map the largest magnitude onto 127.
1542 double scale = max_abs / 127.;
1543 double best_scale = 0;
1544 double best_sse = DBL_MAX1.7976931348623157e+308;
1545 int k;
// At most 8 refinement rounds; best_scale tracks the scale with the lowest weighted squared error so far.
1546 for (k = 0; k < 8; k++)
1547 {
1548 const double stored_scale = scale;
// The negated comparison is also true for NaN, so a NaN or non-positive scale ends the refinement.
1549 if (!(stored_scale > 0))
1550 break;
1551 const double inv_scale = 1. / stored_scale;
1552 double sum_qx = 0;
1553 double sum_qq = 0;
1554 double sse = 0;
// Accumulate the weighted squared error and the two sums needed for the least-squares refit.
1555 for (j = 0; j < row_length; j++)
1556 {
1557 const double w = _ccv_nnc_8i_rowwise_weight(imatrix, j);
1558 const double v = row[j];
1559 const int qj = _ccv_nnc_8i_rowwise_quantize(v, inv_scale);
1560 const double d = v - stored_scale * qj;
1561 sse += w * d * d;
1562 sum_qx += w * qj * v;
1563 sum_qq += w * qj * qj;
1564 }
1565 if (sse < best_sse)
1566 {
1567 best_sse = sse;
1568 best_scale = stored_scale;
1569 }
// Without positive sums the refit below would not give a usable positive scale, so stop here.
1570 if (!(sum_qq > 0) || !(sum_qx > 0))
1571 break;
// Refit for the current integer assignment: scale = sum(w*q*v) / sum(w*q*q).
1572 const double next_scale = sum_qx / sum_qq;
// Converged: the refit reproduced exactly the scale that was just evaluated.
1573 if (next_scale == stored_scale)
1574 break;
1575 scale = next_scale;
1576 }
1577 if (!(best_scale > 0))
1578 {
1579 memset(q, 0, row_length);
1580 return 0;
1581 }
// Final pass: emit the int8 values using the best scale.
1582 const double inv_scale = 1. / best_scale;
1583 for (j = 0; j < row_length; j++)
1584 q[j] = (int8_t)_ccv_nnc_8i_rowwise_quantize(row[j], inv_scale);
1585 return best_scale;
1586}
1587
// Quantize a CPU buffer of input_length elements, laid out as consecutive rows of row_length
// elements, to int8 with one scale per row.
// Output layout: input_length int8 values starting at output, followed by row_count scales stored in
// the input datatype at byte offset scale_offset (input_length rounded up to a multiple of 128).
// datatype must be CCV_16F, CCV_16BF, CCV_32F or CCV_64F and memory_type must be
// CCV_TENSOR_CPU_MEMORY; output_length must cover scale_offset plus the scales. All of these are
// checked with assert only.
// Returns the number of output bytes used (scale_offset + scale_size), or 0 when
// _ccv_nnc_8i_rowwise_imatrix_is_valid rejects imatrix / imatrix_length; nothing is written then.
1588CCV_WARN_UNUSED(size_t)size_t __attribute__((warn_unused_result)) ccv_nnc_quantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, const float* const imatrix, const size_t imatrix_length, void* output, const size_t output_length)
1589{
1590 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1590, __extension__ __PRETTY_FUNCTION__
); }))
;
1591 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1591, __extension__ __PRETTY_FUNCTION__
); }))
;
1592 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1592, __extension__ __PRETTY_FUNCTION__
); }))
;
1593 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1593
, __extension__ __PRETTY_FUNCTION__); }))
;
1594 const size_t row_count = input_length / row_length;
1595 if (!_ccv_nnc_8i_rowwise_imatrix_is_valid(imatrix, imatrix_length, row_length, row_count))
1596 return 0;
// Round the int8 payload size up to the next multiple of 128 bytes; the scales start there.
1597 const size_t scale_offset = (input_length + 127) & -128;
1598 const size_t scale_size = row_count * CCV_GET_DATA_TYPE_SIZE(datatype)_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]
;
1599 assert(output_length >= scale_offset + scale_size)((void) sizeof ((output_length >= scale_offset + scale_size
) ? 1 : 0), __extension__ ({ if (output_length >= scale_offset
+ scale_size) ; else __assert_fail ("output_length >= scale_offset + scale_size"
, "ccv_nnc_8i_rowwise.c", 1599, __extension__ __PRETTY_FUNCTION__
); }))
;
// Two views of the same output buffer: q for the int8 payload, u8 for byte-offset arithmetic to the scales.
1600 int8_t* const q = (int8_t*)output;
1601 uint8_t* const u8 = (uint8_t*)output;
// One branch per datatype; each quantizes row i into q + row_start and stores its scale at scales[i].
// NOTE(review): the row index is an int ((int)row_count), so row_count is assumed to fit in int.
1602 if (datatype == CCV_16F)
1603 {
1604 const uint16_t* const f16 = (const uint16_t*)input;
1605 uint16_t* const scales = (uint16_t*)(u8 + scale_offset);
1606 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1607 const size_t row_start = (size_t)i * row_length;
1608 const float* const row_imatrix = _ccv_nnc_8i_rowwise_imatrix_for_row(imatrix, imatrix_length, row_length, row_count, (size_t)i);
1609 const float scale_f = _ccv_nnc_quantize_8i_rowwise_16f(f16 + row_start, row_length, row_imatrix, q + row_start);
1610 ccv_float_to_half_precision(&scale_f, scales + i, 1);
1611 } parallel_endfor} }
1612 } else if (datatype == CCV_16BF) {
1613 const uint16_t* const bf16 = (const uint16_t*)input;
1614 uint16_t* const scales = (uint16_t*)(u8 + scale_offset);
1615 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1616 const size_t row_start = (size_t)i * row_length;
1617 const float* const row_imatrix = _ccv_nnc_8i_rowwise_imatrix_for_row(imatrix, imatrix_length, row_length, row_count, (size_t)i);
1618 const float scale_f = _ccv_nnc_quantize_8i_rowwise_16bf(bf16 + row_start, row_length, row_imatrix, q + row_start);
1619 ccv_float_to_bfloat(&scale_f, scales + i, 1);
1620 } parallel_endfor} }
1621 } else if (datatype == CCV_32F) {
1622 const float* const f32 = (const float*)input;
1623 float* const scales = (float*)(u8 + scale_offset);
1624 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1625 const size_t row_start = (size_t)i * row_length;
1626 const float* const row_imatrix = _ccv_nnc_8i_rowwise_imatrix_for_row(imatrix, imatrix_length, row_length, row_count, (size_t)i);
1627 scales[i] = _ccv_nnc_quantize_8i_rowwise_32f(f32 + row_start, row_length, row_imatrix, q + row_start);
1628 } parallel_endfor} }
1629 } else {
1630 assert(datatype == CCV_64F)((void) sizeof ((datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1630, __extension__ __PRETTY_FUNCTION__
); }))
;
1631 const double* const f64 = (const double*)input;
1632 double* const scales = (double*)(u8 + scale_offset);
1633 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1634 const size_t row_start = (size_t)i * row_length;
1635 const float* const row_imatrix = _ccv_nnc_8i_rowwise_imatrix_for_row(imatrix, imatrix_length, row_length, row_count, (size_t)i);
1636 scales[i] = _ccv_nnc_quantize_8i_rowwise_64f(f64 + row_start, row_length, row_imatrix, q + row_start);
1637 } parallel_endfor} }
1638 }
1639 return scale_offset + scale_size;
1640}
1641
// Expand a row-wise int8-quantized buffer back to datatype (CCV_16F, CCV_16BF, CCV_32F or CCV_64F).
// input holds output_length int8 values followed by one scale per row, stored in datatype at byte
// offset scale_offset (output_length rounded up to a multiple of 128); each output element is the
// int8 value multiplied by its row's scale. Here output_length is used as an element count: it is
// divided by row_length to get the row count and indexes the typed output arrays.
// For non-CPU memory the work is handed to the CUDA or MPS implementation selected at compile time,
// and the function returns without touching the CPU path; without either backend that branch only
// asserts.
// Size and type preconditions are checked with assert only.
1642void ccv_nnc_dequantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, void* output, const size_t output_length)
1643{
1644 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1644, __extension__ __PRETTY_FUNCTION__
); }))
;
1645 assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY || memory_type
== CCV_TENSOR_GPU_MEMORY) ? 1 : 0), __extension__ ({ if (memory_type
== CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY
) ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1645, __extension__ __PRETTY_FUNCTION__
); }))
;
1646 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1646, __extension__ __PRETTY_FUNCTION__
); }))
;
1647 assert(output_length % row_length == 0)((void) sizeof ((output_length % row_length == 0) ? 1 : 0), __extension__
({ if (output_length % row_length == 0) ; else __assert_fail
("output_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1647
, __extension__ __PRETTY_FUNCTION__); }))
;
// Non-CPU memory: delegate to whichever accelerator backend was compiled in, then return.
1648 if (memory_type != CCV_TENSOR_CPU_MEMORY)
1649 {
1650#ifdef HAVE_CUDA1
1651 ccv_nnc_compat_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0);
1652#elif defined(HAVE_MPS)
1653 assert(datatype != CCV_64F)((void) sizeof ((datatype != CCV_64F) ? 1 : 0), __extension__
({ if (datatype != CCV_64F) ; else __assert_fail ("datatype != CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1653, __extension__ __PRETTY_FUNCTION__
); }))
;
1654 ccv_nnc_mps_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0);
1655#else
1656 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1656, __extension__ __PRETTY_FUNCTION__
); }))
;
1657#endif
1658 return;
1659 }
1660 const size_t row_count = output_length / row_length;
// The scales start at the int8 payload size rounded up to the next multiple of 128 bytes.
1661 const size_t scale_offset = (output_length + 127) & -128;
1662 assert(input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype))((void) sizeof ((input_length >= scale_offset + row_count *
_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]) ? 1 : 0), __extension__ ({ if (input_length >= scale_offset
+ row_count * _ccv_get_data_type_size[((datatype) & 0xFF000
) >> 12]) ; else __assert_fail ("input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)"
, "ccv_nnc_8i_rowwise.c", 1662, __extension__ __PRETTY_FUNCTION__
); }))
;
// Two views of the same input buffer: q for the int8 payload, u8 for byte-offset arithmetic to the scales.
1663 const int8_t* const q = (const int8_t*)input;
1664 const uint8_t* const u8 = (const uint8_t*)input;
// One branch per datatype; the 16-bit branches multiply in float and then convert each product to the 16-bit format.
// NOTE(review): the row index is an int ((int)row_count), so row_count is assumed to fit in int.
1665 if (datatype == CCV_16F)
1666 {
1667 uint16_t* const f16 = (uint16_t*)output;
1668 const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset);
1669 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1670 const size_t row_start = (size_t)i * row_length;
1671 float scale_f;
1672 ccv_half_precision_to_float(scales + i, &scale_f, 1);
1673 size_t j;
1674 for (j = 0; j < row_length; j++)
1675 {
1676 const float v = q[row_start + j] * scale_f;
1677 ccv_float_to_half_precision(&v, f16 + row_start + j, 1);
1678 }
1679 } parallel_endfor} }
1680 } else if (datatype == CCV_16BF) {
1681 uint16_t* const bf16 = (uint16_t*)output;
1682 const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset);
1683 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1684 const size_t row_start = (size_t)i * row_length;
1685 float scale_f;
1686 ccv_bfloat_to_float(scales + i, &scale_f, 1);
1687 size_t j;
1688 for (j = 0; j < row_length; j++)
1689 {
1690 const float v = q[row_start + j] * scale_f;
1691 ccv_float_to_bfloat(&v, bf16 + row_start + j, 1);
1692 }
1693 } parallel_endfor} }
1694 } else if (datatype == CCV_32F) {
1695 float* const f32 = (float*)output;
1696 const float* const scales = (const float*)(u8 + scale_offset);
1697 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1698 const size_t row_start = (size_t)i * row_length;
1699 const float scale = scales[i];
1700 size_t j;
1701 for (j = 0; j < row_length; j++)
1702 f32[row_start + j] = q[row_start + j] * scale;
1703 } parallel_endfor} }
1704 } else {
1705 assert(datatype == CCV_64F)((void) sizeof ((datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1705, __extension__ __PRETTY_FUNCTION__
); }))
;
1706 double* const f64 = (double*)output;
1707 const double* const scales = (const double*)(u8 + scale_offset);
1708 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1709 const size_t row_start = (size_t)i * row_length;
1710 const double scale = scales[i];
1711 size_t j;
1712 for (j = 0; j < row_length; j++)
1713 f64[row_start + j] = q[row_start + j] * scale;
1714 } parallel_endfor} }
1715 }
1716}