Bug Summary

File: nnc/ccv_nnc_8i_rowwise.c
Warning: line 1208, column 12
Assigned value is garbage or undefined

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_8i_rowwise.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-20-090030-2802958-1 -x 
c ccv_nnc_8i_rowwise.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include <float.h>
4#include "ccv_nnc_8i_rowwise_packed_grids.inc"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10
11static int _ccv_nnc_8i_rowwise_x_group_size(const int format)
12{
13 switch (format)
14 {
15 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
16 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
17 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
18 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
19 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
20 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
21 return 16;
22 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
23 return 32;
24 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
25 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
26 return 8;
27 default:
28 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 28, __extension__ __PRETTY_FUNCTION__
); }))
;
29 return 0;
30 }
31}
32
33static int _ccv_nnc_8i_rowwise_x_group_bits(const int format)
34{
35 switch (format)
36 {
37 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
38 return 88;
39 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
40 return 72;
41 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
42 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
43 return 56;
44 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
45 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
46 return 42;
47 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
48 return 21;
49 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
50 return 28;
51 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
52 return 64;
53 default:
54 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 54, __extension__ __PRETTY_FUNCTION__
); }))
;
55 return 0;
56 }
57}
58
59static size_t _ccv_nnc_8i_rowwise_packed_scale_offset(const int format, const size_t input_length, const size_t row_length)
60{
61 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 61, __extension__ __PRETTY_FUNCTION__
); }))
;
62 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 62,
__extension__ __PRETTY_FUNCTION__); }))
;
63 const size_t row_count = input_length / row_length;
64 const size_t group_size = _ccv_nnc_8i_rowwise_x_group_size(format);
65 const size_t groups_per_row = (row_length + group_size - 1) / group_size;
66 const size_t group_bits = _ccv_nnc_8i_rowwise_x_group_bits(format);
67 const size_t payload_size = (row_count * groups_per_row * group_bits + 7) / 8;
68 return (payload_size + 127) & -128;
69}
70
71CCV_WARN_UNUSED(size_t)size_t __attribute__((warn_unused_result)) ccv_nnc_8i_rowwise_x_data_size(const int format, const int datatype, const size_t input_length, const size_t row_length)
72{
73 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 73, __extension__ __PRETTY_FUNCTION__
); }))
;
74 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 74, __extension__ __PRETTY_FUNCTION__
); }))
;
75 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 75,
__extension__ __PRETTY_FUNCTION__); }))
;
76 const size_t row_count = input_length / row_length;
77 const size_t scale_offset = _ccv_nnc_8i_rowwise_packed_scale_offset(format, input_length, row_length);
78 return scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]
;
79}
80
81static void _ccv_nnc_8i_rowwise_packed_write_bits(uint8_t* const data, const size_t bit_offset, const uint32_t value, const int bits)
82{
83 int i;
84 for (i = 0; i < bits; i++)
85 if (value & (1u << i))
86 data[(bit_offset + i) >> 3] |= (uint8_t)(1u << ((bit_offset + i) & 7));
87}
88
89static uint32_t _ccv_nnc_8i_rowwise_packed_read_bits(const uint8_t* const data, const size_t bit_offset, const int bits)
90{
91 uint32_t value = 0;
92 int i;
93 for (i = 0; i < bits; i++)
94 if (data[(bit_offset + i) >> 3] & (uint8_t)(1u << ((bit_offset + i) & 7)))
95 value |= (1u << i);
96 return value;
97}
98
99static double _ccv_nnc_8i_rowwise_packed_stored_scale(const double scale, const int datatype)
100{
101 if (datatype == CCV_16F)
102 {
103 const float scale_f = (float)scale;
104 uint16_t scale_h;
105 float stored_scale;
106 ccv_float_to_half_precision(&scale_f, &scale_h, 1);
107 ccv_half_precision_to_float(&scale_h, &stored_scale, 1);
108 return stored_scale;
109 } else if (datatype == CCV_16BF) {
110 const float scale_f = (float)scale;
111 uint16_t scale_bf;
112 float stored_scale;
113 ccv_float_to_bfloat(&scale_f, &scale_bf, 1);
114 ccv_bfloat_to_float(&scale_bf, &stored_scale, 1);
115 return stored_scale;
116 } else if (datatype == CCV_32F)
117 return (float)scale;
118 return scale;
119}
120
121static void _ccv_nnc_8i_rowwise_packed_store_scale(uint8_t* const scales, const int datatype, const size_t i, const double scale)
122{
123 if (datatype == CCV_16F)
124 {
125 const float scale_f = (float)scale;
126 ccv_float_to_half_precision(&scale_f, (uint16_t*)scales + i, 1);
127 } else if (datatype == CCV_16BF) {
128 const float scale_f = (float)scale;
129 ccv_float_to_bfloat(&scale_f, (uint16_t*)scales + i, 1);
130 } else if (datatype == CCV_32F)
131 ((float*)scales)[i] = (float)scale;
132 else
133 ((double*)scales)[i] = scale;
134}
135
136static double _ccv_nnc_8i_rowwise_packed_load_scale(const uint8_t* const scales, const int datatype, const size_t i)
137{
138 if (datatype == CCV_16F)
139 {
140 float scale_f;
141 ccv_half_precision_to_float((const uint16_t*)scales + i, &scale_f, 1);
142 return scale_f;
143 } else if (datatype == CCV_16BF) {
144 float scale_f;
145 ccv_bfloat_to_float((const uint16_t*)scales + i, &scale_f, 1);
146 return scale_f;
147 } else if (datatype == CCV_32F)
148 return ((const float*)scales)[i];
149 return ((const double*)scales)[i];
150}
151
152static void _ccv_nnc_8i_rowwise_packed_read_row(const void* const input, const int datatype, const size_t row_start, const size_t row_length, const size_t padded_row_length, double* const row)
153{
154 size_t j;
155 if (datatype == CCV_16F)
156 {
157 const uint16_t* const f16 = (const uint16_t*)input + row_start;
158 for (j = 0; j < row_length; j++)
159 {
160 float v;
161 ccv_half_precision_to_float(f16 + j, &v, 1);
162 row[j] = v;
163 }
164 } else if (datatype == CCV_16BF) {
165 const uint16_t* const bf16 = (const uint16_t*)input + row_start;
166 for (j = 0; j < row_length; j++)
167 {
168 float v;
169 ccv_bfloat_to_float(bf16 + j, &v, 1);
170 row[j] = v;
171 }
172 } else if (datatype == CCV_32F) {
173 const float* const f32 = (const float*)input + row_start;
174 for (j = 0; j < row_length; j++)
175 row[j] = f32[j];
176 } else {
177 const double* const f64 = (const double*)input + row_start;
178 for (j = 0; j < row_length; j++)
179 row[j] = f64[j];
180 }
181 for (; j < padded_row_length; j++)
182 row[j] = 0;
183}
184
185static void _ccv_nnc_8i_rowwise_packed_write_value(void* const output, const int datatype, const size_t j, const double v)
186{
187 if (datatype == CCV_16F)
188 {
189 const float v_f = (float)v;
190 ccv_float_to_half_precision(&v_f, (uint16_t*)output + j, 1);
191 } else if (datatype == CCV_16BF) {
192 const float v_f = (float)v;
193 ccv_float_to_bfloat(&v_f, (uint16_t*)output + j, 1);
194 } else if (datatype == CCV_32F)
195 ((float*)output)[j] = (float)v;
196 else
197 ((double*)output)[j] = v;
198}
199
200static inline double _ccv_nnc_8i_rowwise_weight(const float* const imatrix, const size_t j)
201{
202 return imatrix ? ccv_max((double)imatrix[j], 0.)({ typeof ((double)imatrix[j]) _a = ((double)imatrix[j]); typeof
(0.) _b = (0.); (_a > _b) ? _a : _b; })
: 1.;
203}
204
205typedef struct {
206 int q[32];
207 int q8[32];
208 int m;
209 int b;
210 int z;
211 int scale;
212 int grid[4];
213 uint32_t signs;
214} ccv_nnc_8i_rowwise_packed_group_t;
215
216static void _ccv_nnc_8i_rowwise_packed_quant_q5(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
217{
218 double best_sse = DBL_MAX1.7976931348623157e+308;
219 int best_q[16] = {0};
220 int best_q8[16] = {0};
221 int best_m = 1, best_b = 0;
222 int m, b, j;
223 for (m = 1; m <= 8; m++)
224 for (b = -16; b <= 15; b++)
225 {
226 if (-16 * m + b < -127 || 15 * m + b > 127)
227 continue;
228 double sse = 0;
229 int q[16];
230 int q8[16];
231 for (j = 0; j < 16; j++)
232 {
233 q[j] = ccv_clamp((int)lrint((y[j] - b) / m), -16, 15)({ typeof (-16) _a = (-16); typeof (15) _b = (15); typeof ((int
)lrint((y[j] - b) / m)) _x = ((int)lrint((y[j] - b) / m)); (_x
< _a) ? _a : ((_x > _b) ? _b : _x); })
;
234 q8[j] = q[j] * m + b;
235 const double d = q8[j] - y[j];
236 sse += w[j] * d * d;
237 }
238 if (sse < best_sse)
239 {
240 best_sse = sse;
241 best_m = m;
242 best_b = b;
243 memcpy(best_q, q, sizeof(best_q));
244 memcpy(best_q8, q8, sizeof(best_q8));
245 }
246 }
247 group->m = best_m;
248 group->b = best_b;
249 memcpy(group->q, best_q, sizeof(best_q));
250 memcpy(group->q8, best_q8, sizeof(best_q8));
251}
252
253static void _ccv_nnc_8i_rowwise_packed_quant_q4(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
254{
255 double best_sse = DBL_MAX1.7976931348623157e+308;
256 int best_q[16] = {0};
257 int best_q8[16] = {0};
258 int best_m = 1, best_b = 0;
259 int m, b, j;
260 for (m = 1; m <= 16; m++)
261 for (b = -8; b <= 7; b++)
262 {
263 if (-8 * m + b < -127 || 7 * m + b > 127)
264 continue;
265 double sse = 0;
266 int q[16];
267 int q8[16];
268 for (j = 0; j < 16; j++)
269 {
270 q[j] = ccv_clamp((int)lrint((y[j] - b) / m), -8, 7)({ typeof (-8) _a = (-8); typeof (7) _b = (7); typeof ((int)lrint
((y[j] - b) / m)) _x = ((int)lrint((y[j] - b) / m)); (_x <
_a) ? _a : ((_x > _b) ? _b : _x); })
;
271 q8[j] = q[j] * m + b;
272 const double d = q8[j] - y[j];
273 sse += w[j] * d * d;
274 }
275 if (sse < best_sse)
276 {
277 best_sse = sse;
278 best_m = m;
279 best_b = b;
280 memcpy(best_q, q, sizeof(best_q));
281 memcpy(best_q8, q8, sizeof(best_q8));
282 }
283 }
284 group->m = best_m;
285 group->b = best_b;
286 memcpy(group->q, best_q, sizeof(best_q));
287 memcpy(group->q8, best_q8, sizeof(best_q8));
288}
289
290static void _ccv_nnc_8i_rowwise_packed_quant_q3(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
291{
292 double best_sse = DBL_MAX1.7976931348623157e+308;
293 int best_q[16] = {0};
294 int best_q8[16] = {0};
295 int best_m = 1, best_b = 0;
296 int m, b, j;
297 for (m = 1; m <= 32; m++)
298 for (b = -8; b <= 6; b += 2)
299 {
300 if (-4 * m + b < -127 || 3 * m + b > 127)
301 continue;
302 double sse = 0;
303 int q[16];
304 int q8[16];
305 for (j = 0; j < 16; j++)
306 {
307 q[j] = ccv_clamp((int)lrint((y[j] - b) / m), -4, 3)({ typeof (-4) _a = (-4); typeof (3) _b = (3); typeof ((int)lrint
((y[j] - b) / m)) _x = ((int)lrint((y[j] - b) / m)); (_x <
_a) ? _a : ((_x > _b) ? _b : _x); })
;
308 q8[j] = q[j] * m + b;
309 const double d = q8[j] - y[j];
310 sse += w[j] * d * d;
311 }
312 if (sse < best_sse)
313 {
314 best_sse = sse;
315 best_m = m;
316 best_b = b;
317 memcpy(best_q, q, sizeof(best_q));
318 memcpy(best_q8, q8, sizeof(best_q8));
319 }
320 }
321 group->m = best_m;
322 group->b = best_b;
323 memcpy(group->q, best_q, sizeof(best_q));
324 memcpy(group->q8, best_q8, sizeof(best_q8));
325}
326
327static void _ccv_nnc_8i_rowwise_packed_quant_q2(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
328{
329 double best_sse = DBL_MAX1.7976931348623157e+308;
330 int best_q[16] = {0};
331 int best_q8[16] = {0};
332 int best_m = 1, best_z = 0;
333 int m, z, j;
334 for (m = 1; m <= 64; m++)
335 for (z = 0; z <= 120; z += 8)
336 {
337 if (3 * m - z > 127)
338 continue;
339 double sse = 0;
340 int q[16];
341 int q8[16];
342 for (j = 0; j < 16; j++)
343 {
344 q[j] = ccv_clamp((int)lrint((y[j] + z) / m), 0, 3)({ typeof (0) _a = (0); typeof (3) _b = (3); typeof ((int)lrint
((y[j] + z) / m)) _x = ((int)lrint((y[j] + z) / m)); (_x <
_a) ? _a : ((_x > _b) ? _b : _x); })
;
345 q8[j] = q[j] * m - z;
346 const double d = q8[j] - y[j];
347 sse += w[j] * d * d;
348 }
349 if (sse < best_sse)
350 {
351 best_sse = sse;
352 best_m = m;
353 best_z = z;
354 memcpy(best_q, q, sizeof(best_q));
355 memcpy(best_q8, q8, sizeof(best_q8));
356 }
357 }
358 group->m = best_m;
359 group->z = best_z;
360 memcpy(group->q, best_q, sizeof(best_q));
361 memcpy(group->q8, best_q8, sizeof(best_q8));
362}
363
364static int _ccv_nnc_8i_rowwise_packed_iq2_value(const uint64_t* const grid, const int index, const int lane)
365{
366 const int v = (int)((grid[index] >> (lane * 8)) & 0xff);
367 if (v == 8)
368 return 1;
369 if (v == 25)
370 return 3;
371 assert(v == 43)((void) sizeof ((v == 43) ? 1 : 0), __extension__ ({ if (v ==
43) ; else __assert_fail ("v == 43", "ccv_nnc_8i_rowwise.c",
371, __extension__ __PRETTY_FUNCTION__); }))
;
372 return 5;
373}
374
375static int _ccv_nnc_8i_rowwise_packed_iq2xxs_value(const int index, const int lane)
376{
377 const int v = (int)((ccv_nnc_8i_rowwise_packed_iq2xxs_grid[index] >> (lane * 2)) & 3);
378 assert(v < 3)((void) sizeof ((v < 3) ? 1 : 0), __extension__ ({ if (v <
3) ; else __assert_fail ("v < 3", "ccv_nnc_8i_rowwise.c",
378, __extension__ __PRETTY_FUNCTION__); }))
;
379 return 1 + v * 2;
380}
381
382enum {
383 CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE = 256,
384 CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE = 1024,
385};
386
387static const int ccv_nnc_8i_rowwise_packed_iq2xxs_scales[16] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32};
388static int ccv_nnc_8i_rowwise_packed_iq2xxs_initialized = 0;
389static uint8_t ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[33][CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE][8];
390
391static void _ccv_nnc_8i_rowwise_packed_iq2xxs_init(void)
392{
393 if (ccv_nnc_8i_rowwise_packed_iq2xxs_initialized)
394 return;
395 int index, j, scale;
396 for (scale = 1; scale <= 32; scale++)
397 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE; index++)
398 for (j = 0; j < 8; j++)
399 ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[scale][index][j] = (uint8_t)ccv_min(_ccv_nnc_8i_rowwise_packed_iq2xxs_value(index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(index, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(index, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
400 ccv_nnc_8i_rowwise_packed_iq2xxs_initialized = 1;
401}
402
403static double _ccv_nnc_8i_rowwise_packed_iq2xxs_sse(const double* const y, const double* const w, const int lane, const int scale, const int index, const int sign_index)
404{
405 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[scale][index];
406 const uint8_t signs = ccv_nnc_8i_rowwise_packed_iq2xxs_ksigns[sign_index];
407 int j;
408 double sse = 0;
409 for (j = 0; j < 8; j++)
410 {
411 const int q8 = (signs & (1u << j)) ? -(int)mag[j] : (int)mag[j];
412 const double d = (double)q8 - y[lane + j];
413 sse += w[lane + j] * d * d;
414 }
415 return sse;
416}
417
418static double _ccv_nnc_8i_rowwise_packed_iq2xxs_best_sign_sse(const double* const y, const double* const w, const int lane, const int scale, const int index, int* const sign_index)
419{
420 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[scale][index];
421 uint8_t signs = 0;
422 int negative_count = 0;
423 int j;
424 for (j = 0; j < 8; j++)
425 if (y[lane + j] < 0)
426 {
427 signs |= (uint8_t)(1u << j);
428 negative_count++;
429 }
430 if (negative_count & 1)
431 {
432 int best_flip = 0;
433 double best_cost = DBL_MAX1.7976931348623157e+308;
434 for (j = 0; j < 8; j++)
435 {
436 const double cost = w[lane + j] * (double)mag[j] * fabs(y[lane + j]);
437 if (cost < best_cost)
438 {
439 best_cost = cost;
440 best_flip = j;
441 }
442 }
443 signs ^= (uint8_t)(1u << best_flip);
444 }
445 *sign_index = signs & 0x7f;
446 return _ccv_nnc_8i_rowwise_packed_iq2xxs_sse(y, w, lane, scale, index, *sign_index);
447}
448
449static int ccv_nnc_8i_rowwise_packed_iq2s_initialized = 0;
450static uint8_t ccv_nnc_8i_rowwise_packed_iq2s_level[CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE][8];
451static uint8_t ccv_nnc_8i_rowwise_packed_iq2s_scale_level[65][3];
452static uint16_t ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[65][3];
453static uint8_t ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[65][CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE][8];
454
455static void _ccv_nnc_8i_rowwise_packed_iq2s_init(void)
456{
457 if (ccv_nnc_8i_rowwise_packed_iq2s_initialized)
458 return;
459 int index, j, scale;
460 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE; index++)
461 for (j = 0; j < 8; j++)
462 {
463 const int v = _ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid, index, j);
464 ccv_nnc_8i_rowwise_packed_iq2s_level[index][j] = (uint8_t)((v - 1) / 2);
465 }
466 for (scale = 1; scale <= 64; scale++)
467 {
468 for (j = 0; j < 3; j++)
469 {
470 const int v = ccv_min((1 + j * 2) * scale, 127)({ typeof ((1 + j * 2) * scale) _a = ((1 + j * 2) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
471 ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][j] = (uint8_t)v;
472 ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][j] = (uint16_t)(v * v);
473 }
474 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE; index++)
475 for (j = 0; j < 8; j++)
476 ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[scale][index][j] = ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][ccv_nnc_8i_rowwise_packed_iq2s_level[index][j]];
477 }
478 ccv_nnc_8i_rowwise_packed_iq2s_initialized = 1;
479}
480
481static double _ccv_nnc_8i_rowwise_packed_iq2s_sse(const double* const ay, const double* const w, const int lane, const int scale, const int index)
482{
483 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[scale][index];
484 double d = (double)mag[0] - ay[lane];
485 double sse = w[lane] * d * d;
486 d = (double)mag[1] - ay[lane + 1];
487 sse += w[lane + 1] * d * d;
488 d = (double)mag[2] - ay[lane + 2];
489 sse += w[lane + 2] * d * d;
490 d = (double)mag[3] - ay[lane + 3];
491 sse += w[lane + 3] * d * d;
492 d = (double)mag[4] - ay[lane + 4];
493 sse += w[lane + 4] * d * d;
494 d = (double)mag[5] - ay[lane + 5];
495 sse += w[lane + 5] * d * d;
496 d = (double)mag[6] - ay[lane + 6];
497 sse += w[lane + 6] * d * d;
498 d = (double)mag[7] - ay[lane + 7];
499 sse += w[lane + 7] * d * d;
500 return sse;
501}
502
503static int _ccv_nnc_8i_rowwise_packed_iq3xxs_value(const int index, const int lane)
504{
505 const int v = (int)((ccv_nnc_8i_rowwise_packed_iq3xxs_grid[index] >> (lane * 8)) & 0xff);
506 switch (v)
507 {
508 case 4: return 1;
509 case 12: return 3;
510 case 20: return 5;
511 case 28: return 7;
512 case 36: return 9;
513 case 44: return 11;
514 case 52: return 13;
515 default:
516 assert(v == 62)((void) sizeof ((v == 62) ? 1 : 0), __extension__ ({ if (v ==
62) ; else __assert_fail ("v == 62", "ccv_nnc_8i_rowwise.c",
516, __extension__ __PRETTY_FUNCTION__); }))
;
517 return 15;
518 }
519}
520
521static int _ccv_nnc_8i_rowwise_packed_iq3s_value(const int index, const int lane)
522{
523 return (int)((ccv_nnc_8i_rowwise_packed_iq3s_grid[index] >> (lane * 8)) & 0xff);
524}
525
526#define CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE(512) (512)
527#define CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE(256) (256)
528
529static int ccv_nnc_8i_rowwise_packed_iq3s_initialized = 0;
530static uint8_t ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[17][CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE(512)][4];
531
532static int ccv_nnc_8i_rowwise_packed_iq3xxs_initialized = 0;
533static uint8_t ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[17][CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE(256)][4];
534
535static void _ccv_nnc_8i_rowwise_packed_iq3s_init(void)
536{
537 if (ccv_nnc_8i_rowwise_packed_iq3s_initialized)
538 return;
539 int index, j, scale;
540 for (scale = 1; scale <= 16; scale++)
541 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE(512); index++)
542 for (j = 0; j < 4; j++)
543 ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[scale][index][j] = (uint8_t)ccv_min(_ccv_nnc_8i_rowwise_packed_iq3s_value(index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3s_value(index, j) * scale
) _a = (_ccv_nnc_8i_rowwise_packed_iq3s_value(index, j) * scale
); typeof (127) _b = (127); (_a < _b) ? _a : _b; })
;
544 ccv_nnc_8i_rowwise_packed_iq3s_initialized = 1;
545}
546
547static void _ccv_nnc_8i_rowwise_packed_iq3xxs_init(void)
548{
549 if (ccv_nnc_8i_rowwise_packed_iq3xxs_initialized)
550 return;
551 int index, j, scale;
552 for (scale = 1; scale <= 16; scale++)
553 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE(256); index++)
554 for (j = 0; j < 4; j++)
555 ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[scale][index][j] = (uint8_t)ccv_min(_ccv_nnc_8i_rowwise_packed_iq3xxs_value(index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(index, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(index, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
556 ccv_nnc_8i_rowwise_packed_iq3xxs_initialized = 1;
557}
558
559static double _ccv_nnc_8i_rowwise_packed_iq3s_sse(const double* const ay, const double* const w, const int lane, const int scale, const int index)
560{
561 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[scale][index];
562 double d = (double)mag[0] - ay[lane];
563 double sse = w[lane] * d * d;
564 d = (double)mag[1] - ay[lane + 1];
565 sse += w[lane + 1] * d * d;
566 d = (double)mag[2] - ay[lane + 2];
567 sse += w[lane + 2] * d * d;
568 d = (double)mag[3] - ay[lane + 3];
569 sse += w[lane + 3] * d * d;
570 return sse;
571}
572
573static double _ccv_nnc_8i_rowwise_packed_iq3xxs_sse(const double* const ay, const double* const w, const int lane, const int scale, const int index)
574{
575 const uint8_t* const mag = ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[scale][index];
576 double d = (double)mag[0] - ay[lane];
577 double sse = w[lane] * d * d;
578 d = (double)mag[1] - ay[lane + 1];
579 sse += w[lane + 1] * d * d;
580 d = (double)mag[2] - ay[lane + 2];
581 sse += w[lane + 2] * d * d;
582 d = (double)mag[3] - ay[lane + 3];
583 sse += w[lane + 3] * d * d;
584 return sse;
585}
586
587static void _ccv_nnc_8i_rowwise_packed_quant_iq2_xxs(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
588{
589 assert(ccv_nnc_8i_rowwise_packed_iq2xxs_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq2xxs_initialized
) ? 1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq2xxs_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq2xxs_initialized"
, "ccv_nnc_8i_rowwise.c", 589, __extension__ __PRETTY_FUNCTION__
); }))
;
590 double best_sse = DBL_MAX1.7976931348623157e+308;
591 int best_scale_code = 0;
592 int best_grid[4] = {0};
593 int best_sign[4] = {0};
594 int scale_code;
595 for (scale_code = 0; scale_code < 16; scale_code++)
596 {
597 const int scale = ccv_nnc_8i_rowwise_packed_iq2xxs_scales[scale_code];
598 double group_sse = 0;
599 int group_grid[4] = {0};
600 int group_sign[4] = {0};
601 int sg;
602 for (sg = 0; sg < 4; sg++)
603 {
604 double best_sub_sse = DBL_MAX1.7976931348623157e+308;
605 int best_sub_grid = 0;
606 int best_sub_sign = 0;
607 const int lane = sg * 8;
608 int index;
609 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2XXS_GRID_SIZE; index++)
610 {
611 int sign;
612 const double sse = _ccv_nnc_8i_rowwise_packed_iq2xxs_best_sign_sse(y, w, lane, scale, index, &sign);
613 if (sse < best_sub_sse)
614 {
615 best_sub_sse = sse;
616 best_sub_grid = index;
617 best_sub_sign = sign;
618 }
619 }
620 group_sse += best_sub_sse;
621 group_grid[sg] = best_sub_grid;
622 group_sign[sg] = best_sub_sign;
623 }
624 if (group_sse < best_sse)
625 {
626 best_sse = group_sse;
627 best_scale_code = scale_code;
628 memcpy(best_grid, group_grid, sizeof(best_grid));
629 memcpy(best_sign, group_sign, sizeof(best_sign));
630 }
631 }
632 group->scale = best_scale_code;
633 group->signs = 0;
634 memcpy(group->grid, best_grid, sizeof(best_grid));
635 int j;
636 for (j = 0; j < 4; j++)
637 group->signs |= (uint32_t)best_sign[j] << (j * 7);
638 for (j = 0; j < 32; j++)
639 {
640 const int sg = j >> 3;
641 const int lane = j & 7;
642 const uint8_t signs = ccv_nnc_8i_rowwise_packed_iq2xxs_ksigns[best_sign[sg]];
643 const int mag = ccv_nnc_8i_rowwise_packed_iq2xxs_scaled_value[ccv_nnc_8i_rowwise_packed_iq2xxs_scales[best_scale_code]][best_grid[sg]][lane];
644 group->q8[j] = (signs & (1u << lane)) ? -mag : mag;
645 }
646}
647
648static void _ccv_nnc_8i_rowwise_packed_quant_iq2_s(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
649{
650 assert(ccv_nnc_8i_rowwise_packed_iq2s_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq2s_initialized) ?
1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq2s_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq2s_initialized"
, "ccv_nnc_8i_rowwise.c", 650, __extension__ __PRETTY_FUNCTION__
); }))
;
651 double best_sse = DBL_MAX1.7976931348623157e+308;
652 int best_scale = 1;
653 int best_grid[2] = {0};
654 double ay[16];
655 double wy[16];
656 uint32_t signs = 0;
657 int j;
658 for (j = 0; j < 16; j++)
659 {
660 ay[j] = fabs(y[j]);
661 wy[j] = w[j] * ay[j];
662 if (y[j] < 0)
663 signs |= (1u << j);
664 }
665 double sub_sse[2][65];
666 int sub_grid[2][65];
667 int sg, scale;
668 for (sg = 0; sg < 2; sg++)
669 for (scale = 1; scale <= 64; scale++)
670 {
671 sub_sse[sg][scale] = DBL_MAX1.7976931348623157e+308;
672 sub_grid[sg][scale] = 0;
673 }
674 for (sg = 0; sg < 2; sg++)
675 {
676 const int lane = sg * 8;
677 double sum_y2 = 0;
678 for (j = 0; j < 8; j++)
679 sum_y2 += w[lane + j] * ay[lane + j] * ay[lane + j];
680 int index;
681 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ2S_GRID_SIZE; index++)
682 {
683 double sw[3] = {0};
684 double swy[3] = {0};
685 for (j = 0; j < 8; j++)
686 {
687 const int level = ccv_nnc_8i_rowwise_packed_iq2s_level[index][j];
688 sw[level] += w[lane + j];
689 swy[level] += wy[lane + j];
690 }
691 for (scale = 1; scale <= 64; scale++)
692 {
693 const double sse = sum_y2 +
694 sw[0] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][0] - 2 * swy[0] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][0] +
695 sw[1] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][1] - 2 * swy[1] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][1] +
696 sw[2] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level2[scale][2] - 2 * swy[2] * (double)ccv_nnc_8i_rowwise_packed_iq2s_scale_level[scale][2];
697 if (sub_sse[sg][scale] == DBL_MAX1.7976931348623157e+308 || sse <= sub_sse[sg][scale] + ccv_max(1., fabs(sub_sse[sg][scale]))({ typeof (1.) _a = (1.); typeof (fabs(sub_sse[sg][scale])) _b
= (fabs(sub_sse[sg][scale])); (_a > _b) ? _a : _b; })
* 1e-9)
698 {
699 const double exact_sse = _ccv_nnc_8i_rowwise_packed_iq2s_sse(ay, w, lane, scale, index);
700 if (exact_sse < sub_sse[sg][scale])
701 {
702 sub_sse[sg][scale] = exact_sse;
703 sub_grid[sg][scale] = index;
704 }
705 }
706 }
707 }
708 }
709 for (scale = 1; scale <= 64; scale++)
710 {
711 const double group_sse = sub_sse[0][scale] + sub_sse[1][scale];
712 if (group_sse < best_sse)
713 {
714 best_sse = group_sse;
715 best_scale = scale;
716 best_grid[0] = sub_grid[0][scale];
717 best_grid[1] = sub_grid[1][scale];
718 }
719 }
720 group->scale = best_scale;
721 group->signs = signs;
722 memcpy(group->grid, best_grid, sizeof(best_grid));
723 for (j = 0; j < 16; j++)
724 {
725 const int sg = j >> 3;
726 const int lane = j & 7;
727 const int mag = ccv_nnc_8i_rowwise_packed_iq2s_scaled_value[best_scale][best_grid[sg]][lane];
728 group->q8[j] = (signs & (1u << j)) ? -mag : mag;
729 }
730}
731
732static void _ccv_nnc_8i_rowwise_packed_quant_iq2_xs(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
733{
734 static const int scales[16] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32};
735 double best_sse = DBL_MAX1.7976931348623157e+308;
736 int best_scale_code = 0;
737 int best_grid = 0;
738 int best_q8[16] = {0};
739 uint32_t signs = 0;
740 int j;
741 for (j = 0; j < 8; j++)
742 if (y[j] < 0)
743 signs |= (1u << j);
744 int scale_code;
745 for (scale_code = 0; scale_code < 16; scale_code++)
746 {
747 const int scale = scales[scale_code];
748 int index;
749 for (index = 0; index < 512; index++)
750 {
751 double sse = 0;
752 int q8[16] = {0};
753 for (j = 0; j < 8; j++)
754 {
755 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid, index, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid
, index, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2xs_grid, index, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
756 q8[j] = (signs & (1u << j)) ? -mag : mag;
757 const double d = q8[j] - y[j];
758 sse += w[j] * d * d;
759 }
760 if (sse < best_sse)
761 {
762 best_sse = sse;
763 best_scale_code = scale_code;
764 best_grid = index;
765 memcpy(best_q8, q8, sizeof(best_q8));
766 }
767 }
768 }
769 group->scale = best_scale_code;
770 group->grid[0] = best_grid;
771 group->signs = signs;
772 memcpy(group->q8, best_q8, sizeof(best_q8));
773}
774
775static void _ccv_nnc_8i_rowwise_packed_quant_iq3_s(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
776{
777 assert(ccv_nnc_8i_rowwise_packed_iq3s_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq3s_initialized) ?
1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq3s_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq3s_initialized"
, "ccv_nnc_8i_rowwise.c", 777, __extension__ __PRETTY_FUNCTION__
); }))
;
778 double best_sse = DBL_MAX1.7976931348623157e+308;
779 int best_scale = 1;
780 int best_grid[4] = {0};
781 double ay[16];
782 uint32_t signs = 0;
783 int j;
784 for (j = 0; j < 16; j++)
785 {
786 ay[j] = fabs(y[j]);
787 if (y[j] < 0)
788 signs |= (1u << j);
789 }
790 int scale;
791 for (scale = 1; scale <= 16; scale++)
792 {
793 double group_sse = 0;
794 int group_grid[4] = {0};
795 int sg;
796 for (sg = 0; sg < 4; sg++)
797 {
798 double best_sub_sse = DBL_MAX1.7976931348623157e+308;
799 int best_sub_grid = 0;
800 const int lane = sg * 4;
801 int index;
802 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3S_GRID_SIZE(512); index++)
803 {
804 const double sse = _ccv_nnc_8i_rowwise_packed_iq3s_sse(ay, w, lane, scale, index);
805 if (sse < best_sub_sse)
806 {
807 best_sub_sse = sse;
808 best_sub_grid = index;
809 }
810 }
811 group_sse += best_sub_sse;
812 group_grid[sg] = best_sub_grid;
813 }
814 if (group_sse < best_sse)
815 {
816 best_sse = group_sse;
817 best_scale = scale;
818 memcpy(best_grid, group_grid, sizeof(best_grid));
819 }
820 }
821 group->scale = best_scale;
822 group->signs = signs;
823 memcpy(group->grid, best_grid, sizeof(best_grid));
824 for (j = 0; j < 16; j++)
825 {
826 const int sg = j >> 2;
827 const int lane = j & 3;
828 const int mag = ccv_nnc_8i_rowwise_packed_iq3s_scaled_value[best_scale][best_grid[sg]][lane];
829 group->q8[j] = (signs & (1u << j)) ? -mag : mag;
830 }
831}
832
833static void _ccv_nnc_8i_rowwise_packed_quant_iq3_xxs(const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
834{
835 assert(ccv_nnc_8i_rowwise_packed_iq3xxs_initialized)((void) sizeof ((ccv_nnc_8i_rowwise_packed_iq3xxs_initialized
) ? 1 : 0), __extension__ ({ if (ccv_nnc_8i_rowwise_packed_iq3xxs_initialized
) ; else __assert_fail ("ccv_nnc_8i_rowwise_packed_iq3xxs_initialized"
, "ccv_nnc_8i_rowwise.c", 835, __extension__ __PRETTY_FUNCTION__
); }))
;
836 double best_sse = DBL_MAX1.7976931348623157e+308;
837 int best_scale = 1;
838 int best_grid[2] = {0};
839 double ay[8];
840 uint32_t signs = 0;
841 int j;
842 for (j = 0; j < 8; j++)
843 {
844 ay[j] = fabs(y[j]);
845 if (y[j] < 0)
846 signs |= (1u << j);
847 }
848 int scale;
849 for (scale = 1; scale <= 16; scale++)
850 {
851 double group_sse = 0;
852 int group_grid[2] = {0};
853 int sg;
854 for (sg = 0; sg < 2; sg++)
855 {
856 double best_sub_sse = DBL_MAX1.7976931348623157e+308;
857 int best_sub_grid = 0;
858 const int lane = sg * 4;
859 int index;
860 for (index = 0; index < CCV_NNC_8I_ROWWISE_PACKED_IQ3XXS_GRID_SIZE(256); index++)
861 {
862 const double sse = _ccv_nnc_8i_rowwise_packed_iq3xxs_sse(ay, w, lane, scale, index);
863 if (sse < best_sub_sse)
864 {
865 best_sub_sse = sse;
866 best_sub_grid = index;
867 }
868 }
869 group_sse += best_sub_sse;
870 group_grid[sg] = best_sub_grid;
871 }
872 if (group_sse < best_sse)
873 {
874 best_sse = group_sse;
875 best_scale = scale;
876 memcpy(best_grid, group_grid, sizeof(best_grid));
877 }
878 }
879 group->scale = best_scale;
880 group->signs = signs;
881 memcpy(group->grid, best_grid, sizeof(best_grid));
882 memset(group->q8, 0, sizeof(group->q8));
883 for (j = 0; j < 8; j++)
884 {
885 const int sg = j >> 2;
886 const int lane = j & 3;
887 const int mag = ccv_nnc_8i_rowwise_packed_iq3xxs_scaled_value[best_scale][best_grid[sg]][lane];
888 group->q8[j] = (signs & (1u << j)) ? -mag : mag;
889 }
890}
891
892static void _ccv_nnc_8i_rowwise_packed_quant_group(const int format, const double* const y, const double* const w, ccv_nnc_8i_rowwise_packed_group_t* const group)
893{
894 switch (format)
895 {
896 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
897 _ccv_nnc_8i_rowwise_packed_quant_q5(y, w, group);
898 break;
899 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
900 _ccv_nnc_8i_rowwise_packed_quant_q4(y, w, group);
901 break;
902 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
903 _ccv_nnc_8i_rowwise_packed_quant_q3(y, w, group);
904 break;
905 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
906 _ccv_nnc_8i_rowwise_packed_quant_q2(y, w, group);
907 break;
908 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
909 _ccv_nnc_8i_rowwise_packed_quant_iq2_xxs(y, w, group);
910 break;
911 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
912 _ccv_nnc_8i_rowwise_packed_quant_iq2_s(y, w, group);
913 break;
914 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
915 _ccv_nnc_8i_rowwise_packed_quant_iq2_xs(y, w, group);
916 break;
917 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
918 _ccv_nnc_8i_rowwise_packed_quant_iq3_s(y, w, group);
919 break;
920 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
921 _ccv_nnc_8i_rowwise_packed_quant_iq3_xxs(y, w, group);
922 break;
923 default:
924 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 924, __extension__ __PRETTY_FUNCTION__
); }))
;
925 }
926}
927
928static void _ccv_nnc_8i_rowwise_packed_pack_group(uint8_t* const output, const size_t group_index, const int format, const ccv_nnc_8i_rowwise_packed_group_t* const group)
929{
930 const size_t bit_offset = group_index * _ccv_nnc_8i_rowwise_x_group_bits(format);
931 size_t bit = bit_offset;
932 int j;
933 switch (format)
934 {
935 case CCV_NNC_QX_8I_ROWWISE_Q5_K:
936 for (j = 0; j < 16; j++, bit += 5)
937 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->q[j] + 16), 5);
938 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 3);
939 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 3, (uint32_t)(group->b + 16), 5);
940 break;
941 case CCV_NNC_QX_8I_ROWWISE_Q4_K:
942 for (j = 0; j < 16; j++, bit += 4)
943 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->q[j] + 8), 4);
944 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 4);
945 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 4, (uint32_t)(group->b + 8), 4);
946 break;
947 case CCV_NNC_QX_8I_ROWWISE_Q3_K:
948 for (j = 0; j < 16; j++, bit += 3)
949 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->q[j] + 4), 3);
950 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 5);
951 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 5, (uint32_t)(group->b / 2 + 4), 3);
952 break;
953 case CCV_NNC_QX_8I_ROWWISE_Q2_K:
954 for (j = 0; j < 16; j++, bit += 2)
955 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->q[j], 2);
956 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)(group->m - 1), 6);
957 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 6, (uint32_t)(group->z >> 3), 4);
958 break;
959 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
960 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->grid[0], 10);
961 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 10, (uint32_t)group->grid[1], 10);
962 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 20, group->signs, 16);
963 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 36, (uint32_t)(group->scale - 1), 6);
964 break;
965 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS:
966 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->grid[0], 9);
967 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 9, group->signs, 8);
968 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 17, (uint32_t)group->scale, 4);
969 break;
970 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
971 for (j = 0; j < 4; j++)
972 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + j * 8, (uint32_t)group->grid[j], 8);
973 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 32, group->signs, 28);
974 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 60, (uint32_t)group->scale, 4);
975 break;
976 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
977 for (j = 0; j < 4; j++)
978 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + j * 9, (uint32_t)group->grid[j], 9);
979 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 36, group->signs, 16);
980 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 52, (uint32_t)(group->scale - 1), 4);
981 break;
982 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
983 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit, (uint32_t)group->grid[0], 8);
984 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 8, (uint32_t)group->grid[1], 8);
985 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 16, group->signs, 8);
986 _ccv_nnc_8i_rowwise_packed_write_bits(output, bit + 24, (uint32_t)(group->scale - 1), 4);
987 break;
988 default:
989 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 989, __extension__ __PRETTY_FUNCTION__
); }))
;
990 }
991}
992
993static void _ccv_nnc_8i_rowwise_packed_decode_group(const uint8_t* const input, const size_t group_index, const int format, int* const q8)
994{
995 static const int q2_xs_scales[16] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32};
996 const size_t bit_offset = group_index * _ccv_nnc_8i_rowwise_x_group_bits(format);
997 size_t bit = bit_offset;
998 int j;
999 switch (format)
1000 {
1001 case CCV_NNC_QX_8I_ROWWISE_Q5_K: {
1002 int q[16];
1003 for (j = 0; j < 16; j++, bit += 5)
1004 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 5) - 16;
1005 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 3) + 1;
1006 const int b = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 3, 5) - 16;
1007 for (j = 0; j < 16; j++)
1008 q8[j] = q[j] * m + b;
1009 break;
1010 }
1011 case CCV_NNC_QX_8I_ROWWISE_Q4_K: {
1012 int q[16];
1013 for (j = 0; j < 16; j++, bit += 4)
1014 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 4) - 8;
1015 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 4) + 1;
1016 const int b = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 4, 4) - 8;
1017 for (j = 0; j < 16; j++)
1018 q8[j] = q[j] * m + b;
1019 break;
1020 }
1021 case CCV_NNC_QX_8I_ROWWISE_Q3_K: {
1022 int q[16];
1023 for (j = 0; j < 16; j++, bit += 3)
1024 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 3) - 4;
1025 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 5) + 1;
1026 const int b = ((int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 5, 3) - 4) << 1;
1027 for (j = 0; j < 16; j++)
1028 q8[j] = q[j] * m + b;
1029 break;
1030 }
1031 case CCV_NNC_QX_8I_ROWWISE_Q2_K: {
1032 int q[16];
1033 for (j = 0; j < 16; j++, bit += 2)
1034 q[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 2);
1035 const int m = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 6) + 1;
1036 const int z = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 6, 4) << 3;
1037 for (j = 0; j < 16; j++)
1038 q8[j] = q[j] * m - z;
1039 break;
1040 }
1041 case CCV_NNC_QX_8I_ROWWISE_IQ2_S: {
1042 const int grid0 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 10);
1043 const int grid1 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 10, 10);
1044 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 20, 16);
1045 const int scale = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 36, 6) + 1;
1046 for (j = 0; j < 8; j++)
1047 {
1048 const int mag0 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid0, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid
, grid0, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid0, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
1049 const int mag1 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid1, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2s_grid
, grid1, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2s_grid, grid1, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
1050 q8[j] = (signs & (1u << j)) ? -mag0 : mag0;
1051 q8[8 + j] = (signs & (1u << (8 + j))) ? -mag1 : mag1;
1052 }
1053 break;
1054 }
1055 case CCV_NNC_QX_8I_ROWWISE_IQ2_XS: {
1056 const int grid0 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 9);
1057 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 9, 8);
1058 const int scale = q2_xs_scales[_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 17, 4)];
1059 for (j = 0; j < 8; j++)
1060 {
1061 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid, grid0, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2_value(ccv_nnc_8i_rowwise_packed_iq2xs_grid
, grid0, j) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2_value
(ccv_nnc_8i_rowwise_packed_iq2xs_grid, grid0, j) * scale); typeof
(127) _b = (127); (_a < _b) ? _a : _b; })
;
1062 q8[j] = (signs & (1u << j)) ? -mag : mag;
1063 }
1064 break;
1065 }
1066 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS: {
1067 int grid[4];
1068 for (j = 0; j < 4; j++)
1069 grid[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + j * 8, 8);
1070 const uint32_t sign_codes = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 32, 28);
1071 const int scale = ccv_nnc_8i_rowwise_packed_iq2xxs_scales[_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 60, 4)];
1072 int sg;
1073 for (sg = 0; sg < 4; sg++)
1074 {
1075 const uint8_t signs = ccv_nnc_8i_rowwise_packed_iq2xxs_ksigns[(sign_codes >> (sg * 7)) & 0x7f];
1076 for (j = 0; j < 8; j++)
1077 {
1078 const int lane = sg * 8 + j;
1079 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq2xxs_value(grid[sg], j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(grid[sg], j
) * scale) _a = (_ccv_nnc_8i_rowwise_packed_iq2xxs_value(grid
[sg], j) * scale); typeof (127) _b = (127); (_a < _b) ? _a
: _b; })
;
1080 q8[lane] = (signs & (1u << j)) ? -mag : mag;
1081 }
1082 }
1083 break;
1084 }
1085 case CCV_NNC_QX_8I_ROWWISE_IQ3_S: {
1086 int grid[4];
1087 for (j = 0; j < 4; j++)
1088 grid[j] = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + j * 9, 9);
1089 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 36, 16);
1090 const int scale = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 52, 4) + 1;
1091 int sg;
1092 for (sg = 0; sg < 4; sg++)
1093 for (j = 0; j < 4; j++)
1094 {
1095 const int lane = sg * 4 + j;
1096 const int mag = ccv_min(_ccv_nnc_8i_rowwise_packed_iq3s_value(grid[sg], j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3s_value(grid[sg], j)
* scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3s_value(grid[sg
], j) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b
; })
;
1097 q8[lane] = (signs & (1u << lane)) ? -mag : mag;
1098 }
1099 break;
1100 }
1101 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS: {
1102 const int grid0 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit, 8);
1103 const int grid1 = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 8, 8);
1104 const uint32_t signs = _ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 16, 8);
1105 const int scale = (int)_ccv_nnc_8i_rowwise_packed_read_bits(input, bit + 24, 4) + 1;
1106 for (j = 0; j < 4; j++)
1107 {
1108 const int mag0 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid0, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid0, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid0, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
1109 const int mag1 = ccv_min(_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid1, j) * scale, 127)({ typeof (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid1, j) *
scale) _a = (_ccv_nnc_8i_rowwise_packed_iq3xxs_value(grid1, j
) * scale); typeof (127) _b = (127); (_a < _b) ? _a : _b; }
)
;
1110 q8[j] = (signs & (1u << j)) ? -mag0 : mag0;
1111 q8[4 + j] = (signs & (1u << (4 + j))) ? -mag1 : mag1;
1112 }
1113 break;
1114 }
1115 default:
1116 assert(0)((void) sizeof ((0) ? 1 : 0), __extension__ ({ if (0) ; else __assert_fail
("0", "ccv_nnc_8i_rowwise.c", 1116, __extension__ __PRETTY_FUNCTION__
); }))
;
1117 }
1118}
1119
1120CCV_WARN_UNUSED(size_t)size_t __attribute__((warn_unused_result)) ccv_nnc_quantize_8i_rowwise_x(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, const int format, const float* const imatrix, void* output, const size_t output_length)
1121{
1122 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1122, __extension__ __PRETTY_FUNCTION__
); }))
;
1
Assuming 'datatype' is not equal to CCV_16F
2
Assuming 'datatype' is not equal to CCV_16BF
3
Assuming 'datatype' is not equal to CCV_32F
4
Assuming 'datatype' is equal to CCV_64F
5
Taking true branch
1123 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1123, __extension__ __PRETTY_FUNCTION__
); }))
;
6
Assuming 'memory_type' is equal to CCV_TENSOR_CPU_MEMORY
7
Taking true branch
1124 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1124, __extension__ __PRETTY_FUNCTION__
); }))
;
8
Assuming 'row_length' is > 0
9
Taking true branch
1125 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1125
, __extension__ __PRETTY_FUNCTION__); }))
;
10
Assuming the condition is true
11
Taking true branch
1126 const size_t row_count = input_length / row_length;
1127 const size_t group_size = _ccv_nnc_8i_rowwise_x_group_size(format);
1128 const int group_bits = _ccv_nnc_8i_rowwise_x_group_bits(format);
1129 const size_t groups_per_row = (row_length + group_size - 1) / group_size;
1130 const size_t padded_row_length = groups_per_row * group_size;
1131 const size_t scale_offset = _ccv_nnc_8i_rowwise_packed_scale_offset(format, input_length, row_length);
1132 const size_t output_size = scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]
;
1133 assert(output_length >= output_size)((void) sizeof ((output_length >= output_size) ? 1 : 0), __extension__
({ if (output_length >= output_size) ; else __assert_fail
("output_length >= output_size", "ccv_nnc_8i_rowwise.c", 1133
, __extension__ __PRETTY_FUNCTION__); }))
;
12
Assuming 'output_length' is >= 'output_size'
13
Taking true branch
1134 switch (format)
14
'Default' branch taken. Execution continues on line 1149
1135 {
1136 case CCV_NNC_QX_8I_ROWWISE_IQ2_XXS:
1137 _ccv_nnc_8i_rowwise_packed_iq2xxs_init();
1138 break;
1139 case CCV_NNC_QX_8I_ROWWISE_IQ2_S:
1140 _ccv_nnc_8i_rowwise_packed_iq2s_init();
1141 break;
1142 case CCV_NNC_QX_8I_ROWWISE_IQ3_S:
1143 _ccv_nnc_8i_rowwise_packed_iq3s_init();
1144 break;
1145 case CCV_NNC_QX_8I_ROWWISE_IQ3_XXS:
1146 _ccv_nnc_8i_rowwise_packed_iq3xxs_init();
1147 break;
1148 }
1149 uint8_t* const u8 = (uint8_t*)output;
1150 uint8_t* const scales = u8 + scale_offset;
1151 memset(u8, 0, scale_offset);
1152 const size_t row_bits = groups_per_row * group_bits;
1153 size_t rows_per_chunk;
1154 switch (row_bits & 7)
15
Control jumps to 'case 6:' at line 1163
1155 {
1156 case 0:
1157 rows_per_chunk = 1;
1158 break;
1159 case 4:
1160 rows_per_chunk = 2;
1161 break;
1162 case 2:
1163 case 6:
1164 rows_per_chunk = 4;
1165 break;
16
Execution continues on line 1170
1166 default:
1167 rows_per_chunk = 8;
1168 break;
1169 }
1170 const size_t row_chunks = (row_count + rows_per_chunk - 1) / rows_per_chunk;
1171 parallel_for(chunk_idx, (int)row_chunks){ int chunk_idx; for ((chunk_idx) = 0; (chunk_idx) < ((int
)row_chunks); (chunk_idx)++) {
{
17
Assuming 'chunk_idx' is < 'row_chunks'
18
Loop condition is true. Entering loop body
1172 const size_t chunk_begin = (size_t)chunk_idx * rows_per_chunk;
1173 const size_t chunk_end = ccv_min(chunk_begin + rows_per_chunk, row_count)({ typeof (chunk_begin + rows_per_chunk) _a = (chunk_begin + rows_per_chunk
); typeof (row_count) _b = (row_count); (_a < _b) ? _a : _b
; })
;
19
Assuming '_a' is >= '_b'
20
'?' condition is false
1174 size_t i;
1175 for (i = chunk_begin; i < chunk_end; i++)
21
Assuming 'i' is < 'chunk_end'
22
Loop condition is true. Entering loop body
1176 {
1177 double* const row = (double*)ccmallocmalloc(sizeof(double) * padded_row_length);
1178 double* const weights = (double*)ccmallocmalloc(sizeof(double) * padded_row_length);
23
Storing uninitialized value
1179 int* const q8 = (int*)ccmallocmalloc(sizeof(int) * padded_row_length);
1180 const size_t row_start = i * row_length;
1181 _ccv_nnc_8i_rowwise_packed_read_row(input, datatype, row_start, row_length, padded_row_length, row);
1182 double max_abs = 0;
1183 size_t j;
1184 for (j = 0; j
23.1
'j' is < 'row_length'
< row_length
; j++)
24
Loop condition is true. Entering loop body
27
Assuming 'j' is >= 'row_length'
28
Loop condition is false. Execution continues on line 1189
1185 {
1186 max_abs = ccv_max(max_abs, fabs(row[j]))({ typeof (max_abs) _a = (max_abs); typeof (fabs(row[j])) _b =
(fabs(row[j])); (_a > _b) ? _a : _b; })
;
25
Assuming '_a' is <= '_b'
26
'?' condition is false
1187 weights[j] = _ccv_nnc_8i_rowwise_weight(imatrix, j);
1188 }
1189 for (; j < padded_row_length; j++)
29
Assuming 'j' is >= 'padded_row_length'
30
Loop condition is false. Execution continues on line 1191
1190 weights[j] = 0;
1191 double scale = max_abs / 127.;
1192 double best_scale = 0;
1193 double best_sse = DBL_MAX1.7976931348623157e+308;
1194 int k;
1195 for (k = 0; k < 8; k++)
31
Loop condition is true. Entering loop body
1196 {
1197 const double stored_scale = _ccv_nnc_8i_rowwise_packed_stored_scale(scale, datatype);
1198 if (!(stored_scale > 0))
32
Assuming 'stored_scale' is > 0
33
Taking false branch
1199 break;
1200 size_t g;
1201 for (g = 0; g < groups_per_row; g++)
34
Assuming 'g' is < 'groups_per_row'
35
Loop condition is true. Entering loop body
1202 {
1203 double y[32] = {0};
1204 double w[32] = {0};
1205 for (j = 0; j < group_size; j++)
36
Assuming 'j' is < 'group_size'
37
Loop condition is true. Entering loop body
38
Assuming 'j' is < 'group_size'
39
Loop condition is true. Entering loop body
1206 {
1207 y[j] = row[g * group_size + j] / stored_scale;
1208 w[j] = weights[g * group_size + j];
40
Assigned value is garbage or undefined
1209 }
1210 ccv_nnc_8i_rowwise_packed_group_t group;
1211 _ccv_nnc_8i_rowwise_packed_quant_group(format, y, w, &group);
1212 memcpy(q8 + g * group_size, group.q8, sizeof(int) * group_size);
1213 }
1214 double sse = 0;
1215 double sum_qx = 0;
1216 double sum_qq = 0;
1217 for (j = 0; j < row_length; j++)
1218 {
1219 const double d = row[j] - stored_scale * q8[j];
1220 sse += weights[j] * d * d;
1221 sum_qx += weights[j] * q8[j] * row[j];
1222 sum_qq += weights[j] * q8[j] * q8[j];
1223 }
1224 if (sse < best_sse)
1225 {
1226 best_sse = sse;
1227 best_scale = stored_scale;
1228 }
1229 if (!(sum_qq > 0) || !(sum_qx > 0))
1230 break;
1231 const double next_scale = sum_qx / sum_qq;
1232 if (_ccv_nnc_8i_rowwise_packed_stored_scale(next_scale, datatype) == stored_scale)
1233 break;
1234 scale = next_scale;
1235 }
1236 _ccv_nnc_8i_rowwise_packed_store_scale(scales, datatype, i, best_scale);
1237 const double final_scale = best_scale > 0 ? best_scale : 1;
1238 size_t g;
1239 for (g = 0; g < groups_per_row; g++)
1240 {
1241 double y[32] = {0};
1242 double w[32] = {0};
1243 for (j = 0; j < group_size; j++)
1244 {
1245 y[j] = row[g * group_size + j] / final_scale;
1246 w[j] = weights[g * group_size + j];
1247 }
1248 ccv_nnc_8i_rowwise_packed_group_t group;
1249 _ccv_nnc_8i_rowwise_packed_quant_group(format, y, w, &group);
1250 _ccv_nnc_8i_rowwise_packed_pack_group(u8, i * groups_per_row + g, format, &group);
1251 }
1252 ccfreefree(q8);
1253 ccfreefree(weights);
1254 ccfreefree(row);
1255 }
1256 } parallel_endfor} }
1257 return output_size;
1258}
1259
1260void ccv_nnc_dequantize_8i_rowwise_x(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, const int format, void* output, const size_t output_length)
1261{
1262 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1262, __extension__ __PRETTY_FUNCTION__
); }))
;
1263 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1263, __extension__ __PRETTY_FUNCTION__
); }))
;
1264 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1264, __extension__ __PRETTY_FUNCTION__
); }))
;
1265 assert(output_length % row_length == 0)((void) sizeof ((output_length % row_length == 0) ? 1 : 0), __extension__
({ if (output_length % row_length == 0) ; else __assert_fail
("output_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1265
, __extension__ __PRETTY_FUNCTION__); }))
;
1266 const size_t row_count = output_length / row_length;
1267 const size_t group_size = _ccv_nnc_8i_rowwise_x_group_size(format);
1268 const size_t groups_per_row = (row_length + group_size - 1) / group_size;
1269 const size_t scale_offset = _ccv_nnc_8i_rowwise_packed_scale_offset(format, output_length, row_length);
1270 assert(input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype))((void) sizeof ((input_length >= scale_offset + row_count *
_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]) ? 1 : 0), __extension__ ({ if (input_length >= scale_offset
+ row_count * _ccv_get_data_type_size[((datatype) & 0xFF000
) >> 12]) ; else __assert_fail ("input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)"
, "ccv_nnc_8i_rowwise.c", 1270, __extension__ __PRETTY_FUNCTION__
); }))
;
1271 const uint8_t* const u8 = (const uint8_t*)input;
1272 const uint8_t* const scales = u8 + scale_offset;
1273 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1274 const double scale = _ccv_nnc_8i_rowwise_packed_load_scale(scales, datatype, i);
1275 size_t g;
1276 for (g = 0; g < groups_per_row; g++)
1277 {
1278 int q8[32] = {0};
1279 _ccv_nnc_8i_rowwise_packed_decode_group(u8, (size_t)i * groups_per_row + g, format, q8);
1280 size_t j;
1281 for (j = 0; j < group_size; j++)
1282 {
1283 const size_t col = g * group_size + j;
1284 if (col < row_length)
1285 _ccv_nnc_8i_rowwise_packed_write_value(output, datatype, (size_t)i * row_length + col, scale * q8[j]);
1286 }
1287 }
1288 } parallel_endfor} }
1289}
1290
1291static inline int _ccv_nnc_8i_rowwise_quantize(const double v, const double inv_scale)
1292{
1293 const int q = (int)lrint(v * inv_scale);
1294 return ccv_clamp(q, -127, 127)({ typeof (-127) _a = (-127); typeof (127) _b = (127); typeof
(q) _x = (q); (_x < _a) ? _a : ((_x > _b) ? _b : _x); }
)
;
1295}
1296
// Quantize one row of half-precision values to int8 with a single per-row scale.
// row: row_length half-precision inputs.
// imatrix: passed through to _ccv_nnc_8i_rowwise_weight to obtain per-column error weights.
// q: receives row_length quantized values.
// Returns the chosen scale as it reads back after a round trip through half precision,
// or 0 (with q zeroed) when the row is all zero or no positive scale could be stored.
static float _ccv_nnc_quantize_8i_rowwise_16f(const uint16_t* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
{
	size_t col;
	float value;
	double peak = 0;
	for (col = 0; col < row_length; col++)
	{
		ccv_half_precision_to_float(row + col, &value, 1);
		const double magnitude = fabs(value);
		if (!(peak > magnitude))
			peak = magnitude;
	}
	if (peak == 0)
	{
		memset(q, 0, row_length);
		return 0;
	}
	double candidate = peak / 127.;
	float chosen_scale = 0;
	double lowest_sse = DBL_MAX;
	int iter;
	for (iter = 0; iter < 8; iter++)
	{
		// Evaluate the candidate as it will read back from half precision, then refit
		// it by weighted least squares for the next round.
		const float candidate_f = (float)candidate;
		uint16_t candidate_h;
		float stored_scale;
		ccv_float_to_half_precision(&candidate_f, &candidate_h, 1);
		ccv_half_precision_to_float(&candidate_h, &stored_scale, 1);
		if (!(stored_scale > 0))
			break;
		const double inv_scale = 1. / stored_scale;
		double sse = 0;
		double sum_qx = 0;
		double sum_qq = 0;
		for (col = 0; col < row_length; col++)
		{
			const double weight = _ccv_nnc_8i_rowwise_weight(imatrix, col);
			ccv_half_precision_to_float(row + col, &value, 1);
			const double v = value;
			const int qv = _ccv_nnc_8i_rowwise_quantize(v, inv_scale);
			const double err = v - stored_scale * qv;
			sse += weight * err * err;
			sum_qx += weight * qv * v;
			sum_qq += weight * qv * qv;
		}
		if (sse < lowest_sse)
		{
			lowest_sse = sse;
			chosen_scale = stored_scale;
		}
		if (!(sum_qq > 0 && sum_qx > 0))
			break;
		const double refit = sum_qx / sum_qq;
		const float refit_f = (float)refit;
		uint16_t refit_h;
		float refit_stored;
		ccv_float_to_half_precision(&refit_f, &refit_h, 1);
		ccv_half_precision_to_float(&refit_h, &refit_stored, 1);
		// Converged once the refit maps to the same stored value.
		if (refit_stored == stored_scale)
			break;
		candidate = refit;
	}
	if (!(chosen_scale > 0))
	{
		memset(q, 0, row_length);
		return 0;
	}
	const double inv_chosen = 1. / chosen_scale;
	for (col = 0; col < row_length; col++)
	{
		ccv_half_precision_to_float(row + col, &value, 1);
		q[col] = (int8_t)_ccv_nnc_8i_rowwise_quantize(value, inv_chosen);
	}
	return chosen_scale;
}
1373
// Quantize one row of bfloat16 values to int8 with a single per-row scale.
// row: row_length bfloat16 inputs.
// imatrix: passed through to _ccv_nnc_8i_rowwise_weight to obtain per-column error weights.
// q: receives row_length quantized values.
// Returns the chosen scale as it reads back after a round trip through bfloat16,
// or 0 (with q zeroed) when the row is all zero or no positive scale could be stored.
static float _ccv_nnc_quantize_8i_rowwise_16bf(const uint16_t* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
{
	size_t col;
	float value;
	double peak = 0;
	for (col = 0; col < row_length; col++)
	{
		ccv_bfloat_to_float(row + col, &value, 1);
		const double magnitude = fabs(value);
		if (!(peak > magnitude))
			peak = magnitude;
	}
	if (peak == 0)
	{
		memset(q, 0, row_length);
		return 0;
	}
	double candidate = peak / 127.;
	float chosen_scale = 0;
	double lowest_sse = DBL_MAX;
	int iter;
	for (iter = 0; iter < 8; iter++)
	{
		// Evaluate the candidate as it will read back from bfloat16, then refit it by
		// weighted least squares for the next round.
		const float candidate_f = (float)candidate;
		uint16_t candidate_bf;
		float stored_scale;
		ccv_float_to_bfloat(&candidate_f, &candidate_bf, 1);
		ccv_bfloat_to_float(&candidate_bf, &stored_scale, 1);
		if (!(stored_scale > 0))
			break;
		const double inv_scale = 1. / stored_scale;
		double sse = 0;
		double sum_qx = 0;
		double sum_qq = 0;
		for (col = 0; col < row_length; col++)
		{
			const double weight = _ccv_nnc_8i_rowwise_weight(imatrix, col);
			ccv_bfloat_to_float(row + col, &value, 1);
			const double v = value;
			const int qv = _ccv_nnc_8i_rowwise_quantize(v, inv_scale);
			const double err = v - stored_scale * qv;
			sse += weight * err * err;
			sum_qx += weight * qv * v;
			sum_qq += weight * qv * qv;
		}
		if (sse < lowest_sse)
		{
			lowest_sse = sse;
			chosen_scale = stored_scale;
		}
		if (!(sum_qq > 0 && sum_qx > 0))
			break;
		const double refit = sum_qx / sum_qq;
		const float refit_f = (float)refit;
		uint16_t refit_bf;
		float refit_stored;
		ccv_float_to_bfloat(&refit_f, &refit_bf, 1);
		ccv_bfloat_to_float(&refit_bf, &refit_stored, 1);
		// Converged once the refit maps to the same stored value.
		if (refit_stored == stored_scale)
			break;
		candidate = refit;
	}
	if (!(chosen_scale > 0))
	{
		memset(q, 0, row_length);
		return 0;
	}
	const double inv_chosen = 1. / chosen_scale;
	for (col = 0; col < row_length; col++)
	{
		ccv_bfloat_to_float(row + col, &value, 1);
		q[col] = (int8_t)_ccv_nnc_8i_rowwise_quantize(value, inv_chosen);
	}
	return chosen_scale;
}
1449
// Quantize one row of 32-bit floats into int8 codes that share a single scale.
//
// row        - row_length float values.
// row_length - number of elements in the row.
// imatrix    - forwarded untouched to _ccv_nnc_8i_rowwise_weight() to obtain the
//              per-element weight used in the error / refit sums.
// q          - receives row_length int8 codes.
//
// Returns the selected scale, or 0 (with q zero-filled) when the row is all
// zeros or when no positive scale could be selected.
static float _ccv_nnc_quantize_8i_rowwise_32f(const float* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
{
	size_t c;
	// Largest magnitude in the row seeds the first candidate scale.
	double peak = 0;
	for (c = 0; c < row_length; c++)
		peak = ccv_max(peak, fabs(row[c]));
	if (peak == 0)
	{
		memset(q, 0, row_length);
		return 0;
	}
	float chosen_scale = 0;
	double lowest_err = DBL_MAX;
	double candidate = peak / 127.;
	int pass;
	// At most 8 refinement passes; each pass evaluates the candidate after it has
	// been narrowed to float, which is the precision it is returned in.
	for (pass = 0; pass < 8; pass++)
	{
		const float usable_scale = (float)candidate;
		// Also rejects NaN, because the comparison is written in positive form.
		if (!(usable_scale > 0))
			break;
		const double reciprocal = 1. / usable_scale;
		double dot_qx = 0, dot_qq = 0, err = 0;
		for (c = 0; c < row_length; c++)
		{
			const double weight = _ccv_nnc_8i_rowwise_weight(imatrix, c);
			const double x = row[c];
			const int level = _ccv_nnc_8i_rowwise_quantize(x, reciprocal);
			const double residual = x - usable_scale * level;
			err += weight * residual * residual;
			dot_qx += weight * level * x;
			dot_qq += weight * level * level;
		}
		// Remember the scale with the smallest weighted squared error so far.
		if (err < lowest_err)
		{
			lowest_err = err;
			chosen_scale = usable_scale;
		}
		if (!(dot_qq > 0 && dot_qx > 0))
			break;
		// With the codes held fixed, sum(w*q*x) / sum(w*q*q) minimizes the weighted
		// squared error over the scale.
		const double refit = dot_qx / dot_qq;
		// Stop once the refit narrows to the same float: nothing would change.
		if ((float)refit == usable_scale)
			break;
		candidate = refit;
	}
	if (!(chosen_scale > 0))
	{
		memset(q, 0, row_length);
		return 0;
	}
	// Final pass: emit the codes using the selected scale.
	const double final_reciprocal = 1. / chosen_scale;
	for (c = 0; c < row_length; c++)
		q[c] = (int8_t)_ccv_nnc_8i_rowwise_quantize(row[c], final_reciprocal);
	return chosen_scale;
}
1506
// Quantize one row of doubles into int8 codes that share a single scale.
//
// row        - row_length double values.
// row_length - number of elements in the row.
// imatrix    - forwarded untouched to _ccv_nnc_8i_rowwise_weight() to obtain the
//              per-element weight used in the error / refit sums.
// q          - receives row_length int8 codes.
//
// Returns the selected scale, or 0 (with q zero-filled) when the row is all
// zeros or when no positive scale could be selected.
static double _ccv_nnc_quantize_8i_rowwise_64f(const double* const row, const size_t row_length, const float* const imatrix, int8_t* const q)
{
	size_t c;
	// Largest magnitude in the row seeds the first candidate scale.
	double peak = 0;
	for (c = 0; c < row_length; c++)
		peak = ccv_max(peak, fabs(row[c]));
	if (peak == 0)
	{
		memset(q, 0, row_length);
		return 0;
	}
	double chosen_scale = 0;
	double lowest_err = DBL_MAX;
	double candidate = peak / 127.;
	int pass;
	// At most 8 refinement passes. The scale is kept in double here, so no
	// rounding step is needed before evaluating a candidate.
	for (pass = 0; pass < 8; pass++)
	{
		const double usable_scale = candidate;
		// Also rejects NaN, because the comparison is written in positive form.
		if (!(usable_scale > 0))
			break;
		const double reciprocal = 1. / usable_scale;
		double dot_qx = 0, dot_qq = 0, err = 0;
		for (c = 0; c < row_length; c++)
		{
			const double weight = _ccv_nnc_8i_rowwise_weight(imatrix, c);
			const double x = row[c];
			const int level = _ccv_nnc_8i_rowwise_quantize(x, reciprocal);
			const double residual = x - usable_scale * level;
			err += weight * residual * residual;
			dot_qx += weight * level * x;
			dot_qq += weight * level * level;
		}
		// Remember the scale with the smallest weighted squared error so far.
		if (err < lowest_err)
		{
			lowest_err = err;
			chosen_scale = usable_scale;
		}
		if (!(dot_qq > 0 && dot_qx > 0))
			break;
		// With the codes held fixed, sum(w*q*x) / sum(w*q*q) minimizes the weighted
		// squared error over the scale.
		const double refit = dot_qx / dot_qq;
		// Stop at a fixed point: the refit reproduces the scale just evaluated.
		if (refit == usable_scale)
			break;
		candidate = refit;
	}
	if (!(chosen_scale > 0))
	{
		memset(q, 0, row_length);
		return 0;
	}
	// Final pass: emit the codes using the selected scale.
	const double final_reciprocal = 1. / chosen_scale;
	for (c = 0; c < row_length; c++)
		q[c] = (int8_t)_ccv_nnc_8i_rowwise_quantize(row[c], final_reciprocal);
	return chosen_scale;
}
1563
1564CCV_WARN_UNUSED(size_t)size_t __attribute__((warn_unused_result)) ccv_nnc_quantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, const float* const imatrix, void* output, const size_t output_length)
1565{
1566 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1566, __extension__ __PRETTY_FUNCTION__
); }))
;
1567 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1567, __extension__ __PRETTY_FUNCTION__
); }))
;
1568 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1568, __extension__ __PRETTY_FUNCTION__
); }))
;
1569 assert(input_length % row_length == 0)((void) sizeof ((input_length % row_length == 0) ? 1 : 0), __extension__
({ if (input_length % row_length == 0) ; else __assert_fail (
"input_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1569
, __extension__ __PRETTY_FUNCTION__); }))
;
1570 const size_t row_count = input_length / row_length;
1571 const size_t scale_offset = (input_length + 127) & -128;
1572 const size_t scale_size = row_count * CCV_GET_DATA_TYPE_SIZE(datatype)_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]
;
1573 assert(output_length >= scale_offset + scale_size)((void) sizeof ((output_length >= scale_offset + scale_size
) ? 1 : 0), __extension__ ({ if (output_length >= scale_offset
+ scale_size) ; else __assert_fail ("output_length >= scale_offset + scale_size"
, "ccv_nnc_8i_rowwise.c", 1573, __extension__ __PRETTY_FUNCTION__
); }))
;
1574 int8_t* const q = (int8_t*)output;
1575 uint8_t* const u8 = (uint8_t*)output;
1576 if (datatype == CCV_16F)
1577 {
1578 const uint16_t* const f16 = (const uint16_t*)input;
1579 uint16_t* const scales = (uint16_t*)(u8 + scale_offset);
1580 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1581 const size_t row_start = (size_t)i * row_length;
1582 const float scale_f = _ccv_nnc_quantize_8i_rowwise_16f(f16 + row_start, row_length, imatrix, q + row_start);
1583 ccv_float_to_half_precision(&scale_f, scales + i, 1);
1584 } parallel_endfor} }
1585 } else if (datatype == CCV_16BF) {
1586 const uint16_t* const bf16 = (const uint16_t*)input;
1587 uint16_t* const scales = (uint16_t*)(u8 + scale_offset);
1588 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1589 const size_t row_start = (size_t)i * row_length;
1590 const float scale_f = _ccv_nnc_quantize_8i_rowwise_16bf(bf16 + row_start, row_length, imatrix, q + row_start);
1591 ccv_float_to_bfloat(&scale_f, scales + i, 1);
1592 } parallel_endfor} }
1593 } else if (datatype == CCV_32F) {
1594 const float* const f32 = (const float*)input;
1595 float* const scales = (float*)(u8 + scale_offset);
1596 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1597 const size_t row_start = (size_t)i * row_length;
1598 scales[i] = _ccv_nnc_quantize_8i_rowwise_32f(f32 + row_start, row_length, imatrix, q + row_start);
1599 } parallel_endfor} }
1600 } else {
1601 assert(datatype == CCV_64F)((void) sizeof ((datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1601, __extension__ __PRETTY_FUNCTION__
); }))
;
1602 const double* const f64 = (const double*)input;
1603 double* const scales = (double*)(u8 + scale_offset);
1604 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1605 const size_t row_start = (size_t)i * row_length;
1606 scales[i] = _ccv_nnc_quantize_8i_rowwise_64f(f64 + row_start, row_length, imatrix, q + row_start);
1607 } parallel_endfor} }
1608 }
1609 return scale_offset + scale_size;
1610}
1611
1612void ccv_nnc_dequantize_8i_rowwise(const void* input, const int datatype, const int memory_type, const size_t input_length, const size_t row_length, void* output, const size_t output_length)
1613{
1614 assert(datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F)((void) sizeof ((datatype == CCV_16F || datatype == CCV_16BF ||
datatype == CCV_32F || datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_16F || datatype == CCV_16BF || datatype
== CCV_32F || datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_16F || datatype == CCV_16BF || datatype == CCV_32F || datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1614, __extension__ __PRETTY_FUNCTION__
); }))
;
1615 assert(memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY || memory_type
== CCV_TENSOR_GPU_MEMORY) ? 1 : 0), __extension__ ({ if (memory_type
== CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY
) ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY || memory_type == CCV_TENSOR_GPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1615, __extension__ __PRETTY_FUNCTION__
); }))
;
1616 assert(row_length > 0)((void) sizeof ((row_length > 0) ? 1 : 0), __extension__ (
{ if (row_length > 0) ; else __assert_fail ("row_length > 0"
, "ccv_nnc_8i_rowwise.c", 1616, __extension__ __PRETTY_FUNCTION__
); }))
;
1617 assert(output_length % row_length == 0)((void) sizeof ((output_length % row_length == 0) ? 1 : 0), __extension__
({ if (output_length % row_length == 0) ; else __assert_fail
("output_length % row_length == 0", "ccv_nnc_8i_rowwise.c", 1617
, __extension__ __PRETTY_FUNCTION__); }))
;
1618 if (memory_type != CCV_TENSOR_CPU_MEMORY)
1619 {
1620#ifdef HAVE_CUDA1
1621 ccv_nnc_compat_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0);
1622#elif defined(HAVE_MPS)
1623 assert(datatype != CCV_64F)((void) sizeof ((datatype != CCV_64F) ? 1 : 0), __extension__
({ if (datatype != CCV_64F) ; else __assert_fail ("datatype != CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1623, __extension__ __PRETTY_FUNCTION__
); }))
;
1624 ccv_nnc_mps_dequantize_8i_rowwise(input, datatype, input_length, row_length, output, output_length, 0);
1625#else
1626 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_8i_rowwise.c", 1626, __extension__ __PRETTY_FUNCTION__
); }))
;
1627#endif
1628 return;
1629 }
1630 const size_t row_count = output_length / row_length;
1631 const size_t scale_offset = (output_length + 127) & -128;
1632 assert(input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype))((void) sizeof ((input_length >= scale_offset + row_count *
_ccv_get_data_type_size[((datatype) & 0xFF000) >> 12
]) ? 1 : 0), __extension__ ({ if (input_length >= scale_offset
+ row_count * _ccv_get_data_type_size[((datatype) & 0xFF000
) >> 12]) ; else __assert_fail ("input_length >= scale_offset + row_count * CCV_GET_DATA_TYPE_SIZE(datatype)"
, "ccv_nnc_8i_rowwise.c", 1632, __extension__ __PRETTY_FUNCTION__
); }))
;
1633 const int8_t* const q = (const int8_t*)input;
1634 const uint8_t* const u8 = (const uint8_t*)input;
1635 if (datatype == CCV_16F)
1636 {
1637 uint16_t* const f16 = (uint16_t*)output;
1638 const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset);
1639 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1640 const size_t row_start = (size_t)i * row_length;
1641 float scale_f;
1642 ccv_half_precision_to_float(scales + i, &scale_f, 1);
1643 size_t j;
1644 for (j = 0; j < row_length; j++)
1645 {
1646 const float v = q[row_start + j] * scale_f;
1647 ccv_float_to_half_precision(&v, f16 + row_start + j, 1);
1648 }
1649 } parallel_endfor} }
1650 } else if (datatype == CCV_16BF) {
1651 uint16_t* const bf16 = (uint16_t*)output;
1652 const uint16_t* const scales = (const uint16_t*)(u8 + scale_offset);
1653 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1654 const size_t row_start = (size_t)i * row_length;
1655 float scale_f;
1656 ccv_bfloat_to_float(scales + i, &scale_f, 1);
1657 size_t j;
1658 for (j = 0; j < row_length; j++)
1659 {
1660 const float v = q[row_start + j] * scale_f;
1661 ccv_float_to_bfloat(&v, bf16 + row_start + j, 1);
1662 }
1663 } parallel_endfor} }
1664 } else if (datatype == CCV_32F) {
1665 float* const f32 = (float*)output;
1666 const float* const scales = (const float*)(u8 + scale_offset);
1667 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1668 const size_t row_start = (size_t)i * row_length;
1669 const float scale = scales[i];
1670 size_t j;
1671 for (j = 0; j < row_length; j++)
1672 f32[row_start + j] = q[row_start + j] * scale;
1673 } parallel_endfor} }
1674 } else {
1675 assert(datatype == CCV_64F)((void) sizeof ((datatype == CCV_64F) ? 1 : 0), __extension__
({ if (datatype == CCV_64F) ; else __assert_fail ("datatype == CCV_64F"
, "ccv_nnc_8i_rowwise.c", 1675, __extension__ __PRETTY_FUNCTION__
); }))
;
1676 double* const f64 = (double*)output;
1677 const double* const scales = (const double*)(u8 + scale_offset);
1678 parallel_for(i, (int)row_count){ int i; for ((i) = 0; (i) < ((int)row_count); (i)++) { {
1679 const size_t row_start = (size_t)i * row_length;
1680 const double scale = scales[i];
1681 size_t j;
1682 for (j = 0; j < row_length; j++)
1683 f64[row_start + j] = q[row_start + j] * scale;
1684 } parallel_endfor} }
1685 }
1686}