/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_dataframe_csv.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_cnnp_dataframe.h" |
6 | | |
7 | | #include <sys/mman.h> |
8 | | |
9 | | // MARK - Create Dataframe from Comma-separated-values Files |
10 | | |
// Per-chunk line-feed statistics gathered by the first (parallel) scan of the file.
// A chunk is scanned without knowing whether it begins inside a quoted cell, so line
// feeds are tallied separately by the parity of the quote characters seen so far
// within the chunk; the serial pass afterwards picks the tally that applies.
typedef struct {
	int even; // Number of '\n' seen after an even number of quote characters in this chunk.
	int odd; // Number of '\n' seen after an odd number of quote characters in this chunk.
	int even_starter; // Chunk-relative offset of the first '\n' counted in `even`, -1 if none.
	int odd_starter; // Same for `odd`; later reused to hold the row index the chunk starts at.
	int quotes; // Total number of quote characters in this chunk.
} csv_crlf_t;
18 | | |
// Non-zero iff at least one of the 8 bytes packed in the 64-bit value v is 0x00.
// The argument is parenthesized so that an expression such as ANY_ZEROS(a ^ b)
// is evaluated as a whole (`-` binds tighter than `^`). Note v is evaluated twice.
#define ANY_ZEROS(v) ((((v)) - (uint64_t)0x0101010101010101) & ((~(v)) & (uint64_t)0x8080808080808080))
20 | | |
// Copy `count` bytes of a cell from `src` into `dest`, collapsing every escaped
// double-quote pair ("") into a single '"', and null-terminate the result.
//
// src   - start of the cell body; may be null.
// count - number of bytes that belong to the cell.
// dest  - output buffer with room for at least count + 1 bytes (the result is
//         never longer than the input); nothing is done if it is null.
//
// The quote character is hard-coded to '"' here, regardless of any custom quote
// character configured on the reader.
static inline void _fix_double_quote(const char* src, int count, char* dest)
{
	if (!dest)
		return;
	if (!src || count <= 0)
	{
		// An empty cell still has to yield a valid (empty) C string.
		dest[0] = '\0';
		return;
	}
	char prev_char = src[0];
	*dest++ = src[0];
	int pos = 1;
	while (pos < count)
	{
		// Second half of an escaped quote: skip it.
		if (prev_char == '"' && src[pos] == '"')
		{
			++pos;
			// The pair closed the cell; do not read past the last byte of it.
			if (pos >= count)
				break;
		}
		prev_char = src[pos];
		*dest++ = prev_char;
		++pos;
	}
	dest[0] = '\0';
}
41 | | |
// Context shared by every column of a CSV-backed dataframe. The allocation that
// holds this header is immediately followed by the table of cell views
// (row-major, column_size entries per row, header row included).
typedef struct {
	const char* data; // Start of the CSV bytes; cell views store offsets relative to this.
	void* mmap; // Non-null when the bytes are a file mapping that has to be unmapped on teardown.
	size_t file_size; // Number of bytes reachable through data (also the mapping length).
	int column_size; // Number of columns per row.
	int include_header; // 1 if the first row is a header and must be skipped when enumerating, 0 otherwise.
	char delim; // Cell delimiter.
	char quote; // Quote character.
} ccv_cnnp_csv_t;
51 | | |
// Location of one cell inside the CSV bytes.
typedef struct {
	// Keep this packed into 64 bits: growing it to 128 bits would double the memory
	// bandwidth needed to walk the table and slow the whole process down. Readers of
	// this table also test an entry for "all zero" by loading it as a single uint64_t.
	uint64_t str:48; // Byte offset of the cell body from the start of the data.
	uint16_t count:15; // Cell length in bytes; 0x7fff means "too long to store, re-scan to find the end".
	uint8_t no_double_quote:1; // 1 if the cell can be copied verbatim, 0 if "" pairs have to be collapsed.
} ccv_cnnp_csv_str_view_t;
59 | | |
60 | | void _ccv_cnnp_csv_enum(const int column_idx, const int* const row_idxs, const int row_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context) |
61 | 53 | { |
62 | 53 | ccv_cnnp_csv_t* const csv = (ccv_cnnp_csv_t*)context; |
63 | 53 | const int column_size = csv->column_size; |
64 | 53 | const int include_header = csv->include_header; |
65 | 53 | const char quote = csv->quote; |
66 | 53 | const char delim = csv->delim; |
67 | 53 | const ccv_cnnp_csv_str_view_t* const sp = (const ccv_cnnp_csv_str_view_t*)(csv + 1) + column_idx + include_header * column_size; |
68 | 53 | int i; |
69 | 115 | for (i = 0; i < row_size; i++62 ) |
70 | 62 | { |
71 | 62 | const int row_idx = row_idxs[i]; |
72 | 62 | const ccv_cnnp_csv_str_view_t* const csp = sp + row_idx * column_size; |
73 | | // This is the same as (csp->str == 0 && csp->no_double_quote = 0 && csp->count == 0) |
74 | | // If a string has 0 length, it cannot contain double quote, therefore, this condition |
75 | | // implies the pointer is null. |
76 | 62 | if (((uint64_t*)csp)[0] == 0) |
77 | 1 | { |
78 | 1 | if (data[i]) |
79 | 1 | { |
80 | 1 | int* hdr = (int*)data[i] - 1; |
81 | 1 | ccfree(hdr); |
82 | 1 | } |
83 | 1 | data[i] = 0; |
84 | 1 | continue; |
85 | 1 | } |
86 | 61 | const char* str = csv->data + csp->str; |
87 | 61 | int count = 0; |
88 | 61 | if (csp->count == 0x7fff) // We don't know the count yet. In this case, go over to find it. |
89 | 3 | { |
90 | 3 | const char* const p_end = csv->data + csv->file_size; |
91 | 3 | int quotes = (str > csv->data && str[-1] == quote) ? 11 : 02 ; |
92 | 3 | const char* p = str; |
93 | 3 | const char* quote_end = 0; |
94 | 17.2M | for (; p < p_end; p++17.2M ) |
95 | 17.2M | { |
96 | 17.2M | if (p[0] == quote) |
97 | 3 | { |
98 | 3 | ++quotes; |
99 | 3 | quote_end = p; |
100 | 17.2M | } else if (!(quotes & 1)) { |
101 | 11.5M | if (p[0] == delim || p[0] == '\r'11.5M || p[0] == '\n'11.5M ) |
102 | 3 | { |
103 | 3 | if (quote_end >= str) |
104 | 1 | count = quote_end - str; |
105 | 2 | else |
106 | 2 | count = p - str; |
107 | 3 | break; |
108 | 3 | } |
109 | 11.5M | } |
110 | 17.2M | } |
111 | 3 | } else |
112 | 58 | count = csp->count; |
113 | 61 | if (!data[i]) |
114 | 15 | { |
115 | 15 | int* const hdr = (int*)ccmalloc(sizeof(int) + count + 1); |
116 | 15 | hdr[0] = count + 1; |
117 | 15 | data[i] = (char*)(hdr + 1); |
118 | 46 | } else { |
119 | 46 | int* hdr = (int*)data[i] - 1; |
120 | 46 | if (hdr[0] < count + 1) |
121 | 7 | { |
122 | 7 | hdr = (int*)ccrealloc(hdr, sizeof(int) + count + 1); |
123 | 7 | hdr[0] = count + 1; |
124 | 7 | data[i] = (char*)(hdr + 1); |
125 | 7 | } |
126 | 46 | } |
127 | 61 | if (csp->no_double_quote) |
128 | 59 | { |
129 | 59 | memcpy(data[i], str, count); |
130 | 59 | ((char*)data[i])[count] = '\0'; |
131 | 59 | } else |
132 | 2 | _fix_double_quote(str, count, (char*)data[i]); |
133 | 61 | } |
134 | 53 | } |
135 | | |
// Release one string produced by the column enumerator. `context` is unused.
void _ccv_cnnp_csv_data_deinit(void* const data, void* const context)
{
	// Null slots own nothing.
	if (!data)
		return;
	// The allocation starts one int before the string (the capacity header).
	int* const allocation = (int*)data - 1;
	ccfree(allocation);
}
144 | | |
145 | | void _ccv_cnnp_csv_deinit(void* const context) |
146 | 5 | { |
147 | 5 | ccv_cnnp_csv_t* const csv = (ccv_cnnp_csv_t*)context; |
148 | 5 | if (csv->mmap) |
149 | 5 | munmap(csv->mmap, csv->file_size); |
150 | 5 | ccfree(csv); |
151 | 5 | } |
152 | | |
153 | | ccv_cnnp_dataframe_t* ccv_cnnp_dataframe_from_csv_new(void* const input, const int type, const size_t len, const char _delim, const char _quote, const int include_header, int* const column_size) |
154 | 5 | { |
155 | 5 | assert(input); |
156 | 5 | assert(column_size); |
157 | 5 | size_t file_size; |
158 | 5 | char* data; |
159 | 5 | assert(type == CCV_CNNP_DATAFRAME_CSV_FILE || type == CCV_CNNP_DATAFRAME_CSV_MEMORY); |
160 | 5 | if (type == CCV_CNNP_DATAFRAME_CSV_FILE) |
161 | 5 | { |
162 | 5 | FILE* file = (FILE*)input; |
163 | 5 | const int fd = fileno(file); |
164 | 5 | if (fd == -1) |
165 | 0 | return 0; |
166 | 5 | fseek(file, 0, SEEK_END); |
167 | 5 | file_size = ftell(file); |
168 | 5 | fseek(file, 0, SEEK_SET); |
169 | 5 | if (file_size < 2) |
170 | 0 | return 0; |
171 | 5 | data = mmap(NULL, file_size, PROT_READ, MAP_SHARED, fd, 0); |
172 | 5 | if (!data) |
173 | 0 | return 0; |
174 | 5 | } else { |
175 | 0 | file_size = len; |
176 | 0 | assert(len > 0); |
177 | 0 | if (len < 2) |
178 | 0 | return 0; |
179 | 0 | data = input; |
180 | 0 | } |
181 | | // We cannot handle file size larger than 2^48, which is around 281TB. |
182 | 5 | #if defined(__LP64__) || defined(_WIN64) |
183 | 5 | assert(file_size <= 0xffffffffffffllu); |
184 | 5 | #endif |
185 | 5 | const char delim = _delim ? _delim : ','0 ; |
186 | 5 | const char quote = _quote ? _quote : '"'0 ; |
187 | 5 | const size_t chunk_size = 1024 * 1024; |
188 | 5 | const int aligned_chunks = file_size / chunk_size; |
189 | 5 | const int total_chunks = (file_size + chunk_size - 1) / chunk_size; |
190 | | // Get number of rows. |
191 | 5 | csv_crlf_t* const crlf = cccalloc(total_chunks, sizeof(csv_crlf_t)); |
192 | 5 | #define CSV_QUOTE_BR(c, n) \ |
193 | 20.2M | do { \ |
194 | 20.2M | if (c##n == quote) \ |
195 | 20.2M | ++quotes12.9k ; \ |
196 | 20.2M | else if (20.1M c20.1M ##20.1M n == '\n') { \ |
197 | 63 | ++count[quotes & 1]; \ |
198 | 63 | if (starter[quotes & 1] == -1) \ |
199 | 63 | starter[quotes & 1] = (int)(p - p_start) + n11 ; \ |
200 | 63 | } \ |
201 | 20.2M | } while (0) |
202 | 18 | parallel_for5 (i, aligned_chunks) { |
203 | 18 | const uint64_t* pd = (const uint64_t*)(data + i * chunk_size); |
204 | 18 | const char* const p_start = (const char*)pd; |
205 | 18 | const uint64_t* const pd_end = pd + chunk_size / sizeof(uint64_t); |
206 | 18 | int quotes = 0; |
207 | 18 | int starter[2] = {-1, -1}; |
208 | 18 | int count[2] = {0, 0}; |
209 | 2.35M | for (; pd < pd_end; pd++2.35M ) |
210 | 2.35M | { |
211 | | // Load 8-bytes at batch. |
212 | 2.35M | const char* const p = (const char*)pd; |
213 | 2.35M | char c0, c1, c2, c3, c4, c5, c6, c7; |
214 | 2.35M | c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7]; |
215 | 2.35M | CSV_QUOTE_BR(c, 0); |
216 | 2.35M | CSV_QUOTE_BR(c, 1); |
217 | 2.35M | CSV_QUOTE_BR(c, 2); |
218 | 2.35M | CSV_QUOTE_BR(c, 3); |
219 | 2.35M | CSV_QUOTE_BR(c, 4); |
220 | 2.35M | CSV_QUOTE_BR(c, 5); |
221 | 2.35M | CSV_QUOTE_BR(c, 6); |
222 | 2.35M | CSV_QUOTE_BR(c, 7); |
223 | 2.35M | } |
224 | 18 | crlf[i].even = count[0]; |
225 | 18 | crlf[i].odd = count[1]; |
226 | 18 | crlf[i].even_starter = starter[0]; |
227 | 18 | crlf[i].odd_starter = starter[1]; |
228 | 18 | crlf[i].quotes = quotes; |
229 | 18 | } parallel_endfor |
230 | 5 | if (total_chunks > aligned_chunks) |
231 | 5 | { |
232 | 5 | const int residual_size = file_size - chunk_size * aligned_chunks; |
233 | 5 | const uint64_t* pd = (const uint64_t*)(data + chunk_size * aligned_chunks); |
234 | 5 | const char* const p_start = (const char*)pd; |
235 | 5 | const uint64_t* const pd_end = pd + residual_size / sizeof(uint64_t); |
236 | 5 | int quotes = 0; |
237 | 5 | int starter[2] = {-1, -1}; |
238 | 5 | int count[2] = {0, 0}; |
239 | 167k | for (; pd < pd_end; pd++167k ) |
240 | 167k | { |
241 | 167k | const char* const p = (const char*)pd; |
242 | | // Load 8-bytes at batch. |
243 | 167k | char c0, c1, c2, c3, c4, c5, c6, c7; |
244 | 167k | c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7]; |
245 | 167k | CSV_QUOTE_BR(c, 0); |
246 | 167k | CSV_QUOTE_BR(c, 1); |
247 | 167k | CSV_QUOTE_BR(c, 2); |
248 | 167k | CSV_QUOTE_BR(c, 3); |
249 | 167k | CSV_QUOTE_BR(c, 4); |
250 | 167k | CSV_QUOTE_BR(c, 5); |
251 | 167k | CSV_QUOTE_BR(c, 6); |
252 | 167k | CSV_QUOTE_BR(c, 7); |
253 | 167k | } |
254 | 5 | const char* const p_end = data + file_size; |
255 | 5 | const char* p = (const char*)pd_end; |
256 | 11 | for (; p < p_end; p++6 ) |
257 | 6 | { |
258 | 6 | const char c0 = p[0]; |
259 | 6 | CSV_QUOTE_BR(c, 0); |
260 | 6 | } |
261 | 5 | crlf[aligned_chunks].even = count[0]; |
262 | 5 | crlf[aligned_chunks].odd = count[1]; |
263 | 5 | crlf[aligned_chunks].even_starter = starter[0] < 0 ? residual_size0 : starter[0]; |
264 | 5 | crlf[aligned_chunks].odd_starter = starter[1] < 0 ? residual_size4 : starter[1]1 ; |
265 | 5 | crlf[aligned_chunks].quotes = quotes; |
266 | 5 | } |
267 | 5 | #undef CSV_QUOTE_BR |
268 | 5 | int row_count = crlf[0].even; |
269 | 5 | int quotes = crlf[0].quotes; |
270 | 5 | crlf[0].odd_starter = 0; |
271 | 5 | int i; |
272 | | // Go through all chunks serially to find exactly how many line ends in each chunk, moving that information to even*. |
273 | | // The odd_starter will record which row it currently at for this chunk. |
274 | 23 | for (i = 1; i < total_chunks; i++18 ) |
275 | 18 | { |
276 | 18 | if (quotes & 1) |
277 | 5 | { |
278 | | // Even is the correct one, we will use that throughout. |
279 | 5 | crlf[i].even = crlf[i].odd; |
280 | 5 | crlf[i].even_starter = crlf[i].odd_starter; |
281 | 5 | } |
282 | 18 | crlf[i].odd_starter = row_count + 1; |
283 | 18 | row_count += crlf[i].even; |
284 | 18 | quotes += crlf[i].quotes; |
285 | 18 | } |
286 | | // Didn't end with newline, one more row. |
287 | 5 | if (!(data[file_size - 1] == '\n' || (0 data[file_size - 2] == '\n'0 && data[file_size - 1] == '\r'0 ))) |
288 | 0 | ++row_count; |
289 | | // Get number of columns. |
290 | 5 | int column_count = 0; |
291 | 5 | const uint64_t* pd = (const uint64_t*)data; |
292 | 5 | int first_line_len = file_size; |
293 | 10 | for (i = 0; i < total_chunks; i++5 ) |
294 | 10 | if (crlf[i].even_starter >= 0) |
295 | 5 | { |
296 | 5 | first_line_len = i * chunk_size + crlf[i].even_starter; |
297 | 5 | break; |
298 | 5 | } |
299 | 5 | const uint64_t* const pd_end = pd + first_line_len / sizeof(uint64_t); |
300 | 5 | #define CSV_QUOTE_BR(cn) \ |
301 | 5.81M | do { \ |
302 | 5.81M | if (cn == quote) \ |
303 | 5.81M | ++quotes12.8k ; \ |
304 | 5.81M | else if (5.79M !(quotes & 1)5.79M ) { \ |
305 | 5.76M | if (cn == delim) \ |
306 | 5.76M | ++column_count6.42k ; \ |
307 | 5.76M | } \ |
308 | 5.81M | } while (0) |
309 | 5 | quotes = 0; |
310 | 726k | for (; pd < pd_end; pd++726k ) |
311 | 726k | { |
312 | 726k | const char* const p = (const char*)pd; |
313 | 726k | char c0, c1, c2, c3, c4, c5, c6, c7; |
314 | 726k | c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7]; |
315 | 726k | CSV_QUOTE_BR(c0); |
316 | 726k | CSV_QUOTE_BR(c1); |
317 | 726k | CSV_QUOTE_BR(c2); |
318 | 726k | CSV_QUOTE_BR(c3); |
319 | 726k | CSV_QUOTE_BR(c4); |
320 | 726k | CSV_QUOTE_BR(c5); |
321 | 726k | CSV_QUOTE_BR(c6); |
322 | 726k | CSV_QUOTE_BR(c7); |
323 | 726k | } |
324 | | // If haven't reached the flag yet (i.e., haven't reached a new line). |
325 | 5 | const char* p = (const char*)pd; |
326 | 5 | const char* const p_end = data + first_line_len; |
327 | 15 | for (; p < p_end; p++10 ) |
328 | 10 | { |
329 | 10 | const char c0 = p[0]; |
330 | 10 | CSV_QUOTE_BR(c0); |
331 | 10 | } |
332 | 5 | #undef CSV_QUOTE_BR |
333 | 5 | ++column_count; // column count is 1 more than delimiter. |
334 | 5 | if (row_count == 0) // This is possible because you have an open quote, and then \n is inside the open quote, which won't be recognized. |
335 | 0 | { |
336 | 0 | ccfree(crlf); |
337 | 0 | if (type == CCV_CNNP_DATAFRAME_CSV_FILE) |
338 | 0 | munmap(data, file_size); |
339 | 0 | return 0; |
340 | 0 | } |
341 | | // We only mark the beginning and the end of a cell. Removing double-quote etc will be left when iterating. |
342 | 5 | ccv_cnnp_csv_t* const csv = (ccv_cnnp_csv_t*)ccmalloc(sizeof(ccv_cnnp_csv_t) + sizeof(ccv_cnnp_csv_str_view_t) * row_count * column_count); |
343 | 5 | csv->column_size = column_count; |
344 | 5 | csv->include_header = !!include_header; |
345 | 5 | ccv_cnnp_csv_str_view_t* const sp = (ccv_cnnp_csv_str_view_t*)(csv + 1); |
346 | 5 | memset(sp, 0, sizeof(ccv_cnnp_csv_str_view_t) * row_count * column_count); |
347 | 5 | const uint64_t delim_mask = (uint64_t)0x0101010101010101 * (uint64_t)delim; |
348 | 5 | const uint64_t quote_mask = (uint64_t)0x0101010101010101 * (uint64_t)quote; |
349 | 5 | const uint64_t lf_mask = (uint64_t)0x0101010101010101 * (uint64_t)'\n'; |
350 | 5 | const uint64_t cr_mask = (uint64_t)0x0101010101010101 * (uint64_t)'\r'; |
351 | 5 | #define CSV_QUOTE_BR(c, n) \ |
352 | 1.28M | do { \ |
353 | 1.28M | if (c##n == quote) \ |
354 | 1.28M | { \ |
355 | | /* If the preceding one is not a quote. Set it to be null-terminator temporarily. */ \ |
356 | 12.9k | ++quotes; \ |
357 | 12.9k | quote_end = p + n; \ |
358 | 12.9k | if (!preceding_quote) \ |
359 | 12.9k | preceding_quote = 112.9k ; \ |
360 | 12.9k | else \ |
361 | 12.9k | double_quote = 14 ; \ |
362 | 1.27M | } else { \ |
363 | 1.27M | preceding_quote = 0; \ |
364 | 1.27M | if (!(quotes & 1)) \ |
365 | 1.27M | { \ |
366 | 1.23M | if (c##n == delim) \ |
367 | 1.23M | { \ |
368 | 160k | if (chunk_row_count < row_count) \ |
369 | 160k | { \ |
370 | 160k | if (chunk_column_count < column_count) \ |
371 | 160k | { \ |
372 | 160k | int count; \ |
373 | 160k | if (quote_end > 0 && quote_end - data >= csp[chunk_column_count].str154k ) \ |
374 | 160k | count = (int)((quote_end - data) - csp[chunk_column_count].str)6.42k ; \ |
375 | 160k | else \ |
376 | 160k | count = (int)((p + n - data) - csp[chunk_column_count].str)154k ; \ |
377 | 160k | csp[chunk_column_count].count = ccv_min(count, 0x7fff); \ |
378 | 160k | csp[chunk_column_count].no_double_quote = !double_quote; \ |
379 | 160k | } \ |
380 | 160k | ++chunk_column_count; \ |
381 | 160k | if (chunk_column_count < column_count) \ |
382 | | /* Skip quote if presented. */ \ |
383 | 160k | csp[chunk_column_count].str = (p + (n + 1) < p_end && p[n + 1] == quote ? p + (n + 2)6.47k : p + (n + 1)154k ) - data; \ |
384 | 160k | } \ |
385 | 160k | double_quote = 0; \ |
386 | 1.07M | } else if (c##n == '\n') { \ |
387 | 56 | if (chunk_row_count < row_count && chunk_column_count < column_count) \ |
388 | 56 | { \ |
389 | 56 | int count; \ |
390 | 56 | if (quote_end > 0 && quote_end - data >= csp[chunk_column_count].str55 ) \ |
391 | 56 | count = (int)((quote_end - data) - csp[chunk_column_count].str)49 ; \ |
392 | 56 | else if (7 p + n > data7 && p[n - 1] == '\r'7 ) \ |
393 | 7 | count = (int)((p + n - 1 - data) - csp[chunk_column_count].str)1 ; \ |
394 | 7 | else \ |
395 | 7 | count = (int)((p + n - data) - csp[chunk_column_count].str)6 ; \ |
396 | 56 | csp[chunk_column_count].count = ccv_min(count, 0x7fff); \ |
397 | 56 | csp[chunk_column_count].no_double_quote = !double_quote; \ |
398 | 56 | } \ |
399 | 56 | ++chunk_row_count; \ |
400 | 56 | csp += column_count; \ |
401 | 56 | chunk_column_count = 0; \ |
402 | 56 | if (chunk_row_count < row_count) \ |
403 | 56 | { \ |
404 | 52 | if (p + (n + 1) < p_end && p[n + 1] == '\r') \ |
405 | 52 | csp[0].str = (0 p + (n + 2) < p_end0 && p[n + 2] == quote0 ? p + (n + 3)0 : p + (n + 2)0 ) - data; \ |
406 | 52 | else \ |
407 | 52 | csp[0].str = (p + (n + 1) < p_end && p[n + 1] == quote ? p + (n + 2)0 : p + (n + 1)) - data; \ |
408 | 52 | } \ |
409 | 56 | double_quote = 0; \ |
410 | 56 | } \ |
411 | 1.23M | } \ |
412 | 1.27M | } \ |
413 | 1.28M | } while (0) |
414 | 23 | parallel_for5 (i, total_chunks) { |
415 | | // Skip if existing one don't have a line starter. |
416 | 23 | if (i > 0 && crlf[i].even_starter < 018 ) |
417 | 13 | continue; |
418 | 10 | const char* p = (i == 0) ? data5 : data + i * chunk_size + crlf[i].even_starter + 15 ; |
419 | 10 | const char* p_end = data + file_size; |
420 | 10 | int j; |
421 | 23 | for (j = i + 1; j < total_chunks; j++13 ) |
422 | 18 | if (crlf[j].even_starter >= 0) |
423 | 5 | { |
424 | 5 | p_end = data + j * chunk_size + crlf[j].even_starter; |
425 | 5 | break; |
426 | 5 | } |
427 | 10 | if (p_end <= p) |
428 | 1 | continue; |
429 | 9 | int chunk_row_count = crlf[i].odd_starter; |
430 | 9 | ccv_cnnp_csv_str_view_t* csp = sp + (uintptr_t)column_count * chunk_row_count; |
431 | 9 | if (chunk_row_count < row_count) |
432 | 9 | { |
433 | 9 | if (p[0] == '\r') |
434 | 0 | csp[0].str = (p + 1 < p_end && p[1] == quote ? p + 2 : p + 1) - data; |
435 | 9 | else |
436 | 9 | csp[0].str = (p[0] == quote ? p + 13 : p6 ) - data; |
437 | 9 | } |
438 | 9 | int chunk_column_count = 0; |
439 | 9 | int quotes = 0; |
440 | 9 | int preceding_quote = 0; |
441 | 9 | int double_quote = 0; |
442 | 9 | const char* quote_end = 0; |
443 | 9 | const int padding = ccv_min(0x7 - (((uintptr_t)p - 1) & 0x7), (int)(p_end - p)); |
444 | 32 | for (j = 0; j < padding; j++, p++23 ) |
445 | 23 | { |
446 | 23 | char c0 = p[0]; |
447 | 23 | CSV_QUOTE_BR(c, 0); |
448 | 23 | } |
449 | 9 | const size_t cur_chunk_size = (size_t)(p_end - p); |
450 | 9 | const uint64_t* pd = (const uint64_t*)p; |
451 | 9 | const uint64_t* pd_end = pd + cur_chunk_size / sizeof(uint64_t); |
452 | 2.52M | for (; pd < pd_end; pd++2.52M ) |
453 | 2.52M | { |
454 | 2.52M | const uint64_t v = *pd; |
455 | 2.52M | const uint64_t delim_v = v ^ delim_mask; |
456 | 2.52M | const uint64_t quote_v = v ^ quote_mask; |
457 | 2.52M | const uint64_t lf_v = v ^ lf_mask; |
458 | 2.52M | const uint64_t cr_v = v ^ cr_mask; |
459 | | // If it doesn't contain any zeros, skip the logic. |
460 | 2.52M | if (!ANY_ZEROS(delim_v) && !2.36M ANY_ZEROS2.36M (quote_v) && !2.36M ANY_ZEROS2.36M (lf_v) && !2.36M ANY_ZEROS2.36M (cr_v)) |
461 | 2.36M | continue; |
462 | | // Need to check and assign the length and starting point. |
463 | 160k | p = (const char*)pd; |
464 | | // Load 8-bytes at batch. |
465 | 160k | char c0, c1, c2, c3, c4, c5, c6, c7; |
466 | 160k | c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7]; |
467 | 160k | CSV_QUOTE_BR(c, 0); |
468 | 160k | CSV_QUOTE_BR(c, 1); |
469 | 160k | CSV_QUOTE_BR(c, 2); |
470 | 160k | CSV_QUOTE_BR(c, 3); |
471 | 160k | CSV_QUOTE_BR(c, 4); |
472 | 160k | CSV_QUOTE_BR(c, 5); |
473 | 160k | CSV_QUOTE_BR(c, 6); |
474 | 160k | CSV_QUOTE_BR(c, 7); |
475 | 160k | } |
476 | 9 | p = (const char*)pd; |
477 | 27 | for (; p < p_end; p++18 ) |
478 | 18 | { |
479 | 18 | char c0 = p[0]; |
480 | 18 | CSV_QUOTE_BR(c, 0); |
481 | 18 | } |
482 | 9 | if (chunk_row_count < row_count && chunk_column_count < column_count5 ) |
483 | 5 | { |
484 | 5 | int count; |
485 | 5 | if (quote_end > 0 && quote_end - data >= csp[chunk_column_count].str3 ) |
486 | 3 | count = (int)(quote_end - data - csp[chunk_column_count].str); |
487 | 2 | else |
488 | 2 | count = (int)(p - data - csp[chunk_column_count].str); |
489 | 5 | csp[chunk_column_count].count = ccv_min(count, 0x7fff); |
490 | 5 | csp[chunk_column_count].no_double_quote = !double_quote; |
491 | 5 | } |
492 | 9 | } parallel_endfor |
493 | 5 | #undef CSV_QUOTE_BR |
494 | 5 | ccfree(crlf); |
495 | 5 | csv->data = data; |
496 | 5 | assert(file_size > 0); |
497 | 5 | csv->file_size = file_size; |
498 | 5 | csv->delim = delim; |
499 | 5 | csv->quote = quote; |
500 | 5 | if (type == CCV_CNNP_DATAFRAME_CSV_FILE) |
501 | 5 | csv->mmap = data; |
502 | 5 | *column_size = column_count; |
503 | 5 | assert(column_count > 0); |
504 | 5 | ccv_cnnp_column_data_t* const column_data = (ccv_cnnp_column_data_t*)cccalloc(column_count, sizeof(ccv_cnnp_column_data_t)); |
505 | 6.43k | for (i = 0; i < column_count; i++6.43k ) |
506 | 6.43k | { |
507 | 6.43k | column_data[i].data_enum = _ccv_cnnp_csv_enum; |
508 | 6.43k | column_data[i].context = csv; |
509 | 6.43k | column_data[i].data_deinit = _ccv_cnnp_csv_data_deinit; |
510 | 6.43k | } |
511 | 5 | if (include_header) |
512 | 3.21k | for (i = 0; 1 i < column_count; i++3.21k ) |
513 | 3.21k | if (((uint64_t*)sp)[i] != 0) |
514 | 3.21k | { |
515 | 3.21k | column_data[i].name = (char*)ccmalloc(sp[i].count + 1); |
516 | 3.21k | const char* str = data + sp[i].str; |
517 | 3.21k | if (sp[i].no_double_quote) |
518 | 3.21k | { |
519 | 3.21k | memcpy(column_data[i].name, str, sp[i].count); |
520 | 3.21k | column_data[i].name[sp[i].count] = '\0'; |
521 | 3.21k | } else |
522 | 0 | _fix_double_quote(str, sp[i].count, column_data[i].name); |
523 | 3.21k | } |
524 | 5 | column_data[0].context_deinit = _ccv_cnnp_csv_deinit; |
525 | 5 | ccv_cnnp_dataframe_t* dataframe = ccv_cnnp_dataframe_new(column_data, column_count, row_count - !!include_header); |
526 | 5 | if (include_header) |
527 | 3.21k | for (i = 0; 1 i < column_count; i++3.21k ) |
528 | 3.21k | ccfree(column_data[i].name); |
529 | 5 | ccfree(column_data); |
530 | 5 | return dataframe; |
531 | 5 | } |