Coverage Report

Created: 2022-07-27 23:53

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_cnnp_dataframe_csv.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_cnnp_dataframe.h"
6
7
#include <sys/mman.h>
8
9
// MARK - Create Dataframe from Comma-separated-values Files
10
11
// Per-chunk newline statistics collected by the first scan over the input.
// A chunk cannot know by itself whether it starts inside a quoted cell, so the
// scan tracks both possibilities, keyed by the parity of the quotes seen so far
// inside the chunk. A later serial pass copies the odd_* values into even_* for
// chunks that turn out to start inside an open quote.
typedef struct {
  // Number of '\n' found while the chunk-local quote count was even.
  int even;
  // Number of '\n' found while the chunk-local quote count was odd.
  int odd;
  // Chunk-relative offset of the first '\n' counted in `even` (-1 when a
  // full-size chunk has none; the trailing partial chunk stores its size).
  int even_starter;
  // Same as even_starter for `odd`. After the serial pass this field is reused
  // to hold the index of the row at which parsing of this chunk starts.
  int odd_starter;
  // Total number of quote characters found in the chunk.
  int quotes;
} csv_crlf_t;
18
19
13.1M
// Evaluates to non-zero iff at least one of the eight bytes of the 64-bit word
// v is 0x00. XOR a word with a byte pattern repeated eight times first to test
// whether it contains that byte. The argument is fully parenthesized so that an
// expression (e.g. `word ^ mask`) can be passed directly; it is evaluated twice,
// so it must not have side effects.
#define ANY_ZEROS(v) (((v) - (uint64_t)0x0101010101010101) & ((~(v)) & (uint64_t)0x8080808080808080))
20
21
// Copy `count` bytes of a quoted CSV cell from `src` to `dest`, collapsing each
// escaped quote pair ("") into a single '"', and NUL-terminate the result.
//
// src:   first byte of the cell contents (need not be NUL-terminated).
// count: number of bytes of `src` that belong to the cell.
// dest:  output buffer; must have room for at least count + 1 bytes.
//
// Only bytes src[0 .. count - 1] are read. When `src` is null or `count` is not
// positive, `dest` (if non-null) receives an empty string, so callers never see
// stale or uninitialised buffer contents.
// NOTE(review): the quote character is hard-coded to '"' here even though the
// parser accepts a configurable quote character - confirm this is intended.
static inline void _fix_double_quote(const char* src, int count, char* dest)
{
  if (!src || count <= 0)
  {
    if (dest)
      dest[0] = '\0';
    return;
  }
  char prev_char = src[0];
  *dest++ = prev_char;
  int pos = 1;
  while (pos < count)
  {
    // Second half of an escaped quote pair: drop it.
    if (prev_char == '"' && src[pos] == '"')
    {
      ++pos;
      // The pair was the last thing in the cell; do not read src[count].
      if (pos >= count)
        break;
    }
    prev_char = src[pos];
    *dest++ = prev_char;
    ++pos;
  }
  *dest = '\0';
}
41
42
// Shared context of every column of a CSV-backed dataframe. The table of
// ccv_cnnp_csv_str_view_t cells (row-major, column_size entries per row) is
// stored immediately after this struct in the same allocation.
typedef struct {
  // First byte of the CSV text; cell offsets are relative to this pointer.
  const char* data;
  // Base address handed to munmap() on teardown when non-null.
  void* mmap;
  // Size of the CSV text in bytes.
  size_t file_size;
  // Number of columns per row.
  int column_size;
  // 1 when the first row is a header (it is skipped when enumerating), else 0.
  int include_header;
  // Cell delimiter character.
  char delim;
  // Quote character.
  char quote;
} ccv_cnnp_csv_t;
51
52
// Location of one CSV cell inside the source text.
// The three bit-fields are meant to share a single 64-bit word: widening the
// view to 128 bits would double the memory bandwidth and slow everything down.
// Readers test the whole word against 0 to detect an empty (null) cell.
typedef struct {
  // Byte offset of the first character of the cell from the start of the text.
  uint64_t str:48;
  // Cell length in bytes, saturated at 0x7fff; the saturated value means the
  // real length is unknown and has to be found by rescanning the text.
  uint16_t count:15;
  // 1 when the cell contains no escaped ("") quote, so it can be copied verbatim.
  uint8_t no_double_quote:1;
} ccv_cnnp_csv_str_view_t;
59
60
// Column enumerator for a CSV-backed dataframe: materialise the cells of column
// `column_idx` for the requested rows as NUL-terminated C strings.
//
// column_idx:     column to read.
// row_idxs:       `row_size` row indices (the header row, if any, is skipped
//                 automatically, so index 0 is the first data row).
// data:           in/out array of `row_size` string buffers. A null entry is
//                 allocated; a non-null entry is reused and grown when too small.
//                 Each buffer is preceded by an int holding its capacity in
//                 bytes. Empty (null) cells free the buffer and store 0.
// context:        the ccv_cnnp_csv_t describing the parsed CSV.
// stream_context: unused.
void _ccv_cnnp_csv_enum(const int column_idx, const int* const row_idxs, const int row_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_cnnp_csv_t* const csv = (ccv_cnnp_csv_t*)context;
  const int column_size = csv->column_size;
  const int include_header = csv->include_header;
  const char quote = csv->quote;
  const char delim = csv->delim;
  // The cell table lives right after the context; skip the header row if present.
  const ccv_cnnp_csv_str_view_t* const sp = (const ccv_cnnp_csv_str_view_t*)(csv + 1) + column_idx + include_header * column_size;
  int i;
  for (i = 0; i < row_size; i++)
  {
    const int row_idx = row_idxs[i];
    const ccv_cnnp_csv_str_view_t* const csp = sp + row_idx * column_size;
    // This is the same as (csp->str == 0 && csp->no_double_quote == 0 && csp->count == 0).
    // If a string has 0 length, it cannot contain double quote, therefore, this condition
    // implies the pointer is null.
    if (((const uint64_t*)csp)[0] == 0)
    {
      if (data[i])
      {
        int* hdr = (int*)data[i] - 1;
        ccfree(hdr);
      }
      data[i] = 0;
      continue;
    }
    const char* str = csv->data + csp->str;
    int count = 0;
    if (csp->count == 0x7fff) // We don't know the count yet. In this case, go over to find it.
    {
      const char* const p_end = csv->data + csv->file_size;
      // If the cell is quoted, the opening quote sits right before str.
      int quotes = (str > csv->data && str[-1] == quote) ? 1 : 0;
      const char* p = str;
      const char* quote_end = 0;
      for (; p < p_end; p++)
      {
        if (p[0] == quote)
        {
          ++quotes;
          quote_end = p;
        } else if (!(quotes & 1)) {
          if (p[0] == delim || p[0] == '\r' || p[0] == '\n')
            break;
        }
      }
      // Either we stopped at the terminator of the cell, or we ran off the end of
      // the text (last cell of a file without a trailing newline). In both cases
      // the cell ends at the last quote seen, if any, otherwise at p.
      if (quote_end && quote_end >= str)
        count = (int)(quote_end - str);
      else
        count = (int)(p - str);
    } else
      count = csp->count;
    if (!data[i])
    {
      int* const hdr = (int*)ccmalloc(sizeof(int) + count + 1);
      hdr[0] = count + 1;
      data[i] = (char*)(hdr + 1);
    } else {
      int* hdr = (int*)data[i] - 1;
      if (hdr[0] < count + 1)
      {
        hdr = (int*)ccrealloc(hdr, sizeof(int) + count + 1);
        hdr[0] = count + 1;
        data[i] = (char*)(hdr + 1);
      }
    }
    // An empty quoted cell ("") has count == 0 but is flagged as containing a
    // double quote; take the plain path so the (possibly reused) buffer is always
    // terminated and never keeps stale text from a previous row.
    if (csp->no_double_quote || count == 0)
    {
      memcpy(data[i], str, count);
      ((char*)data[i])[count] = '\0';
    } else
      _fix_double_quote(str, count, (char*)data[i]);
  }
}
135
136
// Release one string buffer produced by _ccv_cnnp_csv_enum.
// The pointer handed out points one int past the start of the real allocation
// (that int stores the buffer capacity), so step back before freeing.
// `context` is unused.
void _ccv_cnnp_csv_data_deinit(void* const data, void* const context)
{
  if (!data)
    return;
  int* const allocation = (int*)data - 1;
  ccfree(allocation);
}
144
145
void _ccv_cnnp_csv_deinit(void* const context)
146
5
{
147
5
  ccv_cnnp_csv_t* const csv = (ccv_cnnp_csv_t*)context;
148
5
  if (csv->mmap)
149
5
    munmap(csv->mmap, csv->file_size);
150
5
  ccfree(csv);
151
5
}
152
153
ccv_cnnp_dataframe_t* ccv_cnnp_dataframe_from_csv_new(void* const input, const int type, const size_t len, const char _delim, const char _quote, const int include_header, int* const column_size)
154
5
{
155
5
  assert(input);
156
5
  assert(column_size);
157
5
  size_t file_size;
158
5
  char* data;
159
5
  assert(type == CCV_CNNP_DATAFRAME_CSV_FILE || type == CCV_CNNP_DATAFRAME_CSV_MEMORY);
160
5
  if (type == CCV_CNNP_DATAFRAME_CSV_FILE)
161
5
  {
162
5
    FILE* file = (FILE*)input;
163
5
    const int fd = fileno(file);
164
5
    if (fd == -1)
165
0
      return 0;
166
5
    fseek(file, 0, SEEK_END);
167
5
    file_size = ftell(file);
168
5
    fseek(file, 0, SEEK_SET);
169
5
    if (file_size < 2)
170
0
      return 0;
171
5
    data = mmap(NULL, file_size, PROT_READ, MAP_SHARED, fd, 0);
172
5
    if (!data)
173
0
      return 0;
174
5
  } else {
175
0
    file_size = len;
176
0
    assert(len > 0);
177
0
    if (len < 2)
178
0
      return 0;
179
0
    data = input;
180
0
  }
181
  // We cannot handle file size larger than 2^48, which is around 281TB.
182
5
#if defined(__LP64__) || defined(_WIN64)
183
5
  assert(file_size <= 0xffffffffffffllu);
184
5
#endif
185
5
  const char delim = _delim ? _delim : 
','0
;
186
5
  const char quote = _quote ? _quote : 
'"'0
;
187
5
  const size_t chunk_size = 1024 * 1024;
188
5
  const int aligned_chunks = file_size / chunk_size;
189
5
  const int total_chunks = (file_size + chunk_size - 1) / chunk_size;
190
  // Get number of rows.
191
5
  csv_crlf_t* const crlf = cccalloc(total_chunks, sizeof(csv_crlf_t));
192
5
#define CSV_QUOTE_BR(c, n) \
193
5.88M
  do { \
194
5.88M
    if (c##n == quote) \
195
5.88M
      
++quotes12.9k
; \
196
5.88M
    else 
if (c5.87M
##n == '\n'5.87M
) { \
197
63
      ++count[quotes & 1]; \
198
63
      if (starter[quotes & 1] == -1) \
199
63
        
starter[quotes & 1] = (int)(p - p_start) + n11
; \
200
63
    } \
201
5.88M
  } while (0)
202
214
  
parallel_for112
(i, aligned_chunks) {
203
214
    const uint64_t* pd = (const uint64_t*)(data + i * chunk_size);
204
214
    const char* const p_start = (const char*)pd;
205
214
    const uint64_t* const pd_end = pd + chunk_size / sizeof(uint64_t);
206
214
    int quotes = 0;
207
214
    int starter[2] = {-1, -1};
208
214
    int count[2] = {0, 0};
209
568k
    for (; pd < pd_end; 
pd++568k
)
210
568k
    {
211
      // Load 8-bytes at batch.
212
568k
      const char* const p = (const char*)pd;
213
568k
      char c0, c1, c2, c3, c4, c5, c6, c7;
214
568k
      c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7];
215
568k
      CSV_QUOTE_BR(c, 0);
216
568k
      CSV_QUOTE_BR(c, 1);
217
568k
      CSV_QUOTE_BR(c, 2);
218
568k
      CSV_QUOTE_BR(c, 3);
219
568k
      CSV_QUOTE_BR(c, 4);
220
568k
      CSV_QUOTE_BR(c, 5);
221
568k
      CSV_QUOTE_BR(c, 6);
222
568k
      CSV_QUOTE_BR(c, 7);
223
568k
    }
224
214
    crlf[i].even = count[0];
225
214
    crlf[i].odd = count[1];
226
214
    crlf[i].even_starter = starter[0];
227
214
    crlf[i].odd_starter = starter[1];
228
214
    crlf[i].quotes = quotes;
229
214
  } 
parallel_endfor112
230
5
  if (total_chunks > aligned_chunks)
231
5
  {
232
5
    const int residual_size = file_size - chunk_size * aligned_chunks;
233
5
    const uint64_t* pd = (const uint64_t*)(data + chunk_size * aligned_chunks);
234
5
    const char* const p_start = (const char*)pd;
235
5
    const uint64_t* const pd_end = pd + residual_size / sizeof(uint64_t);
236
5
    int quotes = 0;
237
5
    int starter[2] = {-1, -1};
238
5
    int count[2] = {0, 0};
239
167k
    for (; pd < pd_end; 
pd++167k
)
240
167k
    {
241
167k
      const char* const p = (const char*)pd;
242
      // Load 8-bytes at batch.
243
167k
      char c0, c1, c2, c3, c4, c5, c6, c7;
244
167k
      c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7];
245
167k
      CSV_QUOTE_BR(c, 0);
246
167k
      CSV_QUOTE_BR(c, 1);
247
167k
      CSV_QUOTE_BR(c, 2);
248
167k
      CSV_QUOTE_BR(c, 3);
249
167k
      CSV_QUOTE_BR(c, 4);
250
167k
      CSV_QUOTE_BR(c, 5);
251
167k
      CSV_QUOTE_BR(c, 6);
252
167k
      CSV_QUOTE_BR(c, 7);
253
167k
    }
254
5
    const char* const p_end = data + file_size;
255
5
    const char* p = (const char*)pd_end;
256
11
    for (; p < p_end; 
p++6
)
257
6
    {
258
6
      const char c0 = p[0];
259
6
      CSV_QUOTE_BR(c, 0);
260
6
    }
261
5
    crlf[aligned_chunks].even = count[0];
262
5
    crlf[aligned_chunks].odd = count[1];
263
5
    crlf[aligned_chunks].even_starter = starter[0] < 0 ? 
residual_size0
: starter[0];
264
5
    crlf[aligned_chunks].odd_starter = starter[1] < 0 ? 
residual_size4
:
starter[1]1
;
265
5
    crlf[aligned_chunks].quotes = quotes;
266
5
  }
267
5
#undef CSV_QUOTE_BR
268
5
  int row_count = crlf[0].even;
269
5
  int quotes = crlf[0].quotes;
270
5
  crlf[0].odd_starter = 0;
271
5
  int i;
272
  // Go through all chunks serially to find exactly how many line ends in each chunk, moving that information to even*.
273
  // The odd_starter will record which row it currently at for this chunk.
274
23
  for (i = 1; i < total_chunks; 
i++18
)
275
18
  {
276
18
    if (quotes & 1)
277
5
    {
278
      // Even is the correct one, we will use that throughout.
279
5
      crlf[i].even = crlf[i].odd;
280
5
      crlf[i].even_starter = crlf[i].odd_starter;
281
5
    }
282
18
    crlf[i].odd_starter = row_count + 1;
283
18
    row_count += crlf[i].even;
284
18
    quotes += crlf[i].quotes;
285
18
  }
286
  // Didn't end with newline, one more row.
287
5
  if (!(data[file_size - 1] == '\n' || 
(0
data[file_size - 2] == '\n'0
&&
data[file_size - 1] == '\r'0
)))
288
0
    ++row_count;
289
  // Get number of columns.
290
5
  int column_count = 0;
291
5
  const uint64_t* pd = (const uint64_t*)data;
292
5
  int first_line_len = file_size;
293
10
  for (i = 0; i < total_chunks; 
i++5
)
294
10
    if (crlf[i].even_starter >= 0)
295
5
    {
296
5
      first_line_len = i * chunk_size + crlf[i].even_starter;
297
5
      break;
298
5
    }
299
5
  const uint64_t* const pd_end = pd + first_line_len / sizeof(uint64_t);
300
5
#define CSV_QUOTE_BR(cn) \
301
5.81M
  do { \
302
5.81M
    if (cn == quote) \
303
5.81M
      
++quotes12.8k
; \
304
5.81M
    else 
if (5.79M
!(quotes & 1)5.79M
) { \
305
5.76M
      if (cn == delim) \
306
5.76M
        
++column_count6.42k
; \
307
5.76M
    } \
308
5.81M
  } while (0)
309
5
  quotes = 0;
310
726k
  for (; pd < pd_end; 
pd++726k
)
311
726k
  {
312
726k
    const char* const p = (const char*)pd;
313
726k
    char c0, c1, c2, c3, c4, c5, c6, c7;
314
726k
    c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7];
315
726k
    CSV_QUOTE_BR(c0);
316
726k
    CSV_QUOTE_BR(c1);
317
726k
    CSV_QUOTE_BR(c2);
318
726k
    CSV_QUOTE_BR(c3);
319
726k
    CSV_QUOTE_BR(c4);
320
726k
    CSV_QUOTE_BR(c5);
321
726k
    CSV_QUOTE_BR(c6);
322
726k
    CSV_QUOTE_BR(c7);
323
726k
  }
324
  // If haven't reached the flag yet (i.e., haven't reached a new line).
325
5
  const char* p = (const char*)pd;
326
5
  const char* const p_end = data + first_line_len;
327
15
  for (; p < p_end; 
p++10
)
328
10
  {
329
10
    const char c0 = p[0];
330
10
    CSV_QUOTE_BR(c0);
331
10
  }
332
5
#undef CSV_QUOTE_BR
333
5
  ++column_count; // column count is 1 more than delimiter.
334
5
  if (row_count == 0) // This is possible because you have an open quote, and then \n is inside the open quote, which won't be recognized.
335
0
  {
336
0
    ccfree(crlf);
337
0
    if (type == CCV_CNNP_DATAFRAME_CSV_FILE)
338
0
      munmap(data, file_size);
339
0
    return 0;
340
0
  }
341
  // We only mark the beginning and the end of a cell. Removing double-quote etc will be left when iterating.
342
5
  ccv_cnnp_csv_t* const csv = (ccv_cnnp_csv_t*)ccmalloc(sizeof(ccv_cnnp_csv_t) + sizeof(ccv_cnnp_csv_str_view_t) * row_count * column_count);
343
5
  csv->column_size = column_count;
344
5
  csv->include_header = !!include_header;
345
5
  ccv_cnnp_csv_str_view_t* const sp = (ccv_cnnp_csv_str_view_t*)(csv + 1);
346
5
  memset(sp, 0, sizeof(ccv_cnnp_csv_str_view_t) * row_count * column_count);
347
5
  const uint64_t delim_mask = (uint64_t)0x0101010101010101 * (uint64_t)delim;
348
5
  const uint64_t quote_mask = (uint64_t)0x0101010101010101 * (uint64_t)quote;
349
5
  const uint64_t lf_mask = (uint64_t)0x0101010101010101 * (uint64_t)'\n';
350
5
  const uint64_t cr_mask = (uint64_t)0x0101010101010101 * (uint64_t)'\r';
351
5
#define CSV_QUOTE_BR(c, n) \
352
1.09M
  do { \
353
1.09M
    if (c##n == quote) \
354
1.09M
    { \
355
      /* If the preceding one is not a quote. Set it to be null-terminator temporarily. */ \
356
12.9k
      ++quotes; \
357
12.9k
      quote_end = p + n; \
358
12.9k
      if (!preceding_quote) \
359
12.9k
        
preceding_quote = 112.9k
; \
360
12.9k
      else \
361
12.9k
        
double_quote = 14
; \
362
1.08M
    } else { \
363
1.08M
      preceding_quote = 0; \
364
1.08M
      if (!(quotes & 1)) \
365
1.22M
      { \
366
1.22M
        if (c##n == delim) \
367
1.22M
        { \
368
160k
          if (chunk_row_count < row_count) \
369
160k
          { \
370
160k
            if (chunk_column_count < column_count) \
371
160k
            { \
372
160k
              int count; \
373
160k
              if (quote_end > 0 && 
quote_end - data >= csp[chunk_column_count].str153k
) \
374
160k
                
count = (int)((quote_end - data) - csp[chunk_column_count].str)6.42k
; \
375
160k
              else \
376
160k
                
count = (int)((p + n - data) - csp[chunk_column_count].str)154k
; \
377
160k
              csp[chunk_column_count].count = ccv_min(count, 0x7fff); \
378
160k
              csp[chunk_column_count].no_double_quote = !double_quote; \
379
160k
            } \
380
160k
            ++chunk_column_count; \
381
160k
            if (chunk_column_count < column_count) \
382
              /* Skip quote if presented. */ \
383
160k
              
csp[chunk_column_count].str = (160k
p + (n + 1) < p_end160k
&&
p[n + 1] == quote160k
?
p + (n + 2)6.47k
:
p + (n + 1)153k
) - data; \
384
160k
          } \
385
160k
          double_quote = 0; \
386
1.06M
        } else if (c##n == '\n') { \
387
56
          if (chunk_row_count < row_count && chunk_column_count < column_count) \
388
56
          { \
389
56
            int count; \
390
56
            if (quote_end > 0 && 
quote_end - data >= csp[chunk_column_count].str55
) \
391
56
              
count = (int)((quote_end - data) - csp[chunk_column_count].str)49
; \
392
56
            else 
if (7
p + n > data7
&&
p[n - 1] == '\r'7
) \
393
7
              
count = (int)((p + n - 1 - data) - csp[chunk_column_count].str)1
; \
394
7
            else \
395
7
              
count = (int)((p + n - data) - csp[chunk_column_count].str)6
; \
396
56
            csp[chunk_column_count].count = ccv_min(count, 0x7fff); \
397
56
            csp[chunk_column_count].no_double_quote = !double_quote; \
398
56
          } \
399
56
          ++chunk_row_count; \
400
56
          csp += column_count; \
401
56
          chunk_column_count = 0; \
402
56
          if (chunk_row_count < row_count) \
403
56
          { \
404
52
            if (p + (n + 1) < p_end && p[n + 1] == '\r') \
405
52
              
csp[0].str = (0
p + (n + 2) < p_end0
&&
p[n + 2] == quote0
?
p + (n + 3)0
:
p + (n + 2)0
) - data; \
406
52
            else \
407
52
              csp[0].str = (p + (n + 1) < p_end && p[n + 1] == quote ? 
p + (n + 2)0
: p + (n + 1)) - data; \
408
52
          } \
409
56
          double_quote = 0; \
410
56
        } \
411
1.22M
      } \
412
1.08M
    } \
413
1.09M
  } while (0)
414
174
  
parallel_for92
(i, total_chunks) {
415
    // Skip if existing one don't have a line starter.
416
174
    if (
i > 087
&&
crlf[i].even_starter < 07
)
417
9
      continue;
418
78
    const char* p = (i == 0) ? 
data5
:
data + i * chunk_size + crlf[i].even_starter + 173
;
419
78
    const char* p_end = data + file_size;
420
78
    int j;
421
91
    for (j = i + 1; j < total_chunks; 
j++13
)
422
18
      if (crlf[j].even_starter >= 0)
423
5
      {
424
5
        p_end = data + j * chunk_size + crlf[j].even_starter;
425
5
        break;
426
5
      }
427
78
    if (p_end <= p)
428
1
      continue;
429
77
    int chunk_row_count = crlf[i].odd_starter;
430
77
    ccv_cnnp_csv_str_view_t* csp = sp + (uintptr_t)column_count * chunk_row_count;
431
77
    if (chunk_row_count < row_count)
432
9
    {
433
9
      if (p[0] == '\r')
434
0
        csp[0].str = (p + 1 < p_end && p[1] == quote ? p + 2 : p + 1) - data;
435
9
      else
436
9
        csp[0].str = (p[0] == quote ? 
p + 13
:
p6
) - data;
437
9
    }
438
77
    int chunk_column_count = 0;
439
77
    int quotes = 0;
440
77
    int preceding_quote = 0;
441
77
    int double_quote = 0;
442
77
    const char* quote_end = 0;
443
77
    const int padding = ccv_min(0x7 - (((uintptr_t)p - 1) & 0x7), (int)(p_end - p));
444
100
    for (j = 0; j < padding; 
j++, p++23
)
445
23
    {
446
23
      char c0 = p[0];
447
28
      CSV_QUOTE_BR(c, 0);
448
23
    }
449
77
    const size_t cur_chunk_size = (size_t)(p_end - p);
450
77
    const uint64_t* pd = (const uint64_t*)p;
451
77
    const uint64_t* pd_end = pd + cur_chunk_size / sizeof(uint64_t);
452
2.01M
    for (; pd < pd_end; 
pd++2.01M
)
453
2.01M
    {
454
2.01M
      const uint64_t v = *pd;
455
2.01M
      const uint64_t delim_v = v ^ delim_mask;
456
2.01M
      const uint64_t quote_v = v ^ quote_mask;
457
2.01M
      const uint64_t lf_v = v ^ lf_mask;
458
2.01M
      const uint64_t cr_v = v ^ cr_mask;
459
      // If it doesn't contain any zeros, skip the logic.
460
2.01M
      if (!ANY_ZEROS(delim_v) && 
!1.86M
ANY_ZEROS1.86M
(quote_v) &&
!1.86M
ANY_ZEROS1.86M
(lf_v) &&
!1.38M
ANY_ZEROS1.38M
(cr_v))
461
1.87M
        continue;
462
      // Need to check and assign the length and starting point.
463
137k
      p = (const char*)pd;
464
      // Load 8-bytes at batch.
465
137k
      char c0, c1, c2, c3, c4, c5, c6, c7;
466
137k
      c0 = p[0], c1 = p[1], c2 = p[2], c3 = p[3], c4 = p[4], c5 = p[5], c6 = p[6], c7 = p[7];
467
137k
      CSV_QUOTE_BR(c, 0);
468
137k
      CSV_QUOTE_BR(c, 1);
469
137k
      CSV_QUOTE_BR(c, 2);
470
137k
      CSV_QUOTE_BR(c, 3);
471
137k
      CSV_QUOTE_BR(c, 4);
472
137k
      CSV_QUOTE_BR(c, 5);
473
137k
      CSV_QUOTE_BR(c, 6);
474
137k
      CSV_QUOTE_BR(c, 7);
475
137k
    }
476
77
    p = (const char*)pd;
477
95
    for (; p < p_end; 
p++18
)
478
18
    {
479
18
      char c0 = p[0];
480
18
      CSV_QUOTE_BR(c, 0);
481
18
    }
482
77
    if (chunk_row_count < row_count && 
chunk_column_count < column_count5
)
483
5
    {
484
5
      int count;
485
5
      if (quote_end > 0 && 
quote_end - data >= csp[chunk_column_count].str3
)
486
3
        count = (int)(quote_end - data - csp[chunk_column_count].str);
487
2
      else
488
2
        count = (int)(p - data - csp[chunk_column_count].str);
489
5
      csp[chunk_column_count].count = ccv_min(count, 0x7fff);
490
5
      csp[chunk_column_count].no_double_quote = !double_quote;
491
5
    }
492
82
  } parallel_endfor
493
5
#undef CSV_QUOTE_BR
494
5
  ccfree(crlf);
495
5
  csv->data = data;
496
5
  assert(file_size > 0);
497
5
  csv->file_size = file_size;
498
5
  csv->delim = delim;
499
5
  csv->quote = quote;
500
5
  if (type == CCV_CNNP_DATAFRAME_CSV_FILE)
501
5
    csv->mmap = data;
502
5
  *column_size = column_count;
503
5
  assert(column_count > 0);
504
5
  ccv_cnnp_column_data_t* const column_data = (ccv_cnnp_column_data_t*)cccalloc(column_count, sizeof(ccv_cnnp_column_data_t));
505
6.43k
  for (i = 0; i < column_count; 
i++6.43k
)
506
6.43k
  {
507
6.43k
    column_data[i].data_enum = _ccv_cnnp_csv_enum;
508
6.43k
    column_data[i].context = csv;
509
6.43k
    column_data[i].data_deinit = _ccv_cnnp_csv_data_deinit;
510
6.43k
  }
511
5
  if (include_header)
512
3.21k
    
for (i = 0; 1
i < column_count;
i++3.21k
)
513
3.21k
      if (((uint64_t*)sp)[i] != 0)
514
3.21k
      {
515
3.21k
        column_data[i].name = (char*)ccmalloc(sp[i].count + 1);
516
3.21k
        const char* str = data + sp[i].str;
517
3.21k
        if (sp[i].no_double_quote)
518
3.21k
        {
519
3.21k
          memcpy(column_data[i].name, str, sp[i].count);
520
3.21k
          column_data[i].name[sp[i].count] = '\0';
521
3.21k
        } else
522
0
          _fix_double_quote(str, sp[i].count, column_data[i].name);
523
3.21k
      }
524
5
  column_data[0].context_deinit = _ccv_cnnp_csv_deinit;
525
5
  ccv_cnnp_dataframe_t* dataframe = ccv_cnnp_dataframe_new(column_data, column_count, row_count - !!include_header);
526
5
  if (include_header)
527
3.21k
    
for (i = 0; 1
i < column_count;
i++3.21k
)
528
3.21k
      ccfree(column_data[i].name);
529
5
  ccfree(column_data);
530
5
  return dataframe;
531
5
}