Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_cnnp_dataframe_addons.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_cnnp_dataframe.h"
6
#include "3rdparty/sfmt/SFMT.h"
7
8
// MARK - Create Dataframe from Array
9
10
static void _ccv_cnnp_array_enum(const int column_idx, const int* const row_idxs, const int row_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
11
380
{
12
380
  int i;
13
380
  ccv_array_t* const array = (ccv_array_t*)context;
14
180k
  for (i = 0; i < row_size; 
i++180k
)
15
180k
    data[i] = ccv_array_get(array, row_idxs[i]);
16
380
}
17
18
ccv_cnnp_dataframe_t* ccv_cnnp_dataframe_from_array_new(ccv_array_t* const array)
19
18
{
20
18
  const ccv_cnnp_column_data_t array_column_data = {
21
18
    .data_enum = _ccv_cnnp_array_enum,
22
18
    .context = array
23
18
  };
24
18
  return ccv_cnnp_dataframe_new(&array_column_data, 1, array->rnum);
25
18
}
26
27
typedef struct {
28
  ccv_cnnp_dataframe_tuple_t tuple;
29
  int tensor_offset;
30
  int device_id;
31
} ccv_cnnp_copy_to_gpu_context_t;
32
33
// MARK - Copy Tensors from CPU to GPU
34
35
static void _ccv_cnnp_tensor_list_deinit(void* const data, void* const context)
36
3.92k
{
37
3.92k
  ccv_cnnp_dataframe_tuple_t* const tuple = (ccv_cnnp_dataframe_tuple_t*)context;
38
3.92k
  ccv_nnc_tensor_t** const tensor_list = (ccv_nnc_tensor_t**)data;
39
3.92k
  int i;
40
7.88k
  for (i = 0; i < tuple->size; 
i++3.96k
)
41
3.96k
    if (tensor_list[i])
42
3.96k
      ccv_nnc_tensor_free(tensor_list[i]);
43
3.92k
  ccfree(tensor_list);
44
3.92k
}
45
46
static void _ccv_cnnp_copy_to_gpu(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
47
748
{
48
748
  const ccv_cnnp_copy_to_gpu_context_t* const copy_to_gpu_context = (ccv_cnnp_copy_to_gpu_context_t*)context;
49
748
  int i, j;
50
1.49k
  for (i = 0; i < batch_size; 
i++748
)
51
748
  {
52
748
    ccv_nnc_tensor_t* const* const inputs = (ccv_nnc_tensor_t* const*)column_data[0][i] + copy_to_gpu_context->tensor_offset;
53
748
    ccv_nnc_tensor_t** outputs = (ccv_nnc_tensor_t**)data[i];
54
748
    if (!outputs)
55
67
      outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(copy_to_gpu_context->tuple.size, sizeof(ccv_nnc_tensor_t*)));
56
2.09k
    for (j = 0; j < copy_to_gpu_context->tuple.size; 
j++1.34k
)
57
1.34k
    {
58
1.34k
      ccv_nnc_tensor_param_t params = inputs[j]->info;
59
1.34k
      params.type &= ~CCV_TENSOR_CPU_MEMORY;
60
1.34k
      params.type |= CCV_TENSOR_GPU_MEMORY; // Change to GPU memory.
61
1.34k
      CCV_TENSOR_SET_DEVICE_ID(params.type, copy_to_gpu_context->device_id);
62
1.34k
      outputs[j] = outputs[j] ? 
ccv_nnc_tensor_resize(outputs[j], params)1.25k
:
ccv_nnc_tensor_new(0, params, 0)93
;
63
1.34k
      ccv_nnc_tensor_pin_memory(inputs[j]);
64
1.34k
    }
65
748
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, copy_to_gpu_context->tuple.size, outputs, copy_to_gpu_context->tuple.size, stream_context);
66
748
  }
67
748
}
68
69
int ccv_cnnp_dataframe_copy_to_gpu(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const int tensor_offset, const int tensor_size, const int device_id, const char* name)
70
58
{
71
58
  assert(tensor_size > 0);
72
58
  int stream_type = CCV_STREAM_CONTEXT_GPU;
73
58
  CCV_STREAM_SET_DEVICE_ID(stream_type, device_id);
74
58
  ccv_cnnp_copy_to_gpu_context_t* const copy_to_gpu_context = (ccv_cnnp_copy_to_gpu_context_t*)ccmalloc(sizeof(ccv_cnnp_copy_to_gpu_context_t));
75
58
  copy_to_gpu_context->tuple.size = tensor_size;
76
58
  copy_to_gpu_context->tensor_offset = tensor_offset;
77
58
  copy_to_gpu_context->device_id = device_id;
78
58
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_copy_to_gpu, stream_type, _ccv_cnnp_tensor_list_deinit, COLUMN_ID_LIST(column_idx), copy_to_gpu_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
79
58
}
80
81
// MARK - Use Command to Generate Output Tuple
82
83
typedef struct {
84
  ccv_cnnp_dataframe_tuple_t tuple;
85
  int input_offset;
86
  int input_size;
87
  ccv_nnc_cmd_t cmd;
88
  ccv_nnc_hint_t hint;
89
  int flags;
90
  ccv_nnc_tensor_param_t output_params[1];
91
} ccv_cnnp_cmd_exec_context_t;
92
93
static void _ccv_cnnp_dataframe_cmd_exec(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
94
296
{
95
296
  const ccv_cnnp_cmd_exec_context_t* const cmd_exec_context = (ccv_cnnp_cmd_exec_context_t*)context;
96
296
  int i, j;
97
120k
  for (i = 0; i < batch_size; 
i++120k
)
98
120k
  {
99
120k
    ccv_nnc_tensor_t* const* const inputs = (ccv_nnc_tensor_t* const*)column_data[0][i] + cmd_exec_context->input_offset;
100
120k
    ccv_nnc_tensor_t** outputs = (ccv_nnc_tensor_t**)data[i];
101
120k
    if (!outputs)
102
3.84k
    {
103
3.84k
      outputs = (ccv_nnc_tensor_t**)(data[i] = ccmalloc(sizeof(ccv_nnc_tensor_t*) * cmd_exec_context->tuple.size));
104
7.68k
      for (j = 0; j < cmd_exec_context->tuple.size; 
j++3.84k
)
105
3.84k
        outputs[j] = ccv_nnc_tensor_new(0, cmd_exec_context->output_params[j], 0);
106
3.84k
    }
107
120k
    ccv_nnc_cmd_exec(cmd_exec_context->cmd, cmd_exec_context->hint, cmd_exec_context->flags, inputs, cmd_exec_context->input_size, outputs, cmd_exec_context->tuple.size, stream_context);
108
120k
  }
109
296
}
110
111
int ccv_cnnp_dataframe_cmd_exec(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, const int input_offset, const int input_size, const ccv_nnc_tensor_param_t* const output_params, const int output_size, const int stream_type, const char* name)
112
5
{
113
5
  assert(input_size > 0);
114
5
  assert(output_size > 0);
115
5
  ccv_cnnp_cmd_exec_context_t* const cmd_exec_context = (ccv_cnnp_cmd_exec_context_t*)ccmalloc(sizeof(ccv_cnnp_cmd_exec_context_t) + sizeof(ccv_nnc_tensor_param_t) * (output_size - 1));
116
5
  cmd_exec_context->tuple.size = output_size;
117
5
  cmd_exec_context->input_offset = input_offset;
118
5
  cmd_exec_context->input_size = input_size;
119
5
  cmd_exec_context->cmd = cmd;
120
5
  cmd_exec_context->hint = hint;
121
5
  cmd_exec_context->flags = flags;
122
5
  memcpy(cmd_exec_context->output_params, output_params, sizeof(ccv_nnc_tensor_param_t) * output_size);
123
5
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_dataframe_cmd_exec, stream_type, _ccv_cnnp_tensor_list_deinit, COLUMN_ID_LIST(column_idx), cmd_exec_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
124
5
  
return 00
;
125
5
}
126
127
// MARK - Make Auxiliary Tensor as a new Column
128
129
static void _ccv_cnnp_tensor_deinit(void* const data, void* const context)
130
5.15k
{
131
5.15k
  ccv_nnc_tensor_free((ccv_nnc_tensor_t*)data);
132
5.15k
}
133
134
static void _ccv_cnnp_tensor_new(const int column_idx, const int* const row_idxs, const int row_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
135
708
{
136
708
  ccv_nnc_tensor_param_t params = *(ccv_nnc_tensor_param_t*)context;
137
708
  int i;
138
1.41k
  for (i = 0; i < row_size; 
i++708
)
139
708
    if (!data[i])
140
27
      data[i] = ccv_nnc_tensor_new(0, params, 0);
141
708
}
142
143
int ccv_cnnp_dataframe_add_aux(ccv_cnnp_dataframe_t* const dataframe, const ccv_nnc_tensor_param_t params, const char* name)
144
18
{
145
18
  int stream_type = CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_CPU_MEMORY ? 
00
: CCV_STREAM_CONTEXT_GPU;
146
18
  if (stream_type == CCV_STREAM_CONTEXT_GPU)
147
18
    CCV_STREAM_SET_DEVICE_ID(stream_type, CCV_TENSOR_GET_DEVICE_ID(params.type));
148
18
  ccv_nnc_tensor_param_t* const context = (ccv_nnc_tensor_param_t*)ccmalloc(sizeof(ccv_nnc_tensor_param_t));
149
18
  context[0] = params;
150
18
  return ccv_cnnp_dataframe_add(dataframe, _ccv_cnnp_tensor_new, stream_type, _ccv_cnnp_tensor_deinit, context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
151
18
}
152
153
// MARK - Load Tensor from File Path
154
155
static void _ccv_cnnp_image_deinit(void* const data, void* const context)
156
4.60k
{
157
4.60k
  ccv_matrix_free(data);
158
4.60k
}
159
160
static void _ccv_cnnp_read_image(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
161
0
{
162
0
  parallel_for(i, batch_size) {
163
0
    if (data[i])
164
0
      ccv_matrix_free(data[i]);
165
0
    off_t structof = (off_t)context;
166
0
    const char* const filename = *(char* const*)((const char*)column_data[0][i] + structof);
167
0
    data[i] = 0;
168
0
    ccv_read(filename, (ccv_dense_matrix_t**)&data[i], CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
169
0
  } parallel_endfor
170
0
}
171
172
int ccv_cnnp_dataframe_read_image(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const off_t structof, const char* name)
173
0
{
174
0
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_read_image, 0, _ccv_cnnp_image_deinit, COLUMN_ID_LIST(column_idx), (void*)(uintptr_t)structof, 0, name);
175
0
}
176
177
// MARK - Apply Random Jitter to Image
178
179
typedef struct {
180
  sfmt_t sfmt;
181
  int datatype;
182
  ccv_cnnp_random_jitter_t random_jitter;
183
} ccv_cnnp_random_jitter_context_t;
184
185
static void _ccv_cnnp_image_lighting(ccv_dense_matrix_t* image, const float alpha_r, const float alpha_g, const float alpha_b)
186
1
{
187
1
  assert(CCV_GET_DATA_TYPE(image->type) == CCV_32F);
188
1
  assert(CCV_GET_CHANNEL(image->type) == CCV_C3);
189
1
  // These eigenvector values can be computed out of imageNet dataset (see ccv_convnet for how that is done). Here I just copied
190
1
  // from mxnet: https://github.com/apache/incubator-mxnet/blob/master/src/operator/image/image_random-inl.h#L632
191
1
  const float pca_r = alpha_r * (55.46 * -0.5675) + alpha_g * (4.794 * 0.7192) + alpha_b * (1.148 * 0.4009);
192
1
  const float pca_g = alpha_r * (55.46 * -0.5808) + alpha_g * (4.794 * -0.0045) + alpha_b * (1.148 * -0.8140);
193
1
  const float pca_b = alpha_r * (55.46 * -0.5836) + alpha_g * (4.794 * -0.6948) + alpha_b * (1.148 * 0.4203);
194
1
  int i;
195
1
  const int size = image->rows * image->cols;
196
1
  float* const ptr = image->data.f32;
197
53.3k
  for (i = 0; i < size; 
i++53.3k
)
198
53.3k
  {
199
53.3k
    ptr[i * 3] = ccv_clamp(ptr[i * 3] + pca_r, 0, 255);
200
53.3k
    ptr[i * 3 + 1] = ccv_clamp(ptr[i * 3 + 1] + pca_g, 0, 255);
201
53.3k
    ptr[i * 3 + 2] = ccv_clamp(ptr[i * 3 + 2] + pca_b, 0, 255);
202
53.3k
  }
203
1
}
204
205
static float _ccv_cnnp_random_logexp(sfmt_t* const sfmt, const float jitter)
206
4
{
207
4
  // We want to get something around logarithmic scale, thus, 0 is no good, and infinity is no good. 1 is the same.
208
4
  // jitter is some turbulence we want around 1. We want the range range to be around [1 / (1 + jitter), 1 + jitter]
209
4
  // but the distribution is not uniform (50% fall under 1, and 50% fall above 1). The way to do this is to first
210
4
  // get to logarithmic range, doing a uniform sampling, and then convert back.
211
4
  double log_jitter_limit = log(1 + jitter);
212
4
  double log_random_jitter = sfmt_genrand_real1(sfmt) * 2 * log_jitter_limit - log_jitter_limit;
213
4
  return (float)exp(log_random_jitter); // Convert it back to exponential form.
214
4
}
215
216
static void _ccv_cnnp_image_manip(ccv_dense_matrix_t* image, const ccv_cnnp_random_jitter_t random_jitter, sfmt_t* const sfmt)
217
149k
{
218
149k
  assert(sfmt && CCV_GET_CHANNEL(image->type) == CCV_C3);
219
149k
  int idx[4] = {0, 1, 2, 3};
220
149k
  sfmt_genrand_shuffle(sfmt, idx, 4, sizeof(int));
221
149k
  int i;
222
733k
  for (i = 0; i < 4; 
i++584k
)
223
587k
    // change the applying order
224
587k
    switch (idx[i])
225
587k
    {
226
147k
      case 0:
227
147k
        if (random_jitter.brightness == 0)
228
147k
          break;
229
18.4E
        // introduce some brightness changes to the original image
230
18.4E
        ccv_scale(image, (ccv_matrix_t**)&image, 0, _ccv_cnnp_random_logexp(sfmt, random_jitter.brightness));
231
18.4E
        break;
232
18.4E
      case 1:
233
148k
        // introduce some saturation changes to the original image
234
148k
        if (random_jitter.saturation == 0)
235
148k
          break;
236
18.4E
        ccv_saturation(image, &image, 0, _ccv_cnnp_random_logexp(sfmt, random_jitter.saturation));
237
18.4E
        break;
238
18.4E
      case 2:
239
147k
        // introduce some contrast changes to the original image
240
147k
        if (random_jitter.contrast == 0)
241
148k
          break;
242
18.4E
        ccv_contrast(image, &image, 0, _ccv_cnnp_random_logexp(sfmt, random_jitter.contrast));
243
18.4E
        break;
244
18.4E
      case 3:
245
145k
        if (random_jitter.lighting == 0)
246
146k
          break;
247
18.4E
        _ccv_cnnp_image_lighting(image, sfmt_genrand_real1(sfmt) * random_jitter.lighting, sfmt_genrand_real1(sfmt) * random_jitter.lighting, sfmt_genrand_real1(sfmt) * random_jitter.lighting);
248
18.4E
        break;
249
587k
    }
250
149k
}
251
252
static void _ccv_cnnp_normalize(ccv_dense_matrix_t* const image, const float mean[3], const float inv_std[3])
253
145k
{
254
145k
  int i;
255
145k
  const int count = image->rows * image->cols;
256
145k
  float* ap = image->data.f32;
257
63.7M
  for (i = 0; i < count; 
i++63.5M
)
258
63.5M
  {
259
63.5M
    ap[i * 3] = (ap[i * 3] - mean[0]) * inv_std[0];
260
63.5M
    ap[i * 3 + 1] = (ap[i * 3 + 1] - mean[1]) * inv_std[1];
261
63.5M
    ap[i * 3 + 2] = (ap[i * 3 + 2] - mean[2]) * inv_std[2];
262
63.5M
  }
263
145k
}
264
265
static void _ccv_cnnp_random_jitter(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
266
295
{
267
295
  sfmt_t* const sfmt = (sfmt_t*)ccmalloc(sizeof(sfmt_t) * batch_size);
268
295
  ccv_cnnp_random_jitter_context_t* const ctx = (ccv_cnnp_random_jitter_context_t*)context;
269
295
  int i;
270
150k
  for (i = 0; i < batch_size; 
i++150k
)
271
150k
    sfmt_init_gen_rand(&sfmt[i], sfmt_genrand_uint32(&ctx->sfmt));
272
295
  const ccv_cnnp_random_jitter_t random_jitter = ctx->random_jitter;
273
295
  assert(random_jitter.resize.min > 0);
274
295
  assert(random_jitter.resize.max >= random_jitter.resize.min);
275
295
  parallel_for(i, batch_size) {
276
0
    if (data[i])
277
129k
      ccv_matrix_free(data[i]);
278
0
    ccv_dense_matrix_t* const input = (ccv_dense_matrix_t*)column_data[0][i];
279
295
    const int resize = ccv_clamp((int)(sfmt_genrand_real1(&sfmt[i]) * (random_jitter.resize.max - random_jitter.resize.min) + 0.5) + random_jitter.resize.min, random_jitter.resize.min, random_jitter.resize.max);
280
0
    int resize_rows = ccv_max(resize, (int)(input->rows * (float)resize / input->cols + 0.5));
281
0
    int resize_cols = ccv_max(resize, (int)(input->cols * (float)resize / input->rows + 0.5));
282
0
    if (random_jitter.aspect_ratio > 0)
283
1
    {
284
1
      const float aspect_ratio = sqrtf(_ccv_cnnp_random_logexp(&sfmt[i],  random_jitter.aspect_ratio));
285
1
      resize_rows = (int)(resize_rows * aspect_ratio + 0.5);
286
1
      resize_cols = (int)(resize_cols / aspect_ratio + 0.5);
287
1
    }
288
0
    if (random_jitter.resize.roundup > 0)
289
0
    {
290
0
      const int roundup = random_jitter.resize.roundup;
291
0
      const int roundup_2 = roundup / 2;
292
0
      resize_rows = (resize_rows + roundup_2) / roundup * roundup;
293
0
      resize_cols = (resize_cols + roundup_2) / roundup * roundup;
294
0
    }
295
146k
    const int need_crop = (
random_jitter.size.cols > 00
&& random_jitter.size.rows > 0 &&
296
146k
      
(146k
(146k
resize_cols != random_jitter.size.cols146k
|| resize_rows != random_jitter.size.rows) ||
297
146k
       (random_jitter.offset.x != 0 || 
random_jitter.offset.y != 00
)));
298
0
    int cropped = 0, crop_x = 0, crop_y = 0;
299
0
    ccv_dense_matrix_t* sliced = 0;
300
0
    if (need_crop)
301
148k
    {
302
148k
      // Compute crop x, y.
303
148k
      crop_x = random_jitter.center_crop ?
304
0
        (resize_cols - random_jitter.size.cols + 1) / 2 : // Otherwise, random select x.
305
148k
        (int)(sfmt_genrand_real1(&sfmt[i]) * (resize_cols - random_jitter.size.cols + 1));
306
148k
      crop_x = ccv_clamp(crop_x,
307
148k
        ccv_min(0, resize_cols - random_jitter.size.cols),
308
148k
        ccv_max(0, resize_cols - random_jitter.size.cols));
309
148k
      crop_y = random_jitter.center_crop ?
310
0
        (resize_rows - random_jitter.size.rows + 1) / 2 : // Otherwise, random select y.
311
148k
        (int)(sfmt_genrand_real1(&sfmt[i]) * (resize_rows - random_jitter.size.rows + 1));
312
148k
      crop_y = ccv_clamp(crop_y,
313
148k
        ccv_min(0, resize_rows - random_jitter.size.rows),
314
148k
        ccv_max(0, resize_rows - random_jitter.size.rows));
315
148k
      if (random_jitter.offset.x != 0)
316
146k
        crop_x += sfmt_genrand_real1(&sfmt[i]) * random_jitter.offset.x * 2 - random_jitter.offset.x;
317
148k
      if (random_jitter.offset.y != 0)
318
144k
        crop_y += sfmt_genrand_real1(&sfmt[i]) * random_jitter.offset.y * 2 - random_jitter.offset.y;
319
148k
      // If we can fill in the whole view (not introducing any 0 padding), we can first crop and then scale down / up.
320
148k
      if (resize_cols >= random_jitter.size.cols && 
resize_rows >= random_jitter.size.rows143k
)
321
143k
      {
322
143k
        const float scale_x = (float)input->cols / resize_cols;
323
143k
        const float scale_y = (float)input->rows / resize_rows;
324
143k
        const int slice_cols = (int)(random_jitter.size.cols * scale_x + 0.5);
325
143k
        const int slice_rows = (int)(random_jitter.size.rows * scale_y + 0.5);
326
143k
        assert(slice_cols <= input->cols);
327
144k
        assert(slice_rows <= input->rows);
328
144k
        const int x = ccv_clamp((int)(crop_x * scale_x + 0.5), 0, input->cols - slice_cols);
329
144k
        const int y = ccv_clamp((int)(crop_y * scale_y + 0.5), 0, input->rows - slice_rows);
330
144k
        ccv_slice(input, (ccv_matrix_t**)&sliced, 0, y, x, slice_rows, slice_cols);
331
144k
        resize_cols = random_jitter.size.cols;
332
144k
        resize_rows = random_jitter.size.rows;
333
144k
        cropped = 1;
334
144k
      } else
335
4.50k
        sliced = input;
336
148k
    } else
337
18.4E
      sliced = input;
338
826
    ccv_dense_matrix_t* resized = 0;
339
826
    // Resize.
340
145k
    if (
sliced->rows >= resize_rows826
&& sliced->cols >= resize_cols)
341
145k
    {
342
145k
      // If we can fill in the whole view, we can first crop and then scale down / up.
343
145k
      ccv_resample(sliced, &resized, CCV_32F, resize_rows, resize_cols, CCV_INTER_AREA);
344
18.4E
    } else if (sliced->rows != resize_rows || 
sliced->cols != resize_cols0
) {
345
0
      ccv_resample(sliced, &resized, CCV_32F, resize_rows, resize_cols, CCV_INTER_CUBIC);
346
18.4E
    } else {
347
18.4E
      ccv_shift(sliced, (ccv_matrix_t**)&resized, CCV_32F, 0, 0); // converting to 32f
348
18.4E
    }
349
826
    if (sliced != input)
350
149k
      ccv_matrix_free(sliced);
351
149k
    if (
random_jitter.symmetric826
&& (sfmt_genrand_uint32(&sfmt[i]) & 1) == 0)
352
74.8k
      ccv_flip(resized, &resized, 0, CCV_FLIP_X);
353
826
    _ccv_cnnp_image_manip(resized, random_jitter, &sfmt[i]);
354
826
    // Apply normalization. Slice will introduce 0 padding, which won't be correct before normalization.
355
826
    if (random_jitter.normalize.mean[0] != 0 || 
random_jitter.normalize.std[0] != 10
||
356
826
      
random_jitter.normalize.mean[1] != 00
||
random_jitter.normalize.std[1] != 10
||
357
826
      
random_jitter.normalize.mean[2] != 00
||
random_jitter.normalize.std[2] != 10
)
358
146k
      _ccv_cnnp_normalize(resized, random_jitter.normalize.mean, random_jitter.normalize.std);
359
826
    // If we haven't cropped in previous step (likely because we have some fill-ins due to the resize down too much).
360
826
    // Do the crop now.
361
826
    ccv_dense_matrix_t* patch = 0;
362
826
    if (!cropped && 
need_crop1
)
363
1
    {
364
1
      ccv_slice(resized, (ccv_matrix_t**)&patch, CCV_32F, crop_y, crop_x, random_jitter.size.rows, random_jitter.size.cols);
365
1
      ccv_matrix_free(resized);
366
1
    } else
367
825
      patch = resized;
368
826
    assert(!ccv_any_nan(patch));
369
146k
    data[i] = patch;
370
147k
  } parallel_endfor
371
295
  ccfree(sfmt);
372
295
}
373
374
int ccv_cnnp_dataframe_image_random_jitter(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const int datatype, const ccv_cnnp_random_jitter_t random_jitter, const char* name)
375
4
{
376
4
  assert(datatype == CCV_32F);
377
4
  ccv_cnnp_random_jitter_context_t* const random_jitter_context = (ccv_cnnp_random_jitter_context_t*)ccmalloc(sizeof(ccv_cnnp_random_jitter_context_t));
378
4
  if (random_jitter.seed)
379
4
    sfmt_init_gen_rand(&random_jitter_context->sfmt, (uint32_t)random_jitter.seed);
380
0
  else
381
0
    sfmt_init_gen_rand(&random_jitter_context->sfmt, (uint32_t)(uintptr_t)dataframe);
382
4
  random_jitter_context->datatype = datatype;
383
4
  random_jitter_context->random_jitter = random_jitter;
384
4
  int i;
385
4
  // The std in the random jitter should be inv_std.
386
16
  for (i = 0; i < 3; 
i++12
)
387
12
    random_jitter_context->random_jitter.normalize.std[i] = random_jitter_context->random_jitter.normalize.std[i] ? 
1. / random_jitter_context->random_jitter.normalize.std[i]3
:
19
;
388
4
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_random_jitter, 0, _ccv_cnnp_image_deinit, COLUMN_ID_LIST(column_idx), random_jitter_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
389
4
}
390
391
typedef struct {
392
  int range;
393
  int datatype;
394
  int format;
395
  float onval;
396
  float offval;
397
  off_t structof;
398
} ccv_cnnp_one_hot_context_t;
399
400
static void _ccv_cnnp_one_hot(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
401
305
{
402
305
  ccv_cnnp_one_hot_context_t* const one_hot = (ccv_cnnp_one_hot_context_t*)context;
403
305
  ccv_nnc_tensor_param_t params = {
404
305
    .datatype = one_hot->datatype,
405
305
    .type = CCV_TENSOR_CPU_MEMORY,
406
305
    .format = one_hot->format,
407
305
    .dim = {
408
305
      one_hot->range,
409
305
    },
410
305
  };
411
305
  parallel_for(i, batch_size) {
412
0
    int j;
413
0
    const int label = *(const int*)((const char*)column_data[0][i] + one_hot->structof);
414
0
    if (!data[i])
415
3.00k
      data[i] = ccv_nnc_tensor_new(0, params, 0);
416
0
    ccv_nnc_tensor_t* const tensor = (ccv_nnc_tensor_t*)data[i];
417
0
    assert(label >= 0 && label < one_hot->range);
418
81.6k
    if (tensor->info.datatype == CCV_32F)
419
38.5k
      
for (j = 0; 13.0k
j < one_hot->range;
j++25.5k
)
420
25.5k
        tensor->data.f32[j] = (j == label) ? 
one_hot->onval7.82k
:
one_hot->offval17.7k
;
421
68.6k
    else if (tensor->info.datatype == CCV_16F)
422
80.2k
      
for (j = 0; 50.8k
j < one_hot->range;
j++29.3k
)
423
29.3k
        ccv_float_to_half_precision((j == label) ? 
&one_hot->onval28.0k
:
&one_hot->offval1.35k
, (uint16_t*)(tensor->data.f16 + j), 1);
424
17.7k
    else
425
17.7k
      { assert(0); }
426
81.9k
  } parallel_endfor
427
305
}
428
429
int ccv_cnnp_dataframe_one_hot(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const off_t structof, const int range, const float onval, const float offval, const int datatype, const int format, const char* name)
430
7
{
431
7
  assert(datatype == CCV_32F || datatype == CCV_16F);
432
7
  ccv_cnnp_one_hot_context_t* const one_hot = (ccv_cnnp_one_hot_context_t*)ccmalloc(sizeof(ccv_cnnp_one_hot_context_t));
433
7
  one_hot->range = range;
434
7
  one_hot->datatype = datatype;
435
7
  one_hot->format = format;
436
7
  one_hot->onval = onval;
437
7
  one_hot->offval = offval;
438
7
  one_hot->structof = structof;
439
7
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_one_hot, 0, _ccv_cnnp_tensor_deinit, COLUMN_ID_LIST(column_idx), one_hot, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
440
7
}
441
442
typedef struct {
443
  int from_dt;
444
  int to_dt;
445
  int format;
446
  off_t structof;
447
} ccv_cnnp_copy_scalar_context_t;
448
449
static void _ccv_cnnp_copy_scalar(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
450
10
{
451
10
  ccv_cnnp_copy_scalar_context_t* const copy_scalar = (ccv_cnnp_copy_scalar_context_t*)context;
452
10
  ccv_nnc_tensor_param_t params = {
453
10
    .datatype = copy_scalar->to_dt,
454
10
    .type = CCV_TENSOR_CPU_MEMORY,
455
10
    .format = copy_scalar->format,
456
10
    .dim = {1},
457
10
  };
458
10
  parallel_for(i, batch_size) {
459
0
    const ccv_numeric_data_t value = {
460
0
      .u8 = (unsigned char *)((const char*)column_data[0][i] + copy_scalar->structof),
461
0
    };
462
0
    if (!data[i])
463
236
      data[i] = ccv_nnc_tensor_new(0, params, 0);
464
0
    ccv_nnc_tensor_t* const tensor = (ccv_nnc_tensor_t*)data[i];
465
0
    if (copy_scalar->from_dt == CCV_32S)
466
242
    {
467
242
      if (tensor->info.datatype == CCV_32F)
468
242
        tensor->data.f32[0] = value.i32[0];
469
0
      else if (tensor->info.datatype == CCV_16F) {
470
0
        float fval = value.i32[0];
471
0
        ccv_float_to_half_precision(&fval, (uint16_t*)tensor->data.f16, 1);
472
0
      }
473
18.4E
    } else if (copy_scalar->from_dt == CCV_32F) {
474
0
      if (tensor->info.datatype == CCV_32F)
475
0
        tensor->data.f32[0] = value.f32[0];
476
0
      else if (tensor->info.datatype == CCV_16F)
477
0
        ccv_float_to_half_precision(value.f32, (uint16_t*)tensor->data.f16, 1);
478
18.4E
    } else if (copy_scalar->from_dt == CCV_16F) {
479
0
      if (tensor->info.datatype == CCV_32F)
480
0
        ccv_half_precision_to_float((uint16_t*)value.f16, tensor->data.f32, 1);
481
0
      else if (tensor->info.datatype == CCV_16F)
482
0
        tensor->data.f16[0] = value.f16[0];
483
0
    }
484
10
  } parallel_endfor
485
10
}
486
487
CCV_WARN_UNUSED(int) ccv_cnnp_dataframe_copy_scalar(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const off_t structof, const int from_dt, const int to_dt, const int format, const char* name)
488
3
{
489
3
  assert(from_dt == CCV_32S || from_dt == CCV_32F || from_dt == CCV_16F);
490
3
  assert(to_dt == CCV_32F || to_dt == CCV_16F);
491
3
  ccv_cnnp_copy_scalar_context_t* const copy_scalar = (ccv_cnnp_copy_scalar_context_t*)ccmalloc(sizeof(ccv_cnnp_copy_scalar_context_t));
492
3
  copy_scalar->from_dt = from_dt;
493
3
  copy_scalar->to_dt = to_dt;
494
3
  copy_scalar->format = format;
495
3
  copy_scalar->structof = structof;
496
3
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_copy_scalar, 0, _ccv_cnnp_tensor_deinit, COLUMN_ID_LIST(column_idx), copy_scalar, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
497
3
}
498
499
// MARK - Matrix of Ones
500
501
typedef struct {
502
  ccv_cnnp_dataframe_tuple_t tuple;
503
  int variable_size;
504
  int max_length;
505
} ccv_cnnp_one_squared_context_t;
506
507
static void _ccv_cnnp_one_squared(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
508
12
{
509
12
  ccv_cnnp_one_squared_context_t* const ones = (ccv_cnnp_one_squared_context_t*)context;
510
12
  assert(ones->tuple.size == column_size);
511
12
  const int max_length = ones->max_length;
512
12
  if (ones->variable_size)
513
3
  {
514
3
    parallel_for(i, batch_size) {
515
0
      ccv_nnc_tensor_t* const first_seq = (ccv_nnc_tensor_t*)column_data[0][i];
516
0
      assert(first_seq->info.datatype == CCV_32S);
517
3
      const int first_len = ccv_nnc_tensor_count(first_seq->info);
518
3
      ccv_nnc_tensor_t** outputs = data[i];
519
3
      if (!outputs)
520
3
        outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(column_size, sizeof(ccv_nnc_tensor_t*)));
521
3
      int k;
522
12
      for (k = 0; k < column_size; 
k++9
)
523
9
        if (!outputs[k])
524
12
          
outputs[k] = ccv_nnc_tensor_new(0, 9
CPU_TENSOR_NHWC(32S, first_len, max_length, max_length), 0);
525
3
      int max_len = 0;
526
12
      for (k = 0; k < column_size; 
k++9
)
527
9
      {
528
9
        ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[k][i];
529
9
        assert(seq->info.datatype == CCV_32S);
530
9
        const int len = ccv_nnc_tensor_count(seq->info);
531
9
        assert(len == first_len);
532
9
        const int* const ia = seq->data.i32;
533
9
        int l;
534
523
        for (l = 0; l < len; 
l++514
)
535
514
          max_len = ccv_max(max_len, ia[l]);
536
9
      }
537
3
      assert(max_len <= max_length);
538
3
      parallel_for(c, column_size) {
539
0
        ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[c][i];
540
0
        assert(seq->info.datatype == CCV_32S);
541
9
        const int len = ccv_nnc_tensor_count(seq->info);
542
9
        assert(len == first_len);
543
9
        ccv_nnc_tensor_t* tensor = outputs[c];
544
12
        tensor = ccv_nnc_tensor_resize(tensor, CPU_TENSOR_NHWC(32S, len, max_len, max_len));
545
9
        assert(outputs[c] == tensor); // Since we allocated with max_length, this cannot be reallocated.
546
9
        const int* const ia = seq->data.i32;
547
9
        parallel_for(j, len) {
548
0
          int x, y;
549
0
          int seq_len = ia[j];
550
0
          int* ib = tensor->data.i32 + j * max_len * max_len;
551
97.7k
          for (y = 0; y < seq_len; y++)
552
97.7k
          {
553
26.5M
            for (x = 0; x < seq_len; 
x++26.4M
)
554
26.4M
              ib[x] = 1;
555
23.3M
            for (x = seq_len; x < max_len; 
x++23.2M
)
556
23.2M
              ib[x] = 0;
557
97.7k
            ib += max_len;
558
97.7k
          }
559
0
          if (seq_len < max_len)
560
509
            memset(ib, 0, sizeof(int) * max_len * (max_len - seq_len));
561
9
        } parallel_endfor
562
12
      } parallel_endfor
563
6
    } parallel_endfor
564
9
  } else {
565
9
    parallel_for(i, batch_size) {
566
0
      ccv_nnc_tensor_t** outputs = data[i];
567
0
      ccv_nnc_tensor_t* const first_seq = (ccv_nnc_tensor_t*)column_data[0][i];
568
0
      assert(first_seq->info.datatype == CCV_32S);
569
9
      const int first_len = ccv_nnc_tensor_count(first_seq->info);
570
9
      if (!outputs)
571
9
        outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(column_size, sizeof(ccv_nnc_tensor_t*)));
572
9
      int k;
573
18
      for (k = 0; k < column_size; 
k++9
)
574
9
        if (!outputs[k])
575
18
          
outputs[k] = ccv_nnc_tensor_new(0, 9
CPU_TENSOR_NHWC(32S, first_len, max_length, max_length), 0);
576
9
      parallel_for(c, column_size) {
577
0
        ccv_nnc_tensor_t* const tensor = outputs[c];
578
0
        ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[c][i];
579
0
        assert(seq->info.datatype == CCV_32S);
580
9
        const int len = ccv_nnc_tensor_count(seq->info);
581
9
        assert(len == first_len);
582
9
        const int* const ia = seq->data.i32;
583
9
        parallel_for(j, len) {
584
0
          int x, y;
585
0
          int seq_len = ia[j];
586
0
          int* ib = tensor->data.i32 + j * max_length * max_length;
587
97.7k
          for (y = 0; y < seq_len; y++)
588
97.7k
          {
589
26.5M
            for (x = 0; x < seq_len; 
x++26.4M
)
590
26.4M
              ib[x] = 1;
591
23.6M
            for (x = seq_len; x < max_length; 
x++23.5M
)
592
23.5M
              ib[x] = 0;
593
97.7k
            ib += max_length;
594
97.7k
          }
595
0
          if (seq_len < max_length)
596
514
            memset(ib, 0, sizeof(int) * max_length * (max_length - seq_len));
597
9
        } parallel_endfor
598
18
      } parallel_endfor
599
18
    } parallel_endfor
600
9
  }
601
12
}
602
603
CCV_WARN_UNUSED(int) ccv_cnnp_dataframe_one_squared(ccv_cnnp_dataframe_t* const dataframe,  const int* const column_idxs, const int column_idx_size, const int variable_size, const int max_length, const char* name)
604
12
{
605
12
  assert(max_length > 0);
606
12
  assert(variable_size == 0 || variable_size == 1);
607
12
  ccv_cnnp_one_squared_context_t* const ones = (ccv_cnnp_one_squared_context_t*)ccmalloc(sizeof(ccv_cnnp_one_squared_context_t));
608
12
  ones->tuple.size = column_idx_size;
609
12
  ones->variable_size = variable_size;
610
12
  ones->max_length = max_length;
611
12
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_one_squared, 0, _ccv_cnnp_tensor_list_deinit, column_idxs, column_idx_size, ones, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
612
12
}
613
614
// MARK - Truncate Matrix
615
616
// Map function backing ccv_cnnp_dataframe_truncate. The incoming columns are laid
// out as [tuple_size vector columns][tuple_size length columns]. For each row it
// scans every 32-bit integer length tensor to find the maximum sequence length,
// then truncates each paired 2-D matrix (rows x ori_len) down to (rows x max_len)
// by copying the leading max_len elements of every row.
static void _ccv_cnnp_truncate(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
	assert(column_size >= 2);
	assert(column_size % 2 == 0); // Vector columns and length columns must pair up.
	const int tuple_size = column_size / 2;
	ccv_cnnp_dataframe_tuple_t* const tuple = (ccv_cnnp_dataframe_tuple_t*)context;
	assert(tuple->size == tuple_size);
	parallel_for(i, batch_size) {
		int k;
		// All length tensors for this row must agree on element count (first_len).
		ccv_nnc_tensor_t* const first_seq = (ccv_nnc_tensor_t*)column_data[tuple_size][i];
		assert(first_seq->info.datatype == CCV_32S);
		const int first_len = ccv_nnc_tensor_count(first_seq->info);
		int max_len = 0;
		// Find the overall maximum sequence length across every length column.
		for (k = 0; k < tuple_size; k++)
		{
			ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[tuple_size + k][i];
			assert(seq->info.datatype == CCV_32S);
			const int len = ccv_nnc_tensor_count(seq->info);
			assert(len == first_len);
			const int* const ia = seq->data.i32;
			int l;
			for (l = 0; l < len; l++)
				max_len = ccv_max(max_len, ia[l]);
		}
		// Derive the truncated shape from the first vector column: keep dim[0]
		// (the row count) and shrink dim[1] to max_len.
		ccv_nnc_tensor_t* const first_inp = (ccv_nnc_tensor_t*)column_data[0][i];
		ccv_nnc_tensor_param_t first_params = first_inp->info;
		assert(first_params.dim[0] == first_len);
		assert(max_len <= first_params.dim[1]);
		first_params.dim[1] = max_len;
		// Lazily allocate (or resize) the per-row output tensor list; the list is
		// reclaimed by _ccv_cnnp_tensor_list_deinit.
		ccv_nnc_tensor_t** outputs = data[i];
		if (!outputs)
			outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(tuple_size, sizeof(ccv_nnc_tensor_t*)));
		for (k = 0; k < tuple_size; k++)
		{
			if (!outputs[k])
				outputs[k] = ccv_nnc_tensor_new(0, first_params, 0);
			else
				outputs[k] = ccv_nnc_tensor_resize(outputs[k], first_params);
		}
		parallel_for(c, tuple_size) {
			ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[tuple_size + c][i];
			assert(seq->info.datatype == CCV_32S);
			const int len = ccv_nnc_tensor_count(seq->info);
			ccv_nnc_tensor_t* const inp = (ccv_nnc_tensor_t*)column_data[c][i];
			ccv_nnc_tensor_param_t params = inp->info;
			assert(params.dim[0] == len);
			assert(first_len == len);
			assert(max_len <= params.dim[1]);
			assert(params.dim[2] == 0); // Input must be a 2-D matrix.
			const int ori_len = params.dim[1];
			ccv_nnc_tensor_t* const out = outputs[c];
			// Copy row-by-row through byte pointers: la / lb are the source and
			// destination row strides in bytes for the tensor's datatype.
			uint8_t* const ua = inp->data.u8;
			uint8_t* const ub = out->data.u8;
			size_t la = CCV_GET_DATA_TYPE_SIZE(params.datatype) * ori_len;
			size_t lb = CCV_GET_DATA_TYPE_SIZE(params.datatype) * max_len;
			parallel_for(j, len) {
				memcpy(ub + lb * j, ua + la * j, lb); // Keep only the first max_len elements of row j.
			} parallel_endfor
		} parallel_endfor
	} parallel_endfor
}
677
678
// Registers a derived column that truncates each matrix column (vec_idxs) down to
// the maximum length recorded in its paired length column (len_idxs). The two
// index lists are concatenated into one so _ccv_cnnp_truncate sees vectors first
// and lengths second. Returns the new column index.
int ccv_cnnp_dataframe_truncate(ccv_cnnp_dataframe_t* const dataframe, const int* const vec_idxs, const int vec_idx_size, const int* const len_idxs, const int len_idx_size, const char* name)
{
	const int total_idx_size = vec_idx_size + len_idx_size;
	assert(total_idx_size > 0);
	assert(vec_idx_size == len_idx_size); // Every vector column needs exactly one length column.
	int total_idxs[total_idx_size];
	int i;
	for (i = 0; i < vec_idx_size; i++)
		total_idxs[i] = vec_idxs[i];
	for (i = 0; i < len_idx_size; i++)
		total_idxs[vec_idx_size + i] = len_idxs[i];
	// The context only needs to carry the tuple size; ccfree releases it.
	ccv_cnnp_dataframe_tuple_t* const truncate_context = (ccv_cnnp_dataframe_tuple_t*)ccmalloc(sizeof(ccv_cnnp_dataframe_tuple_t));
	truncate_context->size = vec_idx_size;
	return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_truncate, 0, _ccv_cnnp_tensor_list_deinit, total_idxs, total_idx_size, truncate_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
690
691
// MARK - Batching
692
693
// Context for the batching (combine) column: records how many batched output
// tensors exist, the target tensor format, and the batching geometry.
typedef struct {
	ccv_cnnp_dataframe_tuple_t tuple; // tuple.size == column_idx_size * group_count, the total number of output tensors.
	int format; // Target format for batched tensors (CCV_TENSOR_FORMAT_NCHW or NHWC).
	int batch_count; // How many input rows are folded into each batched tensor.
	int group_count; // How many consecutive batches are emitted together as one tuple.
} ccv_cnnp_batch_context_t;
699
700
// Sample function backing ccv_cnnp_dataframe_combine_new: folds
// batch_count * group_count consecutive input rows into group_count tuples of
// batched tensors. On first invocation it allocates the output tensors (adding a
// leading batch dimension and switching to the requested format); on later
// invocations it refills the same tensors in place.
static void _ccv_cnnp_combine_new(void* const* const input_data, const int input_size, void** const output_data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
	ccv_cnnp_batch_context_t* const batch = (ccv_cnnp_batch_context_t*)context;
	const int output_tuple_size = batch->tuple.size;
	const int batch_count = batch->batch_count;
	const int group_count = batch->group_count;
	const int input_tuple_size = output_tuple_size / group_count;
	int i, j, k;
	assert(input_size > 0);
	if (!output_data[0])
	{
		// First call: derive each output tensor's shape from the first input row.
		ccv_nnc_tensor_t** const inputs = (ccv_nnc_tensor_t**)input_data[0];
		ccv_nnc_tensor_t** const tensors = (ccv_nnc_tensor_t**)(output_data[0] = ccmalloc(sizeof(ccv_nnc_tensor_t*) * output_tuple_size));
		for (i = 0; i < group_count; i++)
			for (j = 0; j < input_tuple_size; j++)
			{
				ccv_nnc_tensor_param_t params = inputs[j]->info;
				assert(params.datatype == CCV_32F || params.datatype == CCV_32S || params.datatype == CCV_16F); // Only 32-bit float, 32-bit int and half precision are supported.
				assert(params.format == CCV_TENSOR_FORMAT_NHWC || params.format == CCV_TENSOR_FORMAT_NCHW);
				params.format = batch->format;
				// Special-case for dim count is 3 and 1, in these two cases, the N is not provided.
				if (batch->format == inputs[j]->info.format)
				{
					// Same format: just shift all dims right by one to make room for N.
					const int nd = ccv_nnc_tensor_nd(params.dim);
					memset(params.dim, 0, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
					memcpy(params.dim + 1, inputs[j]->info.dim, sizeof(int) * nd);
				} else {
					const int nd = ccv_nnc_tensor_nd(params.dim);
					if (nd < 3)
					{
						// Too few dims for a channel axis to matter; shift as-is.
						memset(params.dim, 0, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
						memcpy(params.dim + 1, inputs[j]->info.dim, sizeof(int) * nd);
					} else if (nd >= 3) {
						// Format conversion: rebuild dims with the channel axis moved
						// to its position in the target layout.
						memset(params.dim, 0, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
						const int hw = ccv_nnc_tensor_hw(inputs[j]->info, nd);
						if (batch->format == CCV_TENSOR_FORMAT_NCHW)
						{
							params.dim[1] = ccv_nnc_tensor_get_c(inputs[j]->info);
							for (k = 0; k < CCV_NNC_MAX_DIM; k++)
								params.dim[k + 2] = inputs[j]->info.dim[k + hw];
						} else {
							params.dim[CCV_NNC_MAX_DIM + 1] = ccv_nnc_tensor_get_c(inputs[j]->info);
							for (k = 0; k < CCV_NNC_MAX_DIM; k++)
								params.dim[k + 1] = inputs[j]->info.dim[k + hw];
						}
					}
				}
				params.dim[0] = batch_count; // Set the batch count now.
				tensors[i * input_tuple_size + j] = ccv_nnc_tensor_new(0, params, 0);
			}
	}
	// Fill every output tensor from its batch_count source rows, converting layout
	// when the input and output formats differ.
	for (i = 0; i < group_count; i++)
		for (j = 0; j < input_tuple_size; j++)
		{
			ccv_nnc_tensor_t* const output = ((ccv_nnc_tensor_t**)output_data[0])[i * input_tuple_size + j];
			parallel_for(k, batch_count) {
				// The modulo wraps around when fewer than batch_count * group_count rows are available.
				ccv_nnc_tensor_t* const input = ((ccv_nnc_tensor_t**)input_data[(k + i * batch_count) % input_size])[j];
				const size_t tensor_count = ccv_nnc_tensor_count(input->info);
				if (input->info.datatype == CCV_32F)
				{
					float* const ap = input->data.f32;
					float* const bp = output->data.f32 + k * tensor_count; // Slot k of the batch.
					if (input->info.format == output->info.format)
						memcpy(bp, ap, sizeof(float) * tensor_count);
					else {
						// Do a simple format conversion.
						const int c = ccv_nnc_tensor_get_c(input->info);
						assert(c > 0);
						const size_t hw_count = tensor_count / c;
						size_t x;
						int y;
						if (input->info.format == CCV_TENSOR_FORMAT_NHWC && output->info.format == CCV_TENSOR_FORMAT_NCHW)
							for (x = 0; x < hw_count; x++)
								for (y = 0; y < c; y++)
									bp[y * hw_count + x] = ap[x * c + y]; // NHWC -> NCHW transpose.
						else if (input->info.format == CCV_TENSOR_FORMAT_NCHW && output->info.format == CCV_TENSOR_FORMAT_NHWC)
							for (x = 0; x < hw_count; x++)
								for (y = 0; y < c; y++)
									bp[x * c + y] = ap[y * hw_count + x]; // NCHW -> NHWC transpose.
					}
				} else if (input->info.datatype == CCV_32S) {
					// Same copy / transpose logic for 32-bit integers.
					int* const ap = input->data.i32;
					int* const bp = output->data.i32 + k * tensor_count;
					if (input->info.format == output->info.format)
						memcpy(bp, ap, sizeof(int) * tensor_count);
					else {
						// Do a simple format conversion.
						const int c = ccv_nnc_tensor_get_c(input->info);
						assert(c > 0);
						const size_t hw_count = tensor_count / c;
						size_t x;
						int y;
						if (input->info.format == CCV_TENSOR_FORMAT_NHWC && output->info.format == CCV_TENSOR_FORMAT_NCHW)
							for (x = 0; x < hw_count; x++)
								for (y = 0; y < c; y++)
									bp[y * hw_count + x] = ap[x * c + y];
						else if (input->info.format == CCV_TENSOR_FORMAT_NCHW && output->info.format == CCV_TENSOR_FORMAT_NHWC)
							for (x = 0; x < hw_count; x++)
								for (y = 0; y < c; y++)
									bp[x * c + y] = ap[y * hw_count + x];
					}
				} else if (input->info.datatype == CCV_16F) {
					// Same copy / transpose logic for half precision floats.
					ccv_float16_t* const ap = input->data.f16;
					ccv_float16_t* const bp = output->data.f16 + k * tensor_count;
					if (input->info.format == output->info.format)
						memcpy(bp, ap, sizeof(ccv_float16_t) * tensor_count);
					else {
						// Do a simple format conversion.
						const int c = ccv_nnc_tensor_get_c(input->info);
						assert(c > 0);
						const size_t hw_count = tensor_count / c;
						size_t x;
						int y;
						if (input->info.format == CCV_TENSOR_FORMAT_NHWC && output->info.format == CCV_TENSOR_FORMAT_NCHW)
							for (x = 0; x < hw_count; x++)
								for (y = 0; y < c; y++)
									bp[y * hw_count + x] = ap[x * c + y];
						else if (input->info.format == CCV_TENSOR_FORMAT_NCHW && output->info.format == CCV_TENSOR_FORMAT_NHWC)
							for (x = 0; x < hw_count; x++)
								for (y = 0; y < c; y++)
									bp[x * c + y] = ap[y * hw_count + x];
					}
				} else {
					assert(0); // Unsupported datatype; guarded at allocation time above.
				}
			} parallel_endfor
		}
}
828
829
static void _ccv_cnnp_combine_deinit(void* const self, void* const context)
830
14
{
831
14
  ccv_cnnp_batch_context_t* const batch = (ccv_cnnp_batch_context_t*)context;
832
14
  ccv_nnc_tensor_t** const tensors = (ccv_nnc_tensor_t**)self;
833
14
  const int size = batch->tuple.size;
834
14
  int i;
835
109
  for (i = 0; i < size; 
i++95
)
836
95
    ccv_nnc_tensor_free(tensors[i]);
837
14
  ccfree(tensors);
838
14
}
839
840
// Creates a derived dataframe that batches the given columns: every
// batch_count * group_count rows of the source become one row containing
// group_count tuples of batched tensors in the requested format. The batch
// context is owned by the sampled dataframe and released through ccfree.
ccv_cnnp_dataframe_t* ccv_cnnp_dataframe_combine_new(ccv_cnnp_dataframe_t* const dataframe, const int* const column_idxs, const int column_idx_size, const int batch_count, const int group_count, const int format)
{
	assert(format == CCV_TENSOR_FORMAT_NCHW || format == CCV_TENSOR_FORMAT_NHWC);
	assert(column_idx_size >= 1);
	assert(batch_count > 0);
	assert(group_count > 0);
	// First gather the selected columns into a single tuple column to sample from.
	const int derived = ccv_cnnp_dataframe_make_tuple(dataframe, column_idxs, column_idx_size, 0);
	ccv_cnnp_batch_context_t* const combine_context = (ccv_cnnp_batch_context_t*)ccmalloc(sizeof(ccv_cnnp_batch_context_t));
	combine_context->format = format;
	combine_context->batch_count = batch_count;
	combine_context->group_count = group_count;
	combine_context->tuple.size = column_idx_size * group_count; // Total batched tensors per output row.
	return ccv_cnnp_dataframe_sample_new(dataframe, _ccv_cnnp_combine_new, _ccv_cnnp_combine_deinit, derived, batch_count * group_count, combine_context, (ccv_cnnp_column_data_context_deinit_f)ccfree);
}