Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_dataframe_addons.c

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_cnnp_dataframe.h"
#include "3rdparty/sfmt/SFMT.h"

// MARK - Create Dataframe from Array

static void _ccv_cnnp_array_enum(const int column_idx, const int* const row_idxs, const int row_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  int i;
  ccv_array_t* const array = (ccv_array_t*)context;
  for (i = 0; i < row_size; i++)
    data[i] = ccv_array_get(array, row_idxs[i]);
}

ccv_cnnp_dataframe_t* ccv_cnnp_dataframe_from_array_new(ccv_array_t* const array)
{
  const ccv_cnnp_column_data_t array_column_data = {
    .data_enum = _ccv_cnnp_array_enum,
    .context = array
  };
  return ccv_cnnp_dataframe_new(&array_column_data, 1, array->rnum);
}
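
// Editorial usage sketch (not part of the original file): a minimal round trip
// through ccv_cnnp_dataframe_from_array_new, assuming the public iterator API
// declared in ccv_nnc.h (ccv_cnnp_dataframe_iter_new / _next / _free).
static void _example_dataframe_from_array(void)
{
  ccv_array_t* const array = ccv_array_new(sizeof(float), 1, 0);
  float f = 0.5;
  ccv_array_push(array, &f);
  ccv_cnnp_dataframe_t* const dataframe = ccv_cnnp_dataframe_from_array_new(array);
  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(dataframe, COLUMN_ID_LIST(0));
  void* datum = 0;
  ccv_cnnp_dataframe_iter_next(iter, &datum, 1, 0); // datum points at the element stored in the array.
  assert(*(float*)datum == 0.5);
  ccv_cnnp_dataframe_iter_free(iter);
  ccv_cnnp_dataframe_free(dataframe);
  ccv_array_free(array);
}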

typedef struct {
  ccv_cnnp_dataframe_tuple_t tuple;
  int tensor_offset;
  int device_id;
} ccv_cnnp_copy_to_gpu_context_t;

// MARK - Copy Tensors from CPU to GPU

static void _ccv_cnnp_tensor_list_deinit(void* const data, void* const context)
{
  ccv_cnnp_dataframe_tuple_t* const tuple = (ccv_cnnp_dataframe_tuple_t*)context;
  ccv_nnc_tensor_t** const tensor_list = (ccv_nnc_tensor_t**)data;
  int i;
  for (i = 0; i < tuple->size; i++)
    if (tensor_list[i])
      ccv_nnc_tensor_free(tensor_list[i]);
  ccfree(tensor_list);
}

static void _ccv_cnnp_copy_to_gpu(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  const ccv_cnnp_copy_to_gpu_context_t* const copy_to_gpu_context = (ccv_cnnp_copy_to_gpu_context_t*)context;
  int i, j;
  for (i = 0; i < batch_size; i++)
  {
    ccv_nnc_tensor_t* const* const inputs = (ccv_nnc_tensor_t* const*)column_data[0][i] + copy_to_gpu_context->tensor_offset;
    ccv_nnc_tensor_t** outputs = (ccv_nnc_tensor_t**)data[i];
    if (!outputs)
      outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(copy_to_gpu_context->tuple.size, sizeof(ccv_nnc_tensor_t*)));
    for (j = 0; j < copy_to_gpu_context->tuple.size; j++)
    {
      ccv_nnc_tensor_param_t params = inputs[j]->info;
      params.type &= ~CCV_TENSOR_CPU_MEMORY;
      params.type |= CCV_TENSOR_GPU_MEMORY; // Change to GPU memory.
      CCV_TENSOR_SET_DEVICE_ID(params.type, copy_to_gpu_context->device_id);
      outputs[j] = outputs[j] ? ccv_nnc_tensor_resize(outputs[j], params) : ccv_nnc_tensor_new(0, params, 0);
      ccv_nnc_tensor_pin_memory(inputs[j]);
    }
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, copy_to_gpu_context->tuple.size, outputs, copy_to_gpu_context->tuple.size, stream_context);
  }
}

int ccv_cnnp_dataframe_copy_to_gpu(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const int tensor_offset, const int tensor_size, const int device_id, const char* name)
{
  assert(tensor_size > 0);
  int stream_type = CCV_STREAM_CONTEXT_GPU;
  CCV_STREAM_SET_DEVICE_ID(stream_type, device_id);
  ccv_cnnp_copy_to_gpu_context_t* const copy_to_gpu_context = (ccv_cnnp_copy_to_gpu_context_t*)ccmalloc(sizeof(ccv_cnnp_copy_to_gpu_context_t));
  copy_to_gpu_context->tuple.size = tensor_size;
  copy_to_gpu_context->tensor_offset = tensor_offset;
  copy_to_gpu_context->device_id = device_id;
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_copy_to_gpu, stream_type, _ccv_cnnp_tensor_list_deinit, COLUMN_ID_LIST(column_idx), copy_to_gpu_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
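
// Editorial usage sketch (not part of the original file; needs a CUDA-enabled
// build). ccv_cnnp_dataframe_copy_to_gpu expects a column whose rows are
// tuples of CPU tensors, e.g. the batched column produced by
// ccv_cnnp_dataframe_combine_new at the end of this file. It derives a column
// carrying the same tuple mirrored on the given GPU device:
static void _example_copy_to_gpu(ccv_cnnp_dataframe_t* const batched, const int tuple_column)
{
  // Mirror 2 tensors per row, starting at tuple offset 0, onto device 0.
  const int gpu_column = ccv_cnnp_dataframe_copy_to_gpu(batched, tuple_column, 0, 2, 0, "to_gpu");
  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batched, COLUMN_ID_LIST(gpu_column));
  void* datum = 0;
  ccv_cnnp_dataframe_iter_next(iter, &datum, 1, 0);
  ccv_nnc_tensor_t** const gpu_tensors = (ccv_nnc_tensor_t**)datum; // Now resident in GPU memory.
  (void)gpu_tensors;
  ccv_cnnp_dataframe_iter_free(iter);
}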

// MARK - Use Command to Generate Output Tuple

typedef struct {
  ccv_cnnp_dataframe_tuple_t tuple;
  int input_offset;
  int input_size;
  ccv_nnc_cmd_t cmd;
  ccv_nnc_hint_t hint;
  int flags;
  ccv_nnc_tensor_param_t output_params[1];
} ccv_cnnp_cmd_exec_context_t;

static void _ccv_cnnp_dataframe_cmd_exec(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  const ccv_cnnp_cmd_exec_context_t* const cmd_exec_context = (ccv_cnnp_cmd_exec_context_t*)context;
  int i, j;
  for (i = 0; i < batch_size; i++)
  {
    ccv_nnc_tensor_t* const* const inputs = (ccv_nnc_tensor_t* const*)column_data[0][i] + cmd_exec_context->input_offset;
    ccv_nnc_tensor_t** outputs = (ccv_nnc_tensor_t**)data[i];
    if (!outputs)
    {
      outputs = (ccv_nnc_tensor_t**)(data[i] = ccmalloc(sizeof(ccv_nnc_tensor_t*) * cmd_exec_context->tuple.size));
      for (j = 0; j < cmd_exec_context->tuple.size; j++)
        outputs[j] = ccv_nnc_tensor_new(0, cmd_exec_context->output_params[j], 0);
    }
    ccv_nnc_cmd_exec(cmd_exec_context->cmd, cmd_exec_context->hint, cmd_exec_context->flags, inputs, cmd_exec_context->input_size, outputs, cmd_exec_context->tuple.size, stream_context);
  }
}

int ccv_cnnp_dataframe_cmd_exec(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, const int input_offset, const int input_size, const ccv_nnc_tensor_param_t* const output_params, const int output_size, const int stream_type, const char* name)
{
  assert(input_size > 0);
  assert(output_size > 0);
  ccv_cnnp_cmd_exec_context_t* const cmd_exec_context = (ccv_cnnp_cmd_exec_context_t*)ccmalloc(sizeof(ccv_cnnp_cmd_exec_context_t) + sizeof(ccv_nnc_tensor_param_t) * (output_size - 1));
  cmd_exec_context->tuple.size = output_size;
  cmd_exec_context->input_offset = input_offset;
  cmd_exec_context->input_size = input_size;
  cmd_exec_context->cmd = cmd;
  cmd_exec_context->hint = hint;
  cmd_exec_context->flags = flags;
  memcpy(cmd_exec_context->output_params, output_params, sizeof(ccv_nnc_tensor_param_t) * output_size);
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_dataframe_cmd_exec, stream_type, _ccv_cnnp_tensor_list_deinit, COLUMN_ID_LIST(column_idx), cmd_exec_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
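
// Editorial usage sketch (not part of the original file). Given a column whose
// rows are tuples of CPU tensors, ccv_cnnp_dataframe_cmd_exec derives a column
// that runs one command per row. The command and the 224x224x3 output shape
// below are assumptions for illustration; CMD_DATATYPE_CONVERSION_FORWARD is
// the nnc command for 32F -> 16F conversion:
static int _example_cmd_exec(ccv_cnnp_dataframe_t* const dataframe, const int tuple_column)
{
  const ccv_nnc_tensor_param_t output_params = CPU_TENSOR_NHWC(16F, 224, 224, 3);
  // Read 1 input tensor at tuple offset 0, emit 1 output tensor per row, on the CPU stream (0).
  return ccv_cnnp_dataframe_cmd_exec(dataframe, tuple_column, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &output_params, 1, 0, "to_fp16");
}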

// MARK - Make Auxiliary Tensor as a new Column

static void _ccv_cnnp_tensor_deinit(void* const data, void* const context)
{
  ccv_nnc_tensor_free((ccv_nnc_tensor_t*)data);
}

static void _ccv_cnnp_tensor_new(const int column_idx, const int* const row_idxs, const int row_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_nnc_tensor_param_t params = *(ccv_nnc_tensor_param_t*)context;
  int i;
  for (i = 0; i < row_size; i++)
    if (!data[i])
      data[i] = ccv_nnc_tensor_new(0, params, 0);
}

int ccv_cnnp_dataframe_add_aux(ccv_cnnp_dataframe_t* const dataframe, const ccv_nnc_tensor_param_t params, const char* name)
{
  int stream_type = CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_CPU_MEMORY ? 0 : CCV_STREAM_CONTEXT_GPU;
  if (stream_type == CCV_STREAM_CONTEXT_GPU)
    CCV_STREAM_SET_DEVICE_ID(stream_type, CCV_TENSOR_GET_DEVICE_ID(params.type));
  ccv_nnc_tensor_param_t* const context = (ccv_nnc_tensor_param_t*)ccmalloc(sizeof(ccv_nnc_tensor_param_t));
  context[0] = params;
  return ccv_cnnp_dataframe_add(dataframe, _ccv_cnnp_tensor_new, stream_type, _ccv_cnnp_tensor_deinit, context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
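
// Editorial usage sketch (not part of the original file). add_aux appends a
// column of lazily allocated per-row scratch tensors with the given
// parameters (the 128-wide 32F shape is an arbitrary illustration); the rows
// are reused across iterations rather than re-created:
static int _example_add_aux(ccv_cnnp_dataframe_t* const dataframe)
{
  return ccv_cnnp_dataframe_add_aux(dataframe, CPU_TENSOR_NHWC(32F, 128), "scratch");
}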

// MARK - Load Tensor from File Path

static void _ccv_cnnp_image_deinit(void* const data, void* const context)
{
  ccv_matrix_free(data);
}

static void _ccv_cnnp_read_image(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  parallel_for(i, batch_size) {
    if (data[i])
      ccv_matrix_free(data[i]);
    off_t structof = (off_t)context;
    const char* const filename = *(char* const*)((const char*)column_data[0][i] + structof);
    data[i] = 0;
    ccv_read(filename, (ccv_dense_matrix_t**)&data[i], CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
  } parallel_endfor
}

int ccv_cnnp_dataframe_read_image(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const off_t structof, const char* name)
{
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_read_image, 0, _ccv_cnnp_image_deinit, COLUMN_ID_LIST(column_idx), (void*)(uintptr_t)structof, 0, name);
}
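
// Editorial usage sketch (not part of the original file). read_image assumes
// the source column holds structs containing a filename pointer at a known
// offset; ccv_file_record_t below is a hypothetical layout for illustration:
typedef struct {
  const char* filename;
  int label;
} ccv_file_record_t; // Hypothetical record type, illustration only.

static int _example_read_image(ccv_cnnp_dataframe_t* const dataframe, const int record_column)
{
  // Derive a column of ccv_dense_matrix_t* decoded from each record's filename.
  return ccv_cnnp_dataframe_read_image(dataframe, record_column, offsetof(ccv_file_record_t, filename), "image");
}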

// MARK - Apply Random Jitter to Image

typedef struct {
  sfmt_t sfmt;
  int datatype;
  ccv_cnnp_random_jitter_t random_jitter;
} ccv_cnnp_random_jitter_context_t;

static void _ccv_cnnp_image_lighting(ccv_dense_matrix_t* image, const float alpha_r, const float alpha_g, const float alpha_b)
{
  assert(CCV_GET_DATA_TYPE(image->type) == CCV_32F);
  assert(CCV_GET_CHANNEL(image->type) == CCV_C3);
  // These eigenvector values can be computed from the ImageNet dataset (see ccv_convnet for how that is done). Here I just copied
  // from mxnet: https://github.com/apache/incubator-mxnet/blob/master/src/operator/image/image_random-inl.h#L632
  const float pca_r = alpha_r * (55.46 * -0.5675) + alpha_g * (4.794 * 0.7192) + alpha_b * (1.148 * 0.4009);
  const float pca_g = alpha_r * (55.46 * -0.5808) + alpha_g * (4.794 * -0.0045) + alpha_b * (1.148 * -0.8140);
  const float pca_b = alpha_r * (55.46 * -0.5836) + alpha_g * (4.794 * -0.6948) + alpha_b * (1.148 * 0.4203);
  int i;
  const int size = image->rows * image->cols;
  float* const ptr = image->data.f32;
  for (i = 0; i < size; i++)
  {
    ptr[i * 3] = ccv_clamp(ptr[i * 3] + pca_r, 0, 255);
    ptr[i * 3 + 1] = ccv_clamp(ptr[i * 3 + 1] + pca_g, 0, 255);
    ptr[i * 3 + 2] = ccv_clamp(ptr[i * 3 + 2] + pca_b, 0, 255);
  }
}

static float _ccv_cnnp_random_logexp(sfmt_t* const sfmt, const float jitter)
{
  // We want something on a logarithmic scale, thus 0 is no good, infinity is no good, and 1 means unchanged.
  // jitter is some turbulence we want around 1. We want the range to be around [1 / (1 + jitter), 1 + jitter],
  // but the distribution is not uniform (50% falls under 1 and 50% falls above 1). The way to do this is to first
  // map to the logarithmic range, do a uniform sampling there, and then convert back.
  double log_jitter_limit = log(1 + jitter);
  double log_random_jitter = sfmt_genrand_real1(sfmt) * 2 * log_jitter_limit - log_jitter_limit;
  return (float)exp(log_random_jitter); // Convert it back to exponential form.
}

static void _ccv_cnnp_image_manip(ccv_dense_matrix_t* image, const ccv_cnnp_random_jitter_t random_jitter, sfmt_t* const sfmt)
{
  assert(sfmt && CCV_GET_CHANNEL(image->type) == CCV_C3);
  int idx[4] = {0, 1, 2, 3};
  sfmt_genrand_shuffle(sfmt, idx, 4, sizeof(int));
  int i;
  for (i = 0; i < 4; i++)
    // change the applying order
    switch (idx[i])
    {
      case 0:
        if (random_jitter.brightness == 0)
          break;
        // introduce some brightness changes to the original image
        ccv_scale(image, (ccv_matrix_t**)&image, 0, _ccv_cnnp_random_logexp(sfmt, random_jitter.brightness));
        break;
      case 1:
        // introduce some saturation changes to the original image
        if (random_jitter.saturation == 0)
          break;
        ccv_saturation(image, &image, 0, _ccv_cnnp_random_logexp(sfmt, random_jitter.saturation));
        break;
      case 2:
        // introduce some contrast changes to the original image
        if (random_jitter.contrast == 0)
          break;
        ccv_contrast(image, &image, 0, _ccv_cnnp_random_logexp(sfmt, random_jitter.contrast));
        break;
      case 3:
        if (random_jitter.lighting == 0)
          break;
        _ccv_cnnp_image_lighting(image, sfmt_genrand_real1(sfmt) * random_jitter.lighting, sfmt_genrand_real1(sfmt) * random_jitter.lighting, sfmt_genrand_real1(sfmt) * random_jitter.lighting);
        break;
    }
}

static void _ccv_cnnp_normalize(ccv_dense_matrix_t* const image, const float mean[3], const float inv_std[3])
{
  int i;
  const int count = image->rows * image->cols;
  float* ap = image->data.f32;
  for (i = 0; i < count; i++)
  {
    ap[i * 3] = (ap[i * 3] - mean[0]) * inv_std[0];
    ap[i * 3 + 1] = (ap[i * 3 + 1] - mean[1]) * inv_std[1];
    ap[i * 3 + 2] = (ap[i * 3 + 2] - mean[2]) * inv_std[2];
  }
}

static void _ccv_cnnp_random_jitter(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  sfmt_t* const sfmt = (sfmt_t*)ccmalloc(sizeof(sfmt_t) * batch_size);
  ccv_cnnp_random_jitter_context_t* const ctx = (ccv_cnnp_random_jitter_context_t*)context;
  int i;
  for (i = 0; i < batch_size; i++)
    sfmt_init_gen_rand(&sfmt[i], sfmt_genrand_uint32(&ctx->sfmt));
  const ccv_cnnp_random_jitter_t random_jitter = ctx->random_jitter;
  assert(random_jitter.resize.min > 0);
  assert(random_jitter.resize.max >= random_jitter.resize.min);
  parallel_for(i, batch_size) {
    if (data[i])
      ccv_matrix_free(data[i]);
    ccv_dense_matrix_t* const input = (ccv_dense_matrix_t*)column_data[0][i];
    const int resize = ccv_clamp((int)(sfmt_genrand_real1(&sfmt[i]) * (random_jitter.resize.max - random_jitter.resize.min) + 0.5) + random_jitter.resize.min, random_jitter.resize.min, random_jitter.resize.max);
    int resize_rows = ccv_max(resize, (int)(input->rows * (float)resize / input->cols + 0.5));
    int resize_cols = ccv_max(resize, (int)(input->cols * (float)resize / input->rows + 0.5));
    if (random_jitter.aspect_ratio > 0)
    {
      const float aspect_ratio = sqrtf(_ccv_cnnp_random_logexp(&sfmt[i], random_jitter.aspect_ratio));
      resize_rows = (int)(resize_rows * aspect_ratio + 0.5);
      resize_cols = (int)(resize_cols / aspect_ratio + 0.5);
    }
    if (random_jitter.resize.roundup > 0)
    {
      const int roundup = random_jitter.resize.roundup;
      const int roundup_2 = roundup / 2;
      resize_rows = (resize_rows + roundup_2) / roundup * roundup;
      resize_cols = (resize_cols + roundup_2) / roundup * roundup;
    }
    const int need_crop = (random_jitter.size.cols > 0 && random_jitter.size.rows > 0 &&
      ((resize_cols != random_jitter.size.cols || resize_rows != random_jitter.size.rows) ||
       (random_jitter.offset.x != 0 || random_jitter.offset.y != 0)));
    int cropped = 0, crop_x = 0, crop_y = 0;
    ccv_dense_matrix_t* sliced = 0;
    if (need_crop)
    {
      // Compute crop x, y.
      crop_x = random_jitter.center_crop ?
        (resize_cols - random_jitter.size.cols + 1) / 2 : // Otherwise, randomly select x.
        (int)(sfmt_genrand_real1(&sfmt[i]) * (resize_cols - random_jitter.size.cols + 1));
      crop_x = ccv_clamp(crop_x,
        ccv_min(0, resize_cols - random_jitter.size.cols),
        ccv_max(0, resize_cols - random_jitter.size.cols));
      crop_y = random_jitter.center_crop ?
        (resize_rows - random_jitter.size.rows + 1) / 2 : // Otherwise, randomly select y.
        (int)(sfmt_genrand_real1(&sfmt[i]) * (resize_rows - random_jitter.size.rows + 1));
      crop_y = ccv_clamp(crop_y,
        ccv_min(0, resize_rows - random_jitter.size.rows),
        ccv_max(0, resize_rows - random_jitter.size.rows));
      if (random_jitter.offset.x != 0)
        crop_x += sfmt_genrand_real1(&sfmt[i]) * random_jitter.offset.x * 2 - random_jitter.offset.x;
      if (random_jitter.offset.y != 0)
        crop_y += sfmt_genrand_real1(&sfmt[i]) * random_jitter.offset.y * 2 - random_jitter.offset.y;
      // If we can fill the whole view (not introducing any 0 padding), we can first crop and then scale down / up.
      if (resize_cols >= random_jitter.size.cols && resize_rows >= random_jitter.size.rows)
      {
        const float scale_x = (float)input->cols / resize_cols;
        const float scale_y = (float)input->rows / resize_rows;
        const int slice_cols = (int)(random_jitter.size.cols * scale_x + 0.5);
        const int slice_rows = (int)(random_jitter.size.rows * scale_y + 0.5);
        assert(slice_cols <= input->cols);
        assert(slice_rows <= input->rows);
        const int x = ccv_clamp((int)(crop_x * scale_x + 0.5), 0, input->cols - slice_cols);
        const int y = ccv_clamp((int)(crop_y * scale_y + 0.5), 0, input->rows - slice_rows);
        ccv_slice(input, (ccv_matrix_t**)&sliced, 0, y, x, slice_rows, slice_cols);
        resize_cols = random_jitter.size.cols;
        resize_rows = random_jitter.size.rows;
        cropped = 1;
      } else
        sliced = input;
    } else
      sliced = input;
    ccv_dense_matrix_t* resized = 0;
    // Resize.
    if (sliced->rows >= resize_rows && sliced->cols >= resize_cols)
    {
      // If we can fill the whole view, we can first crop and then scale down / up.
      ccv_resample(sliced, &resized, CCV_32F, (double)resize_rows / (double)sliced->rows, (double)resize_cols / (double)sliced->cols, CCV_INTER_AREA);
    } else if (sliced->rows != resize_rows || sliced->cols != resize_cols) {
      ccv_resample(sliced, &resized, CCV_32F, (double)resize_rows / (double)sliced->rows, (double)resize_cols / (double)sliced->cols, CCV_INTER_CUBIC);
    } else {
      ccv_shift(sliced, (ccv_matrix_t**)&resized, CCV_32F, 0, 0); // converting to 32f
    }
    if (sliced != input)
      ccv_matrix_free(sliced);
    if (random_jitter.symmetric && (sfmt_genrand_uint32(&sfmt[i]) & 1) == 0)
      ccv_flip(resized, &resized, 0, CCV_FLIP_X);
    _ccv_cnnp_image_manip(resized, random_jitter, &sfmt[i]);
    // Apply normalization. Slice can introduce 0 padding, which won't be correct before normalization.
    if (random_jitter.normalize.mean[0] != 0 || random_jitter.normalize.std[0] != 1 ||
      random_jitter.normalize.mean[1] != 0 || random_jitter.normalize.std[1] != 1 ||
      random_jitter.normalize.mean[2] != 0 || random_jitter.normalize.std[2] != 1)
      _ccv_cnnp_normalize(resized, random_jitter.normalize.mean, random_jitter.normalize.std);
    // If we haven't cropped in the previous step (likely because the resize-down introduced fill-ins),
    // do the crop now.
    ccv_dense_matrix_t* patch = 0;
    if (!cropped && need_crop)
    {
      ccv_slice(resized, (ccv_matrix_t**)&patch, CCV_32F, crop_y, crop_x, random_jitter.size.rows, random_jitter.size.cols);
      ccv_matrix_free(resized);
    } else
      patch = resized;
    assert(!ccv_any_nan(patch));
    data[i] = patch;
  } parallel_endfor
  ccfree(sfmt);
}

int ccv_cnnp_dataframe_image_random_jitter(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const int datatype, const ccv_cnnp_random_jitter_t random_jitter, const char* name)
{
  assert(datatype == CCV_32F);
  ccv_cnnp_random_jitter_context_t* const random_jitter_context = (ccv_cnnp_random_jitter_context_t*)ccmalloc(sizeof(ccv_cnnp_random_jitter_context_t));
  if (random_jitter.seed)
    sfmt_init_gen_rand(&random_jitter_context->sfmt, (uint32_t)random_jitter.seed);
  else
    sfmt_init_gen_rand(&random_jitter_context->sfmt, ccv_nnc_stream_context_genrand_uint32(0));
  random_jitter_context->datatype = datatype;
  random_jitter_context->random_jitter = random_jitter;
  int i;
  // The std in the random jitter should be inv_std.
  for (i = 0; i < 3; i++)
    random_jitter_context->random_jitter.normalize.std[i] = random_jitter_context->random_jitter.normalize.std[i] ? 1. / random_jitter_context->random_jitter.normalize.std[i] : 1;
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_random_jitter, 0, _ccv_cnnp_image_deinit, COLUMN_ID_LIST(column_idx), random_jitter_context, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
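
// Editorial usage sketch (not part of the original file). The jitter
// parameters below are illustrative only, loosely following ImageNet-style
// augmentation: resize the short edge into [256, 480], crop to 224x224,
// mirror horizontally, and normalize per channel:
static int _example_image_random_jitter(ccv_cnnp_dataframe_t* const dataframe, const int image_column)
{
  const ccv_cnnp_random_jitter_t random_jitter = {
    .resize = {
      .min = 256,
      .max = 480,
    },
    .size = {
      .rows = 224,
      .cols = 224,
    },
    .symmetric = 1,
    .normalize = {
      .mean = {123.68, 116.779, 103.939},
      .std = {58.393, 57.12, 57.375},
    },
  };
  return ccv_cnnp_dataframe_image_random_jitter(dataframe, image_column, CCV_32F, random_jitter, "jitter");
}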

typedef struct {
  int range;
  int datatype;
  int format;
  float onval;
  float offval;
  off_t structof;
} ccv_cnnp_one_hot_context_t;

static void _ccv_cnnp_one_hot(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_cnnp_one_hot_context_t* const one_hot = (ccv_cnnp_one_hot_context_t*)context;
  ccv_nnc_tensor_param_t params = {
    .datatype = one_hot->datatype,
    .type = CCV_TENSOR_CPU_MEMORY,
    .format = one_hot->format,
    .dim = {
      one_hot->range,
    },
  };
  parallel_for(i, batch_size) {
    int j;
    const int label = *(const int*)((const char*)column_data[0][i] + one_hot->structof);
    if (!data[i])
      data[i] = ccv_nnc_tensor_new(0, params, 0);
    ccv_nnc_tensor_t* const tensor = (ccv_nnc_tensor_t*)data[i];
    assert(label >= 0 && label < one_hot->range);
    if (tensor->info.datatype == CCV_32F)
      for (j = 0; j < one_hot->range; j++)
        tensor->data.f32[j] = (j == label) ? one_hot->onval : one_hot->offval;
    else if (tensor->info.datatype == CCV_16F)
      for (j = 0; j < one_hot->range; j++)
        ccv_float_to_half_precision((j == label) ? &one_hot->onval : &one_hot->offval, (uint16_t*)(tensor->data.f16 + j), 1);
    else
      { assert(0); }
  } parallel_endfor
}

int ccv_cnnp_dataframe_one_hot(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const off_t structof, const int range, const float onval, const float offval, const int datatype, const int format, const char* name)
{
  assert(datatype == CCV_32F || datatype == CCV_16F);
  ccv_cnnp_one_hot_context_t* const one_hot = (ccv_cnnp_one_hot_context_t*)ccmalloc(sizeof(ccv_cnnp_one_hot_context_t));
  one_hot->range = range;
  one_hot->datatype = datatype;
  one_hot->format = format;
  one_hot->onval = onval;
  one_hot->offval = offval;
  one_hot->structof = structof;
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_one_hot, 0, _ccv_cnnp_tensor_deinit, COLUMN_ID_LIST(column_idx), one_hot, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
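
// Editorial usage sketch (not part of the original file), reusing the
// hypothetical ccv_file_record_t defined above: derive a 1000-way one-hot 32F
// tensor from each record's int label (1 at the label index, 0 elsewhere):
static int _example_one_hot(ccv_cnnp_dataframe_t* const dataframe, const int record_column)
{
  return ccv_cnnp_dataframe_one_hot(dataframe, record_column, offsetof(ccv_file_record_t, label), 1000, 1, 0, CCV_32F, CCV_TENSOR_FORMAT_NHWC, "one_hot");
}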

typedef struct {
  int from_dt;
  int to_dt;
  int format;
  off_t structof;
} ccv_cnnp_copy_scalar_context_t;

static void _ccv_cnnp_copy_scalar(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_cnnp_copy_scalar_context_t* const copy_scalar = (ccv_cnnp_copy_scalar_context_t*)context;
  ccv_nnc_tensor_param_t params = {
    .datatype = copy_scalar->to_dt,
    .type = CCV_TENSOR_CPU_MEMORY,
    .format = copy_scalar->format,
    .dim = {1},
  };
  parallel_for(i, batch_size) {
    const ccv_numeric_data_t value = {
      .u8 = (unsigned char*)((const char*)column_data[0][i] + copy_scalar->structof),
    };
    if (!data[i])
      data[i] = ccv_nnc_tensor_new(0, params, 0);
    ccv_nnc_tensor_t* const tensor = (ccv_nnc_tensor_t*)data[i];
    if (copy_scalar->from_dt == CCV_32S)
    {
      if (tensor->info.datatype == CCV_32F)
        tensor->data.f32[0] = value.i32[0];
      else if (tensor->info.datatype == CCV_16F) {
        float fval = value.i32[0];
        ccv_float_to_half_precision(&fval, (uint16_t*)tensor->data.f16, 1);
      }
    } else if (copy_scalar->from_dt == CCV_32F) {
      if (tensor->info.datatype == CCV_32F)
        tensor->data.f32[0] = value.f32[0];
      else if (tensor->info.datatype == CCV_16F)
        ccv_float_to_half_precision(value.f32, (uint16_t*)tensor->data.f16, 1);
    } else if (copy_scalar->from_dt == CCV_16F) {
      if (tensor->info.datatype == CCV_32F)
        ccv_half_precision_to_float((uint16_t*)value.f16, tensor->data.f32, 1);
      else if (tensor->info.datatype == CCV_16F)
        tensor->data.f16[0] = value.f16[0];
    }
  } parallel_endfor
}

CCV_WARN_UNUSED(int) ccv_cnnp_dataframe_copy_scalar(ccv_cnnp_dataframe_t* const dataframe, const int column_idx, const off_t structof, const int from_dt, const int to_dt, const int format, const char* name)
{
  assert(from_dt == CCV_32S || from_dt == CCV_32F || from_dt == CCV_16F);
  assert(to_dt == CCV_32F || to_dt == CCV_16F);
  ccv_cnnp_copy_scalar_context_t* const copy_scalar = (ccv_cnnp_copy_scalar_context_t*)ccmalloc(sizeof(ccv_cnnp_copy_scalar_context_t));
  copy_scalar->from_dt = from_dt;
  copy_scalar->to_dt = to_dt;
  copy_scalar->format = format;
  copy_scalar->structof = structof;
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_copy_scalar, 0, _ccv_cnnp_tensor_deinit, COLUMN_ID_LIST(column_idx), copy_scalar, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
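
// Editorial usage sketch (not part of the original file), again reusing the
// hypothetical ccv_file_record_t: copy_scalar lifts a plain scalar field out
// of each row's struct into a 1-element tensor, here widening the 32S label
// to 32F:
static int _example_copy_scalar(ccv_cnnp_dataframe_t* const dataframe, const int record_column)
{
  return ccv_cnnp_dataframe_copy_scalar(dataframe, record_column, offsetof(ccv_file_record_t, label), CCV_32S, CCV_32F, CCV_TENSOR_FORMAT_NHWC, "label");
}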

// MARK - Matrix of Ones

typedef struct {
  ccv_cnnp_dataframe_tuple_t tuple;
  int variable_size;
  int max_length;
} ccv_cnnp_one_squared_context_t;

static void _ccv_cnnp_one_squared(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_cnnp_one_squared_context_t* const ones = (ccv_cnnp_one_squared_context_t*)context;
  assert(ones->tuple.size == column_size);
  const int max_length = ones->max_length;
  if (ones->variable_size)
  {
    parallel_for(i, batch_size) {
      ccv_nnc_tensor_t* const first_seq = (ccv_nnc_tensor_t*)column_data[0][i];
      assert(first_seq->info.datatype == CCV_32S);
      const int first_len = ccv_nnc_tensor_count(first_seq->info);
      ccv_nnc_tensor_t** outputs = data[i];
      if (!outputs)
        outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(column_size, sizeof(ccv_nnc_tensor_t*)));
      int k;
      for (k = 0; k < column_size; k++)
        if (!outputs[k])
          outputs[k] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, first_len, max_length, max_length), 0);
      int max_len = 0;
      for (k = 0; k < column_size; k++)
      {
        ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[k][i];
        assert(seq->info.datatype == CCV_32S);
        const int len = ccv_nnc_tensor_count(seq->info);
        assert(len == first_len);
        const int* const ia = seq->data.i32;
        int l;
        for (l = 0; l < len; l++)
          max_len = ccv_max(max_len, ia[l]);
      }
      assert(max_len <= max_length);
      parallel_for(c, column_size) {
        ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[c][i];
        assert(seq->info.datatype == CCV_32S);
        const int len = ccv_nnc_tensor_count(seq->info);
        assert(len == first_len);
        ccv_nnc_tensor_t* tensor = outputs[c];
        tensor = ccv_nnc_tensor_resize(tensor, CPU_TENSOR_NHWC(32S, len, max_len, max_len));
        assert(outputs[c] == tensor); // Since we allocated with max_length, this cannot be reallocated.
        const int* const ia = seq->data.i32;
        parallel_for(j, len) {
          int x, y;
          int seq_len = ia[j];
          int* ib = tensor->data.i32 + j * max_len * max_len;
          for (y = 0; y < seq_len; y++)
          {
            for (x = 0; x < seq_len; x++)
              ib[x] = 1;
            for (x = seq_len; x < max_len; x++)
              ib[x] = 0;
            ib += max_len;
          }
          if (seq_len < max_len)
            memset(ib, 0, sizeof(int) * max_len * (max_len - seq_len));
        } parallel_endfor
      } parallel_endfor
    } parallel_endfor
  } else {
    parallel_for(i, batch_size) {
      ccv_nnc_tensor_t** outputs = data[i];
      ccv_nnc_tensor_t* const first_seq = (ccv_nnc_tensor_t*)column_data[0][i];
      assert(first_seq->info.datatype == CCV_32S);
      const int first_len = ccv_nnc_tensor_count(first_seq->info);
      if (!outputs)
        outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(column_size, sizeof(ccv_nnc_tensor_t*)));
      int k;
      for (k = 0; k < column_size; k++)
        if (!outputs[k])
          outputs[k] = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, first_len, max_length, max_length), 0);
      parallel_for(c, column_size) {
        ccv_nnc_tensor_t* const tensor = outputs[c];
        ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[c][i];
        assert(seq->info.datatype == CCV_32S);
        const int len = ccv_nnc_tensor_count(seq->info);
        assert(len == first_len);
        const int* const ia = seq->data.i32;
        parallel_for(j, len) {
          int x, y;
          int seq_len = ia[j];
          int* ib = tensor->data.i32 + j * max_length * max_length;
          for (y = 0; y < seq_len; y++)
          {
            for (x = 0; x < seq_len; x++)
              ib[x] = 1;
            for (x = seq_len; x < max_length; x++)
              ib[x] = 0;
            ib += max_length;
          }
          if (seq_len < max_length)
            memset(ib, 0, sizeof(int) * max_length * (max_length - seq_len));
        } parallel_endfor
      } parallel_endfor
    } parallel_endfor
  }
}

CCV_WARN_UNUSED(int) ccv_cnnp_dataframe_one_squared(ccv_cnnp_dataframe_t* const dataframe, const int* const column_idxs, const int column_idx_size, const int variable_size, const int max_length, const char* name)
{
  assert(max_length > 0);
  assert(variable_size == 0 || variable_size == 1);
  ccv_cnnp_one_squared_context_t* const ones = (ccv_cnnp_one_squared_context_t*)ccmalloc(sizeof(ccv_cnnp_one_squared_context_t));
  ones->tuple.size = column_idx_size;
  ones->variable_size = variable_size;
  ones->max_length = max_length;
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_one_squared, 0, _ccv_cnnp_tensor_list_deinit, column_idxs, column_idx_size, ones, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
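
// Editorial usage sketch (not part of the original file). one_squared maps a
// column of 32S sequence-length tensors to square 0/1 masks: for each length
// L in a row, the output holds an L x L block of ones, zero-padded up to the
// row maximum (variable_size == 1) or to max_length (variable_size == 0). The
// 512 cap below is an assumed value:
static int _example_one_squared(ccv_cnnp_dataframe_t* const dataframe, const int seq_len_column)
{
  return ccv_cnnp_dataframe_one_squared(dataframe, COLUMN_ID_LIST(seq_len_column), 1, 512, "mask");
}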

// MARK - Truncate Matrix

static void _ccv_cnnp_truncate(void* const* const* const column_data, const int column_size, const int batch_size, void** const data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  assert(column_size >= 2);
  assert(column_size % 2 == 0);
  const int tuple_size = column_size / 2;
  ccv_cnnp_dataframe_tuple_t* const tuple = (ccv_cnnp_dataframe_tuple_t*)context;
  assert(tuple->size == tuple_size);
  parallel_for(i, batch_size) {
    int k;
    ccv_nnc_tensor_t* const first_seq = (ccv_nnc_tensor_t*)column_data[tuple_size][i];
    assert(first_seq->info.datatype == CCV_32S);
    const int first_len = ccv_nnc_tensor_count(first_seq->info);
    int max_len = 0;
    for (k = 0; k < tuple_size; k++)
    {
      ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[tuple_size + k][i];
      assert(seq->info.datatype == CCV_32S);
      const int len = ccv_nnc_tensor_count(seq->info);
      assert(len == first_len);
      const int* const ia = seq->data.i32;
      int l;
      for (l = 0; l < len; l++)
        max_len = ccv_max(max_len, ia[l]);
    }
    ccv_nnc_tensor_t* const first_inp = (ccv_nnc_tensor_t*)column_data[0][i];
    ccv_nnc_tensor_param_t first_params = first_inp->info;
    assert(first_params.dim[0] == first_len);
    assert(max_len <= first_params.dim[1]);
    first_params.dim[1] = max_len;
    ccv_nnc_tensor_t** outputs = data[i];
    if (!outputs)
      outputs = (ccv_nnc_tensor_t**)(data[i] = cccalloc(tuple_size, sizeof(ccv_nnc_tensor_t*)));
    for (k = 0; k < tuple_size; k++)
    {
      if (!outputs[k])
        outputs[k] = ccv_nnc_tensor_new(0, first_params, 0);
      else
        outputs[k] = ccv_nnc_tensor_resize(outputs[k], first_params);
    }
    parallel_for(c, tuple_size) {
      ccv_nnc_tensor_t* const seq = (ccv_nnc_tensor_t*)column_data[tuple_size + c][i];
      assert(seq->info.datatype == CCV_32S);
      const int len = ccv_nnc_tensor_count(seq->info);
      ccv_nnc_tensor_t* const inp = (ccv_nnc_tensor_t*)column_data[c][i];
      ccv_nnc_tensor_param_t params = inp->info;
      assert(params.dim[0] == len);
      assert(first_len == len);
      assert(max_len <= params.dim[1]);
      assert(params.dim[2] == 0);
      const int ori_len = params.dim[1];
      ccv_nnc_tensor_t* const out = outputs[c];
      uint8_t* const ua = inp->data.u8;
      uint8_t* const ub = out->data.u8;
      size_t la = CCV_GET_DATA_TYPE_SIZE(params.datatype) * ori_len;
      size_t lb = CCV_GET_DATA_TYPE_SIZE(params.datatype) * max_len;
      parallel_for(j, len) {
        memcpy(ub + lb * j, ua + la * j, lb);
      } parallel_endfor
    } parallel_endfor
  } parallel_endfor
}

int ccv_cnnp_dataframe_truncate(ccv_cnnp_dataframe_t* const dataframe, const int* const vec_idxs, const int vec_idx_size, const int* const len_idxs, const int len_idx_size, const char* name)
{
  const int total_idx_size = vec_idx_size + len_idx_size;
  assert(total_idx_size > 0);
  assert(vec_idx_size == len_idx_size);
  int total_idxs[total_idx_size];
  memcpy(total_idxs, vec_idxs, sizeof(int) * vec_idx_size);
  memcpy(total_idxs + vec_idx_size, len_idxs, sizeof(int) * len_idx_size);
  ccv_cnnp_dataframe_tuple_t* const tuple = (ccv_cnnp_dataframe_tuple_t*)ccmalloc(sizeof(ccv_cnnp_dataframe_tuple_t));
  tuple->size = vec_idx_size;
  return ccv_cnnp_dataframe_map(dataframe, _ccv_cnnp_truncate, 0, _ccv_cnnp_tensor_list_deinit, total_idxs, total_idx_size, tuple, (ccv_cnnp_column_data_context_deinit_f)ccfree, name);
}
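
// Editorial usage sketch (not part of the original file). truncate pairs each
// vector column with a 32S length column and trims the vectors' second
// dimension down to the batch maximum of those lengths, dropping padding the
// stored rows may carry:
static int _example_truncate(ccv_cnnp_dataframe_t* const dataframe, const int vec_column, const int len_column)
{
  return ccv_cnnp_dataframe_truncate(dataframe, COLUMN_ID_LIST(vec_column), COLUMN_ID_LIST(len_column), "truncate");
}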

// MARK - Batching

typedef struct {
  ccv_cnnp_dataframe_tuple_t tuple;
  int format;
  int batch_count;
  int group_count;
} ccv_cnnp_batch_context_t;

static void _ccv_cnnp_combine_new(void* const* const input_data, const int input_size, void** const output_data, void* const context, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_cnnp_batch_context_t* const batch = (ccv_cnnp_batch_context_t*)context;
  const int output_tuple_size = batch->tuple.size;
  const int batch_count = batch->batch_count;
  const int group_count = batch->group_count;
  const int input_tuple_size = output_tuple_size / group_count;
  int i, j, k;
  assert(input_size > 0);
  if (!output_data[0])
  {
    ccv_nnc_tensor_t** const inputs = (ccv_nnc_tensor_t**)input_data[0];
    ccv_nnc_tensor_t** const tensors = (ccv_nnc_tensor_t**)(output_data[0] = ccmalloc(sizeof(ccv_nnc_tensor_t*) * output_tuple_size));
    for (i = 0; i < group_count; i++)
      for (j = 0; j < input_tuple_size; j++)
      {
        ccv_nnc_tensor_param_t params = inputs[j]->info;
        assert(params.datatype == CCV_32F || params.datatype == CCV_32S || params.datatype == CCV_16F); // Only 32-bit float, 32-bit int and 16-bit float are supported.
        assert(params.format == CCV_TENSOR_FORMAT_NHWC || params.format == CCV_TENSOR_FORMAT_NCHW);
        params.format = batch->format;
        // Special case when the dim count is 3 or 1; in these two cases, the N dimension is not provided.
        if (batch->format == inputs[j]->info.format)
        {
          const int nd = ccv_nnc_tensor_nd(params.dim);
          memset(params.dim, 0, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
          memcpy(params.dim + 1, inputs[j]->info.dim, sizeof(int) * nd);
        } else {
          const int nd = ccv_nnc_tensor_nd(params.dim);
          if (nd < 3)
          {
            memset(params.dim, 0, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
            memcpy(params.dim + 1, inputs[j]->info.dim, sizeof(int) * nd);
          } else if (nd >= 3) {
            memset(params.dim, 0, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
            const int hw = ccv_nnc_tensor_hw(inputs[j]->info, nd);
            if (batch->format == CCV_TENSOR_FORMAT_NCHW)
            {
              params.dim[1] = ccv_nnc_tensor_get_c(inputs[j]->info);
              for (k = 0; k < CCV_NNC_MAX_DIM; k++)
                params.dim[k + 2] = inputs[j]->info.dim[k + hw];
            } else {
              params.dim[CCV_NNC_MAX_DIM + 1] = ccv_nnc_tensor_get_c(inputs[j]->info);
              for (k = 0; k < CCV_NNC_MAX_DIM; k++)
                params.dim[k + 1] = inputs[j]->info.dim[k + hw];
            }
          }
        }
        params.dim[0] = batch_count; // Set the batch count now.
        tensors[i * input_tuple_size + j] = ccv_nnc_tensor_new(0, params, 0);
      }
  }
  for (i = 0; i < group_count; i++)
    for (j = 0; j < input_tuple_size; j++)
    {
      ccv_nnc_tensor_t* const output = ((ccv_nnc_tensor_t**)output_data[0])[i * input_tuple_size + j];
      parallel_for(k, batch_count) {
        ccv_nnc_tensor_t* const input = ((ccv_nnc_tensor_t**)input_data[(k + i * batch_count) % input_size])[j];
        const size_t tensor_count = ccv_nnc_tensor_count(input->info);
        if (input->info.datatype == CCV_32F)
        {
          float* const ap = input->data.f32;
          float* const bp = output->data.f32 + k * tensor_count;
          if (input->info.format == output->info.format)
            memcpy(bp, ap, sizeof(float) * tensor_count);
          else {
            // Do a simple format conversion.
            const int c = ccv_nnc_tensor_get_c(input->info);
            assert(c > 0);
            const size_t hw_count = tensor_count / c;
            size_t x;
            int y;
            if (input->info.format == CCV_TENSOR_FORMAT_NHWC && output->info.format == CCV_TENSOR_FORMAT_NCHW)
              for (x = 0; x < hw_count; x++)
                for (y = 0; y < c; y++)
                  bp[y * hw_count + x] = ap[x * c + y];
            else if (input->info.format == CCV_TENSOR_FORMAT_NCHW && output->info.format == CCV_TENSOR_FORMAT_NHWC)
              for (x = 0; x < hw_count; x++)
                for (y = 0; y < c; y++)
                  bp[x * c + y] = ap[y * hw_count + x];
          }
        } else if (input->info.datatype == CCV_32S) {
          int* const ap = input->data.i32;
          int* const bp = output->data.i32 + k * tensor_count;
          if (input->info.format == output->info.format)
            memcpy(bp, ap, sizeof(int) * tensor_count);
          else {
            // Do a simple format conversion.
            const int c = ccv_nnc_tensor_get_c(input->info);
            assert(c > 0);
            const size_t hw_count = tensor_count / c;
            size_t x;
            int y;
            if (input->info.format == CCV_TENSOR_FORMAT_NHWC && output->info.format == CCV_TENSOR_FORMAT_NCHW)
              for (x = 0; x < hw_count; x++)
                for (y = 0; y < c; y++)
                  bp[y * hw_count + x] = ap[x * c + y];
            else if (input->info.format == CCV_TENSOR_FORMAT_NCHW && output->info.format == CCV_TENSOR_FORMAT_NHWC)
              for (x = 0; x < hw_count; x++)
                for (y = 0; y < c; y++)
                  bp[x * c + y] = ap[y * hw_count + x];
          }
        } else if (input->info.datatype == CCV_16F) {
          ccv_float16_t* const ap = input->data.f16;
          ccv_float16_t* const bp = output->data.f16 + k * tensor_count;
          if (input->info.format == output->info.format)
            memcpy(bp, ap, sizeof(ccv_float16_t) * tensor_count);
          else {
            // Do a simple format conversion.
            const int c = ccv_nnc_tensor_get_c(input->info);
            assert(c > 0);
            const size_t hw_count = tensor_count / c;
            size_t x;
            int y;
            if (input->info.format == CCV_TENSOR_FORMAT_NHWC && output->info.format == CCV_TENSOR_FORMAT_NCHW)
              for (x = 0; x < hw_count; x++)
                for (y = 0; y < c; y++)
                  bp[y * hw_count + x] = ap[x * c + y];
            else if (input->info.format == CCV_TENSOR_FORMAT_NCHW && output->info.format == CCV_TENSOR_FORMAT_NHWC)
              for (x = 0; x < hw_count; x++)
                for (y = 0; y < c; y++)
                  bp[x * c + y] = ap[y * hw_count + x];
          }
        } else {
          assert(0);
        }
      } parallel_endfor
    }
}

static void _ccv_cnnp_combine_deinit(void* const self, void* const context)
{
  ccv_cnnp_batch_context_t* const batch = (ccv_cnnp_batch_context_t*)context;
  ccv_nnc_tensor_t** const tensors = (ccv_nnc_tensor_t**)self;
  const int size = batch->tuple.size;
  int i;
  for (i = 0; i < size; i++)
    ccv_nnc_tensor_free(tensors[i]);
  ccfree(tensors);
}

ccv_cnnp_dataframe_t* ccv_cnnp_dataframe_combine_new(ccv_cnnp_dataframe_t* const dataframe, const int* const column_idxs, const int column_idx_size, const int batch_count, const int group_count, const int format)
{
  assert(format == CCV_TENSOR_FORMAT_NCHW || format == CCV_TENSOR_FORMAT_NHWC);
  assert(column_idx_size >= 1);
  assert(batch_count > 0);
  assert(group_count > 0);
  const int derived = ccv_cnnp_dataframe_make_tuple(dataframe, column_idxs, column_idx_size, 0);
  ccv_cnnp_batch_context_t* const batch = (ccv_cnnp_batch_context_t*)ccmalloc(sizeof(ccv_cnnp_batch_context_t));
  batch->tuple.size = column_idx_size * group_count;
  batch->format = format;
  batch->batch_count = batch_count;
  batch->group_count = group_count;
  return ccv_cnnp_dataframe_sample_new(dataframe, _ccv_cnnp_combine_new, _ccv_cnnp_combine_deinit, derived, batch_count * group_count, batch, (ccv_cnnp_column_data_context_deinit_f)ccfree);
}
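
// Editorial usage sketch (not part of the original file). combine_new folds
// batch_count consecutive rows into one row of batched tensors; a typical
// pipeline batches an image column and a one-hot label column into NCHW
// tensors with dim[0] == 64 (the column indices are placeholders):
static ccv_cnnp_dataframe_t* _example_combine(ccv_cnnp_dataframe_t* const dataframe, const int image_column, const int one_hot_column)
{
  // group_count == 1: the derived dataframe has one tuple column of two batched tensors per row.
  return ccv_cnnp_dataframe_combine_new(dataframe, COLUMN_ID_LIST(image_column, one_hot_column), 64, 1, CCV_TENSOR_FORMAT_NCHW);
}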