Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/test/int/nnc/cifar.tests.c
Line| Count|Source
   1|      |#include "case.h"
   2|      |#include "ccv_case.h"
   3|      |#include "ccv_nnc_case.h"
   4|      |#include <ccv.h>
   5|      |#include <ccv_internal.h>
   6|      |#include <nnc/ccv_nnc.h>
   7|      |#include <nnc/ccv_nnc_easy.h>
   8|      |#include <3rdparty/dsfmt/dSFMT.h>
   9|      |
  10|      |TEST_SETUP()
  11|      |{
  12|      |  ccv_nnc_init();
  13|      |}
  14|      |
  15|      |static ccv_cnnp_model_t* _dawn_layer_new(const int filters, const int strides, const int residual)
  16|     6|{
  17|     6|  ccv_cnnp_model_io_t input = ccv_cnnp_input();
  18|     6|  ccv_cnnp_model_t* conv = ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), (ccv_cnnp_param_t){
  19|     6|    .norm = CCV_CNNP_BATCH_NORM,
  20|     6|    .activation = CCV_CNNP_ACTIVATION_RELU,
  21|     6|    .hint = HINT((1, 1), (1, 1)),
  22|     6|  });
  23|     6|  ccv_cnnp_model_io_t output = ccv_cnnp_model_apply(conv, MODEL_IO_LIST(input));
  24|     6|  ccv_cnnp_model_t* pool = ccv_cnnp_max_pool(DIM_ALLOC(strides, strides), (ccv_cnnp_param_t){
  25|     6|    .hint = HINT((strides, strides), (0, 0)),
  26|     6|  });
  27|     6|  output = ccv_cnnp_model_apply(pool, MODEL_IO_LIST(output));
  28|     6|  if (residual)
  29|     4|  {
  30|     4|    ccv_cnnp_model_io_t shortcut = output;
  31|     4|    ccv_cnnp_model_t* res1 = ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), (ccv_cnnp_param_t){
  32|     4|      .norm = CCV_CNNP_BATCH_NORM,
  33|     4|      .activation = CCV_CNNP_ACTIVATION_RELU,
  34|     4|      .hint = HINT((1, 1), (1, 1)),
  35|     4|    });
  36|     4|    output = ccv_cnnp_model_apply(res1, MODEL_IO_LIST(output));
  37|     4|    ccv_cnnp_model_t* res2 = ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), (ccv_cnnp_param_t){
  38|     4|      .norm = CCV_CNNP_BATCH_NORM,
  39|     4|      .activation = CCV_CNNP_ACTIVATION_RELU,
  40|     4|      .hint = HINT((1, 1), (1, 1)),
  41|     4|    });
  42|     4|    output = ccv_cnnp_model_apply(res2, MODEL_IO_LIST(output));
  43|     4|    ccv_cnnp_model_t* const add = ccv_cnnp_add();
  44|     4|    output = ccv_cnnp_model_apply(add, MODEL_IO_LIST(output, shortcut));
  45|     4|  }
  46|     6|  return ccv_cnnp_model_new(MODEL_IO_LIST(input), MODEL_IO_LIST(output));
  47|     6|}
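
Each _dawn_layer_new call above wires a fixed pattern: a 3x3 convolution with batch norm and ReLU, a stride-strides max pool, and, when residual is nonzero, two more conv+BN+ReLU stages whose output is added back onto the pooled shortcut. A sketch of the dataflow (the bracketed part only when residual is set):

  input -> conv3x3(filters)+BN+ReLU -> maxpool(strides)
        [ -> conv3x3+BN+ReLU -> conv3x3+BN+ReLU -> add(shortcut) ] -> output

The counts agree with how it is used below: the function body runs 6 times (three layers per model, built once in each of the two test cases) and the residual branch 4 times (layer1 and layer3 in each build).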
  48|      |
  49|      |static ccv_cnnp_model_t* _cifar_10_dawn(void)
  50|     2|{
  51|     2|  ccv_cnnp_model_t* prep = ccv_cnnp_convolution(1, 64, DIM_ALLOC(3, 3), (ccv_cnnp_param_t){
  52|     2|    .norm = CCV_CNNP_BATCH_NORM,
  53|     2|    .activation = CCV_CNNP_ACTIVATION_RELU,
  54|     2|    .hint = HINT((1, 1), (1, 1)),
  55|     2|  });
  56|     2|  ccv_cnnp_model_t* layer1 = _dawn_layer_new(128, 2, 1);
  57|     2|  ccv_cnnp_model_t* layer2 = _dawn_layer_new(256, 2, 0);
  58|     2|  ccv_cnnp_model_t* layer3 = _dawn_layer_new(512, 2, 1);
  59|     2|  return ccv_cnnp_sequential_new(MODEL_LIST(
  60|     2|    prep,
  61|     2|    layer1,
  62|     2|    layer2,
  63|     2|    layer3,
  64|     2|    ccv_cnnp_max_pool(DIM_ALLOC(0, 0), (ccv_cnnp_param_t){}),
  65|     2|    ccv_cnnp_flatten(),
  66|     2|    ccv_cnnp_dense(10, (ccv_cnnp_param_t){
  67|     2|      .activation = CCV_CNNP_ACTIVATION_SOFTMAX,
  68|     2|    })));
  69|     2|}
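
_cifar_10_dawn is the "dawnnet" the test names below refer to. Working through the shapes for a 32x32 input: the prep convolution keeps 32x32 at 64 channels, then the three stride-2 max pools take the spatial size to 16, 8 and 4 while channels grow to 128, 256 and 512; assuming DIM_ALLOC(0, 0) asks ccv_cnnp_max_pool for a global pool over the remaining 4x4 grid, ccv_cnnp_flatten then yields a 512-vector and the dense layer maps it to 10 softmax class scores.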
  70|      |
  71|      |static ccv_nnc_cmd_t _no_wd(const ccv_cnnp_model_t* const model, const ccv_cnnp_trainable_index_t* const trainable_indexes, const int trainable_index_size, const void* const context)
  72| 89.1k|{
  73| 89.1k|  int i;
  74| 89.1k|  ccv_nnc_cmd_t cmd = *(const ccv_nnc_cmd_t*)context;
  75|  322k|  for (i = 0; i < trainable_index_size; i++)
  76|  233k|  {
  77|  233k|    if (trainable_indexes[i].cmd.cmd == CCV_NNC_BATCH_NORM_FORWARD &&
  78|  233k|      (trainable_indexes[i].index == 1 || trainable_indexes[i].index == 2)) // If it is scale / bias of batch norm, remove weight decay.
  79| 54.8k|      cmd.info.minimize.decay = 0;
  80|  233k|    if (trainable_indexes[i].cmd.cmd == CCV_NNC_GEMM_FORWARD &&
  81|  233k|      trainable_indexes[i].index == 2) // bias in gemm.
  82| 3.43k|      cmd.info.minimize.decay = 0;
  83|  233k|    if (trainable_indexes[i].cmd.cmd == CCV_NNC_CONVOLUTION_FORWARD &&
  84|  233k|      trainable_indexes[i].index == 2) // bias in convolution.
  85|     0|      cmd.info.minimize.decay = 0;
  86|  233k|  }
  87| 89.1k|  return cmd;
  88| 89.1k|}
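
_no_wd is the minimizer hook both training functions below install each iteration: it returns the SGD command with weight decay zeroed for batch-norm scales and biases and for bias vectors, while convolution and GEMM weights keep the 0.01 decay. The counts are consistent with one invocation per trainable tensor per ccv_cnnp_model_set_minimizer call: the two training runs together call set_minimizer 2 x 1,715 = 3,430 times, and 89.1k / 3,430 is roughly 26 trainables, matching 8 convolutions x (weight + BN scale + BN bias) plus the dense layer's weight and bias. Accordingly the batch-norm branch fires 16 x 3,430 = 54.8k times and the GEMM-bias branch 3,430 (3.43k) times, while the zero count on line 85 indicates no convolution here carries a bias trainable, as expected when batch norm follows.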
  89|      |
  90|      |static int train_cifar_10(ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
  91|     1|{
  92|     1|  ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn();
  93|     1|  const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
  94|     1|  if (device_count < 1)
  95|     0|    return -1;
  96|     1|  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 32F, batch_size, 3, 32, 32);
  97|     1|  float learn_rate = 0.001;
  98|     1|  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD());
  99|     1|  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
 100|     1|  int i, j, k;
 101|     1|  ccv_nnc_tensor_t* cpu_outputs[device_count];
 102|     5|  for (i = 0; i < device_count; i++)
 103|     4|  {
 104|     4|    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
 105|     4|    ccv_nnc_tensor_pin_memory(cpu_outputs[i]);
 106|     4|  }
 107|     1|  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
 108|     1|  const ccv_cnnp_random_jitter_t random_jitter = {
 109|     1|    .resize = {
 110|     1|      .min = 32,
 111|     1|      .max = 32,
 112|     1|    },
 113|     1|    .size = {
 114|     1|      .rows = 32,
 115|     1|      .cols = 32,
 116|     1|    },
 117|     1|    .symmetric = 1,
 118|     1|    .normalize = {
 119|     1|      .mean = {
 120|     1|        mean[0], mean[1], mean[2],
 121|     1|      },
 122|     1|    },
 123|     1|    .offset = {
 124|     1|      .x = 4,
 125|     1|      .y = 4,
 126|     1|    },
 127|     1|    .seed = 1,
 128|     1|  };
 129|     1|  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix));
 130|     1|  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter);
 131|     1|  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_32F, CCV_TENSOR_FORMAT_NCHW);
 132|     1|  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_batching_new(raw_train_data, COLUMN_ID_LIST(jitter_images, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
 133|     1|  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
 134|     1|  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix));
 135|     1|  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_batching_new(raw_test_data, COLUMN_ID_LIST(test_images), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
 136|     1|  int train_device_columns[device_count * 2];
 137|     1|  int test_device_columns[device_count * 2];
 138|     5|  for (i = 0; i < device_count; i++)
 139|     4|  {
 140|     4|    int stream_type = CCV_STREAM_CONTEXT_GPU;
 141|     4|    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
 142|     4|    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i);
 143|     4|    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 32F, batch_size, 10);
 144|     4|    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
 145|     4|    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params);
 146|     4|    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i);
 147|     4|    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params);
 148|     4|  }
 149|     1|  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
 150|     1|  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
 151|     1|  ccv_nnc_stream_context_t* stream_contexts[2];
 152|     1|  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
 153|     1|  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
 154|     1|  int p = 0, q = 1;
 155|     1|  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
 156|     1|  int correct = 0;
 157|     1|  int epoch = 0;
 158|     1|  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
 159|     1|  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
 160|     1|  ccv_nnc_tensor_t** input_fits[device_count * 2];
 161|     1|  ccv_nnc_tensor_t* input_fit_inputs[device_count];
 162|     1|  ccv_nnc_tensor_t* input_fit_fits[device_count];
 163|     1|  ccv_nnc_tensor_t* outputs[device_count];
 164| 1.71k|  for (i = 0; epoch < 35; i++)
 165| 1.71k|  {
 166| 1.71k|    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
 167| 1.71k|    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
 168| 1.71k|    learn_rate = ccv_max(learn_rate, 0.000001);
 169| 1.71k|    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
 170| 1.71k|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, _no_wd, &sgd);
 171| 1.71k|    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
 172| 1.71k|    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
 173| 8.57k|    for (j = 0; j < device_count; j++)
 174| 6.86k|    {
 175| 6.86k|      input_fit_inputs[j] = input_fits[j][0];
 176| 6.86k|      input_fit_fits[j] = input_fits[j][1];
 177| 6.86k|      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
 178| 6.86k|    }
 179| 1.71k|    ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, stream_contexts[p]);
 180| 1.71k|    // Prefetch the next round.
 181| 1.71k|    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
 182| 1.71k|    if ((i + 1) % epoch_end == 0)
 183|    35|    {
 184|    35|      ++epoch;
 185|    35|      // Reshuffle and reset cursor.
 186|    35|      ccv_cnnp_dataframe_shuffle(raw_train_data);
 187|    35|      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
 188|    35|    }
 189| 1.71k|    int t;
 190| 1.71k|    CCV_SWAP(p, q, t);
 191| 1.71k|  }
 192|     1|  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
 193|     1|  ccv_nnc_stream_context_wait(stream_contexts[p]);
 194|     1|  ccv_nnc_stream_context_wait(stream_contexts[q]);
 195|     1|  correct = 0;
 196|     1|  p = 0, q = 1;
 197|    11|  for (j = 0; j < test_set->rnum; j += batch_size * device_count)
 198|    10|  {
 199|    10|    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
 200|    50|    for (k = 0; k < device_count; k++)
 201|    40|    {
 202|    40|      input_fit_inputs[k] = input_fits[k][0];
 203|    40|      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
 204|    40|    }
 205|    10|    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
 206|    10|      .is_test = 1
 207|    10|    }, input_fit_inputs, device_count, outputs, device_count, 0);
 208|    10|    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs, device_count, 0);
 209| 10.0k|    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
 210| 10.0k|    {
 211| 10.0k|      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
 212| 10.0k|      const int d = k / batch_size;
 213| 10.0k|      const int b = k % batch_size;
 214| 10.0k|      float max = -FLT_MAX;
 215| 10.0k|      int t = -1;
 216| 10.0k|      int fi;
 217|  110k|      for (fi = 0; fi < 10; fi++)
 218|  100k|        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
 219| 29.7k|          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
 220| 10.0k|      if (categorized->c == t)
 221| 9.14k|        ++correct;
 222| 10.0k|    }
 223|    10|  }
 224|     1|  ccv_cnnp_dataframe_iter_free(iter);
 225|     1|  ccv_cnnp_dataframe_free(batch_train_data);
 226|     1|  ccv_cnnp_dataframe_free(raw_train_data);
 227|     1|  ccv_cnnp_dataframe_iter_free(test_iter);
 228|     1|  ccv_cnnp_dataframe_free(batch_test_data);
 229|     1|  ccv_cnnp_dataframe_free(raw_test_data);
 230|     1|  ccv_cnnp_model_free(cifar_10);
 231|     1|  ccv_nnc_stream_context_free(stream_contexts[0]);
 232|     1|  ccv_nnc_stream_context_free(stream_contexts[1]);
 233|     5|  for (i = 0; i < device_count; i++)
 234|     4|    ccv_nnc_tensor_free(cpu_outputs[i]);
 235|     1|  return correct;
 236|     1|}
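
The loop counts in train_cifar_10 follow directly from the setup. With 50,000 training images, batch_size 256 and 4 GPUs, epoch_end = ceil(50000 / 1024) = 49, so 35 epochs take 35 x 49 = 1,715 iterations (the 1.71k above). The report's region counts split line 167 into 489 warm-up and 1.22k decay executions: the schedule ramps up while i + 1 < 490, reaches the 0.4 peak at i + 1 = 490, then decays over the remaining 1,226 iterations toward the 0.000001 floor; for example, at i + 1 = 245 it gives 0.4 x 245 / 490 = 0.2. On the test side, 10,000 images at 1,024 per step make the 10 evaluation steps of line 197, and the 9.14k count on ++correct means roughly 91.4% accuracy, which is what the REQUIRE in the caller below checks against 9,000.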
 237|      |
 238|      |TEST_CASE("cifar-10 with dawnnet to > 90% under 3 minutes")
 239|     1|{
 240|     1|  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
 241|     1|      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
 242|     1|  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
 243|     1|  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
 244|     1|  if (!train || !test)
 245|     0|  {
 246|     0|    if (train)
 247|     0|      fclose(train);
 248|     0|    if (test)
 249|     0|      fclose(test);
 250|     0|    GUARD_ELSE_RETURN(0);
 251|     0|  }
 252|     1|  int i, j, k;
 253|     1|  unsigned char bytes[32 * 32 + 1];
 254|     1|  double mean[3] = {};
 255|     1|  const int train_count = 50000;
 256|     1|  const int test_count = 10000;
 257|     1|  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
 258| 50.0k|  for (k = 0; k < train_count; k++)
 259| 50.0k|  {
 260| 50.0k|    fread(bytes, 32 * 32 + 1, 1, train);
 261| 50.0k|    double per_mean[3] = {};
 262| 50.0k|    int c = bytes[0];
 263| 50.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
 264| 1.65M|    for (i = 0; i < 32; i++)
 265| 52.8M|      for (j = 0; j < 32; j++)
 266| 51.2M|        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
 267| 50.0k|    fread(bytes, 32 * 32, 1, train);
 268| 1.65M|    for (i = 0; i < 32; i++)
 269| 52.8M|      for (j = 0; j < 32; j++)
 270| 51.2M|        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
 271| 50.0k|    fread(bytes, 32 * 32, 1, train);
 272| 1.65M|    for (i = 0; i < 32; i++)
 273| 52.8M|      for (j = 0; j < 32; j++)
 274| 51.2M|        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
 275| 50.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
 276| 50.0k|    ccv_array_push(categorizeds, &categorized);
 277| 50.0k|    mean[0] += per_mean[0] / (32 * 32);
 278| 50.0k|    mean[1] += per_mean[1] / (32 * 32);
 279| 50.0k|    mean[2] += per_mean[2] / (32 * 32);
 280| 50.0k|  }
 281|     1|  float meanf[3];
 282|     1|  meanf[0] = mean[0] / train_count;
 283|     1|  meanf[1] = mean[1] / train_count;
 284|     1|  meanf[2] = mean[2] / train_count;
 285|     1|  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
 286| 10.0k|  for (k = 0; k < test_count; k++)
 287| 10.0k|  {
 288| 10.0k|    fread(bytes, 32 * 32 + 1, 1, test);
 289| 10.0k|    int c = bytes[0];
 290| 10.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
 291|  330k|    for (i = 0; i < 32; i++)
 292| 10.5M|      for (j = 0; j < 32; j++)
 293| 10.2M|        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
 294| 10.0k|    fread(bytes, 32 * 32, 1, test);
 295|  330k|    for (i = 0; i < 32; i++)
 296| 10.5M|      for (j = 0; j < 32; j++)
 297| 10.2M|        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
 298| 10.0k|    fread(bytes, 32 * 32, 1, test);
 299|  330k|    for (i = 0; i < 32; i++)
 300| 10.5M|      for (j = 0; j < 32; j++)
 301| 10.2M|        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
 302| 10.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
 303| 10.0k|    ccv_array_push(tests, &categorized);
 304| 10.0k|  }
 305|     1|  int correct = train_cifar_10(categorizeds, 256, meanf, tests);
 306|     1|  fclose(train);
 307|     1|  fclose(test);
 308|     1|  REQUIRE(correct > 9000, "accuracy %.2f after 35 epoch should be higher than 90%%", (float)correct / 10000);
 309|     1|}
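
The loader in this test leans on the standard CIFAR-10 binary layout: each record is one label byte followed by 1,024 red, 1,024 green and 1,024 blue plane bytes, which is why it reads 32 * 32 + 1 bytes first (label plus red plane) and 32 * 32 bytes for each remaining plane, rescaling every pixel to [0, 2] (x * 2. / 255.). A minimal self-contained sketch of the same record walk (cifar_record_t and read_cifar_record are illustrative names, not part of this test, which also ignores the fread return values this sketch checks):

#include <stdio.h>

typedef struct {
  int label; /* 0..9 class id, first byte of the record */
  float rgb[32 * 32 * 3]; /* interleaved RGB, as in the ccv_dense_matrix_t fill above */
} cifar_record_t;

/* One CIFAR-10 binary record: 1 label byte + 1024 R + 1024 G + 1024 B plane bytes. */
static int read_cifar_record(FILE* f, cifar_record_t* r)
{
  unsigned char bytes[32 * 32 + 1];
  int i;
  if (fread(bytes, 32 * 32 + 1, 1, f) != 1) /* label + red plane */
    return -1;
  r->label = bytes[0];
  for (i = 0; i < 32 * 32; i++)
    r->rgb[i * 3] = bytes[i + 1] * 2. / 255.;
  if (fread(bytes, 32 * 32, 1, f) != 1) /* green plane */
    return -1;
  for (i = 0; i < 32 * 32; i++)
    r->rgb[i * 3 + 1] = bytes[i] * 2. / 255.;
  if (fread(bytes, 32 * 32, 1, f) != 1) /* blue plane */
    return -1;
  for (i = 0; i < 32 * 32; i++)
    r->rgb[i * 3 + 2] = bytes[i] * 2. / 255.;
  return 0;
}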
 310|      |
 311|      |static int train_cifar_10_fp16(ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
 312|     1|{
 313|     1|  ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn();
 314|     1|  const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 315|     1|  if (device_count < 1)
 316|     0|    return -1;
 317|     1|  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32);
 318|     1|  float learn_rate = 0.001;
 319|     1|  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD());
 320|     1|  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
 321|     1|  int i, j, k;
 322|     1|  ccv_nnc_tensor_t* cpu_outputs[device_count];
 323|     1|  ccv_nnc_tensor_t* cpu_outputs_16f[device_count];
 324|     5|  for (i = 0; i < device_count; i++)
 325|     4|  {
 326|     4|    cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0);
 327|     4|    ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]);
 328|     4|    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
 329|     4|  }
 330|     1|  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
 331|     1|  const ccv_cnnp_random_jitter_t random_jitter = {
 332|     1|    .resize = {
 333|     1|      .min = 32,
 334|     1|      .max = 32,
 335|     1|    },
 336|     1|    .size = {
 337|     1|      .rows = 32,
 338|     1|      .cols = 32,
 339|     1|    },
 340|     1|    .symmetric = 1,
 341|     1|    .normalize = {
 342|     1|      .mean = {
 343|     1|        mean[0], mean[1], mean[2],
 344|     1|      },
 345|     1|    },
 346|     1|    .offset = {
 347|     1|      .x = 4,
 348|     1|      .y = 4,
 349|     1|    },
 350|     1|    .seed = 1,
 351|     1|  };
 352|     1|  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix));
 353|     1|  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter);
 354|     1|  ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3);
 355|     1|  const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images));
 356|     1|  const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0);
 357|     1|  const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0);
 358|     1|  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW);
 359|     1|  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_batching_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
 360|     1|  ccv_cnnp_dataframe_shuffle(raw_train_data);
 361|     1|  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
 362|     1|  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix));
 363|     1|  const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images));
 364|     1|  const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0);
 365|     1|  const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0);
 366|     1|  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_batching_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
 367|     1|  int train_device_columns[device_count * 2];
 368|     1|  int test_device_columns[device_count * 2];
 369|     5|  for (i = 0; i < device_count; i++)
 370|     4|  {
 371|     4|    int stream_type = CCV_STREAM_CONTEXT_GPU;
 372|     4|    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
 373|     4|    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i);
 374|     4|    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10);
 375|     4|    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
 376|     4|    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params);
 377|     4|    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i);
 378|     4|    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params);
 379|     4|  }
 380|     1|  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
 381|     1|  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
 382|     1|  ccv_nnc_stream_context_t* stream_contexts[2];
 383|     1|  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
 384|     1|  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
 385|     1|  int p = 0, q = 1;
 386|     1|  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
 387|     1|  int correct = 0;
 388|     1|  int epoch = 0;
 389|     1|  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
 390|     1|  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
 391|     1|  ccv_nnc_tensor_t** input_fits[device_count * 2];
 392|     1|  ccv_nnc_tensor_t* input_fit_inputs[device_count];
 393|     1|  ccv_nnc_tensor_t* input_fit_fits[device_count];
 394|     1|  ccv_nnc_tensor_t* outputs[device_count];
 395| 1.71k|  for (i = 0; epoch < 35; i++)
 396| 1.71k|  {
 397| 1.71k|    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
 398| 1.71k|    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
 399| 1.71k|    learn_rate = ccv_max(learn_rate, 0.000001);
 400| 1.71k|    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
 401| 1.71k|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, _no_wd, &sgd);
 402| 1.71k|    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
 403| 1.71k|    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
 404| 8.57k|    for (j = 0; j < device_count; j++)
 405| 6.86k|    {
 406| 6.86k|      input_fit_inputs[j] = input_fits[j][0];
 407| 6.86k|      input_fit_fits[j] = input_fits[j][1];
 408| 6.86k|      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
 409| 6.86k|    }
 410| 1.71k|    ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, stream_contexts[p]);
 411| 1.71k|    // Prefetch the next round.
 412| 1.71k|    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
 413| 1.71k|    if ((i + 1) % epoch_end == 0)
 414|    35|    {
 415|    35|      ++epoch;
 416|    35|      // Reshuffle and reset cursor.
 417|    35|      ccv_cnnp_dataframe_shuffle(raw_train_data);
 418|    35|      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
 419|    35|    }
 420| 1.71k|    int t;
 421| 1.71k|    CCV_SWAP(p, q, t);
 422| 1.71k|  }
 423|     1|  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
 424|     1|  ccv_nnc_stream_context_wait(stream_contexts[p]);
 425|     1|  ccv_nnc_stream_context_wait(stream_contexts[q]);
 426|     1|  correct = 0;
 427|     1|  p = 0, q = 1;
 428|    11|  for (j = 0; j < test_set->rnum; j += batch_size * device_count)
 429|    10|  {
 430|    10|    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
 431|    50|    for (k = 0; k < device_count; k++)
 432|    40|    {
 433|    40|      input_fit_inputs[k] = input_fits[k][0];
 434|    40|      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
 435|    40|    }
 436|    10|    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
 437|    10|      .is_test = 1
 438|    10|    }, input_fit_inputs, device_count, outputs, device_count, 0);
 439|    10|    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0);
 440|    10|    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0);
 441| 10.0k|    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
 442| 10.0k|    {
 443| 10.0k|      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
 444| 10.0k|      const int d = k / batch_size;
 445| 10.0k|      const int b = k % batch_size;
 446| 10.0k|      float max = -FLT_MAX;
 447| 10.0k|      int t = -1;
 448| 10.0k|      int fi;
 449|  110k|      for (fi = 0; fi < 10; fi++)
 450|  100k|        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
 451| 28.5k|          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
 452| 10.0k|      if (categorized->c == t)
 453| 9.12k|        ++correct;
 454| 10.0k|    }
 455|    10|  }
 456|     1|  ccv_cnnp_dataframe_iter_free(iter);
 457|     1|  ccv_cnnp_dataframe_free(batch_train_data);
 458|     1|  ccv_cnnp_dataframe_free(raw_train_data);
 459|     1|  ccv_cnnp_dataframe_iter_free(test_iter);
 460|     1|  ccv_cnnp_dataframe_free(batch_test_data);
 461|     1|  ccv_cnnp_dataframe_free(raw_test_data);
 462|     1|  ccv_cnnp_model_free(cifar_10);
 463|     1|  ccv_nnc_stream_context_free(stream_contexts[0]);
 464|     1|  ccv_nnc_stream_context_free(stream_contexts[1]);
 465|     5|  for (i = 0; i < device_count; i++)
 466|     4|  {
 467|     4|    ccv_nnc_tensor_free(cpu_outputs[i]);
 468|     4|    ccv_nnc_tensor_free(cpu_outputs_16f[i]);
 469|     4|  }
 470|     1|  return correct;
 471|     1|}
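
train_cifar_10_fp16 mirrors train_cifar_10 with the data path moved to half precision: the compiled input is 16F, the jittered fp32 images are narrowed on the dataframe through the make_tuple / cmd_exec / extract_tuple hop at lines 355-357, the one-hot labels are emitted as CCV_16F, and at evaluation time the 16F device outputs are copied to the pinned 16F host tensors and then widened back to 32F before the argmax (lines 439-440), ending at 9.12k / 10.0k, roughly 91.2% accuracy. A minimal sketch of that final widening step on a single host tensor, reusing the call shape of line 440 (the tensor size here is illustrative):

  ccv_nnc_tensor_t* out_16f = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 256, 10), 0);
  ccv_nnc_tensor_t* out_32f = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 256, 10), 0);
  /* ... fill out_16f, e.g. with CMD_DATA_TRANSFER_FORWARD() from a device tensor ... */
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, &out_16f, 1, &out_32f, 1, 0);
  /* out_32f->data.f32 now holds the widened scores for an argmax like the one above. */
  ccv_nnc_tensor_free(out_32f);
  ccv_nnc_tensor_free(out_16f);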
 472|      |
 473|      |TEST_CASE("cifar-10 with dawnnet to > 90% under 1 minutes (fp16)")
 474|     1|{
 475|     1|  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
 476|     1|      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
 477|     1|  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
 478|     1|  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
 479|     1|  if (!train || !test)
 480|     0|  {
 481|     0|    if (train)
 482|     0|      fclose(train);
 483|     0|    if (test)
 484|     0|      fclose(test);
 485|     0|    GUARD_ELSE_RETURN(0);
 486|     0|  }
 487|     1|  int i, j, k;
 488|     1|  unsigned char bytes[32 * 32 + 1];
 489|     1|  double mean[3] = {};
 490|     1|  const int train_count = 50000;
 491|     1|  const int test_count = 10000;
 492|     1|  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
 493| 50.0k|  for (k = 0; k < train_count; k++)
 494| 50.0k|  {
 495| 50.0k|    fread(bytes, 32 * 32 + 1, 1, train);
 496| 50.0k|    double per_mean[3] = {};
 497| 50.0k|    int c = bytes[0];
 498| 50.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
 499| 1.65M|    for (i = 0; i < 32; i++)
 500| 52.8M|      for (j = 0; j < 32; j++)
 501| 51.2M|        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
 502| 50.0k|    fread(bytes, 32 * 32, 1, train);
 503| 1.65M|    for (i = 0; i < 32; i++)
 504| 52.8M|      for (j = 0; j < 32; j++)
 505| 51.2M|        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
 506| 50.0k|    fread(bytes, 32 * 32, 1, train);
 507| 1.65M|    for (i = 0; i < 32; i++)
 508| 52.8M|      for (j = 0; j < 32; j++)
 509| 51.2M|        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
 510| 50.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
 511| 50.0k|    ccv_array_push(categorizeds, &categorized);
 512| 50.0k|    mean[0] += per_mean[0] / (32 * 32);
 513| 50.0k|    mean[1] += per_mean[1] / (32 * 32);
 514| 50.0k|    mean[2] += per_mean[2] / (32 * 32);
 515| 50.0k|  }
 516|     1|  float meanf[3];
 517|     1|  meanf[0] = mean[0] / train_count;
 518|     1|  meanf[1] = mean[1] / train_count;
 519|     1|  meanf[2] = mean[2] / train_count;
 520|     1|  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
 521| 10.0k|  for (k = 0; k < test_count; k++)
 522| 10.0k|  {
 523| 10.0k|    fread(bytes, 32 * 32 + 1, 1, test);
 524| 10.0k|    int c = bytes[0];
 525| 10.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
 526|  330k|    for (i = 0; i < 32; i++)
 527| 10.5M|      for (j = 0; j < 32; j++)
 528| 10.2M|        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
 529| 10.0k|    fread(bytes, 32 * 32, 1, test);
 530|  330k|    for (i = 0; i < 32; i++)
 531| 10.5M|      for (j = 0; j < 32; j++)
 532| 10.2M|        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
 533| 10.0k|    fread(bytes, 32 * 32, 1, test);
 534|  330k|    for (i = 0; i < 32; i++)
 535| 10.5M|      for (j = 0; j < 32; j++)
 536| 10.2M|        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
 537| 10.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
 538| 10.0k|    ccv_array_push(tests, &categorized);
 539| 10.0k|  }
 540|     1|  int correct = train_cifar_10_fp16(categorizeds, 256, meanf, tests);
 541|     1|  fclose(train);
 542|     1|  fclose(test);
 543|     1|  REQUIRE(correct > 9000, "accuracy %.2f after 35 epoch should be higher than 90%%", (float)correct / 10000);
 544|     1|}
 545|      |
 546|      |#include "case_main.h"