Coverage Report

Created: 2021-04-12 03:25

/home/liu/buildslave/linux-x64-runtests/build/test/int/nnc/cifar.tests.c
Line
Count
Source (jump to first uncovered line)
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <ccv_internal.h>
6
#include <nnc/ccv_nnc.h>
7
#include <nnc/ccv_nnc_easy.h>
8
#include <3rdparty/dsfmt/dSFMT.h>
9
10
TEST_SETUP()
11
{
12
  ccv_nnc_init();
13
}
14
15
static ccv_cnnp_model_t* _dawn_layer_new(const int filters, const int strides, const int residual)
16
9
{
17
9
  ccv_cnnp_model_io_t input = ccv_cnnp_input();
18
9
  ccv_cnnp_model_t* conv = ccv_cnnp_sequential_new(MODEL_LIST(
19
9
    ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), 0, HINT((1, 1), (1, 1)), 0),
20
9
    ccv_cnnp_batch_norm(0.9, 1e-4, 0),
21
9
    ccv_cnnp_relu(0)
22
9
  ), 0);
23
9
  ccv_cnnp_model_io_t output = ccv_cnnp_model_apply(conv, MODEL_IO_LIST(input));
24
9
  ccv_cnnp_model_t* pool = ccv_cnnp_max_pool(DIM_ALLOC(strides, strides), HINT((strides, strides), (0, 0)), 0);
25
9
  output = ccv_cnnp_model_apply(pool, MODEL_IO_LIST(output));
26
9
  if (residual)
27
6
  {
28
6
    ccv_cnnp_model_io_t shortcut = output;
29
6
    ccv_cnnp_model_t* res1 = ccv_cnnp_sequential_new(MODEL_LIST(
30
6
      ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), 0, HINT((1, 1), (1, 1)), 0),
31
6
      ccv_cnnp_batch_norm(0.9, 1e-4, 0),
32
6
      ccv_cnnp_relu(0)
33
6
    ), 0);
34
6
    output = ccv_cnnp_model_apply(res1, MODEL_IO_LIST(output));
35
6
    ccv_cnnp_model_t* res2 = ccv_cnnp_sequential_new(MODEL_LIST(
36
6
      ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), 0, HINT((1, 1), (1, 1)), 0),
37
6
      ccv_cnnp_batch_norm(0.9, 1e-4, 0),
38
6
      ccv_cnnp_relu(0)
39
6
    ), 0);
40
6
    output = ccv_cnnp_model_apply(res2, MODEL_IO_LIST(output));
41
6
    ccv_cnnp_model_t* const add = ccv_cnnp_sum(0);
42
6
    output = ccv_cnnp_model_apply(add, MODEL_IO_LIST(output, shortcut));
43
6
  }
44
9
  return ccv_cnnp_model_new(MODEL_IO_LIST(input), MODEL_IO_LIST(output), 0);
45
9
}
46
47
static ccv_cnnp_model_t* _cifar_10_dawn(const int softmax)
48
3
{
49
3
  ccv_cnnp_model_t* prep = ccv_cnnp_sequential_new(MODEL_LIST(
50
3
    ccv_cnnp_convolution(1, 64, DIM_ALLOC(3, 3), 0, HINT((1, 1), (1, 1)), 0),
51
3
    ccv_cnnp_batch_norm(0.9, 1e-4, 0),
52
3
    ccv_cnnp_relu(0)
53
3
  ), 0);
54
3
  ccv_cnnp_model_t* layer1 = _dawn_layer_new(128, 2, 1);
55
3
  ccv_cnnp_model_t* layer2 = _dawn_layer_new(256, 2, 0);
56
3
  ccv_cnnp_model_t* layer3 = _dawn_layer_new(512, 2, 1);
57
3
  if (softmax)
58
2
  {
59
2
    return ccv_cnnp_sequential_new(MODEL_LIST(
60
2
      prep,
61
2
      layer1,
62
2
      layer2,
63
2
      layer3,
64
2
      ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0),
65
2
      ccv_cnnp_flatten(0),
66
2
      ccv_cnnp_dense(10, 0, 0),
67
2
      ccv_cnnp_softmax(0)
68
2
    ), 0);
69
2
  } else {
70
1
    return ccv_cnnp_sequential_new(MODEL_LIST(
71
1
      prep,
72
1
      layer1,
73
1
      layer2,
74
1
      layer3,
75
1
      ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0),
76
1
      ccv_cnnp_flatten(0),
77
1
      ccv_cnnp_dense(10, 0, 0)
78
1
    ), 0);
79
1
  }
80
3
}
81
82
static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
83
1
{
84
1
  ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1);
85
1
  const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
86
1
  if (device_count < 1)
87
0
    return -1;
88
1
  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 32F, batch_size, 3, 32, 32);
89
1
  float learn_rate = 0.001;
90
1
  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD());
91
1
  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
92
1
  int i, j, k;
93
1
  ccv_nnc_tensor_t* cpu_outputs[device_count];
94
5
  for (i = 0; i < device_count; 
i++4
)
95
4
  {
96
4
    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
97
4
    ccv_nnc_tensor_pin_memory(cpu_outputs[i]);
98
4
  }
99
1
  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
100
1
  const ccv_cnnp_random_jitter_t random_jitter = {
101
1
    .resize = {
102
1
      .min = 32,
103
1
      .max = 32,
104
1
    },
105
1
    .size = {
106
1
      .rows = 32,
107
1
      .cols = 32,
108
1
    },
109
1
    .symmetric = 1,
110
1
    .normalize = {
111
1
      .mean = {
112
1
        mean[0], mean[1], mean[2],
113
1
      },
114
1
    },
115
1
    .offset = {
116
1
      .x = 4,
117
1
      .y = 4,
118
1
    },
119
1
    .seed = 1,
120
1
  };
121
1
  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0);
122
1
  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0);
123
1
  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_32F, CCV_TENSOR_FORMAT_NCHW, 0);
124
1
  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
125
1
  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
126
1
  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0);
127
1
  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
128
1
  int train_device_columns[device_count * 2];
129
1
  int test_device_columns[device_count * 2];
130
5
  for (i = 0; i < device_count; 
i++4
)
131
4
  {
132
4
    int stream_type = CCV_STREAM_CONTEXT_GPU;
133
4
    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
134
4
    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0);
135
4
    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 32F, batch_size, 10);
136
4
    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
137
4
    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0);
138
4
    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0);
139
4
    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0);
140
4
  }
141
1
  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
142
1
  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
143
1
  ccv_nnc_stream_context_t* stream_contexts[2];
144
1
  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
145
1
  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
146
1
  int p = 0, q = 1;
147
1
  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
148
1
  int correct = 0;
149
1
  int epoch = 0;
150
1
  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
151
1
  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
152
1
  ccv_nnc_tensor_t** input_fits[device_count * 2];
153
1
  ccv_nnc_tensor_t* input_fit_inputs[device_count];
154
1
  ccv_nnc_tensor_t* input_fit_fits[device_count];
155
1
  ccv_nnc_tensor_t* outputs[device_count];
156
50
  for (i = 0; epoch < epoch_limit; 
i++49
)
157
49
  {
158
49
    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
159
49
    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 
0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end)0
;
160
49
    learn_rate = ccv_max(learn_rate, 0.000001);
161
49
    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
162
49
    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0);
163
49
    sgd.info.sgd.decay = 0;
164
49
    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0
165
49
    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
166
49
    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
167
245
    for (j = 0; j < device_count; 
j++196
)
168
196
    {
169
196
      input_fit_inputs[j] = input_fits[j][0];
170
196
      input_fit_fits[j] = input_fits[j][1];
171
196
      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
172
196
    }
173
49
    ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]);
174
49
    // Prefetch the next round.
175
49
    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
176
49
    if ((i + 1) % epoch_end == 0)
177
1
    {
178
1
      ++epoch;
179
1
      // Reshuffle and reset cursor.
180
1
      ccv_cnnp_dataframe_shuffle(raw_train_data);
181
1
      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
182
1
    }
183
49
    int t;
184
49
    CCV_SWAP(p, q, t);
185
49
  }
186
1
  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
187
1
  ccv_nnc_stream_context_wait(stream_contexts[p]);
188
1
  ccv_nnc_stream_context_wait(stream_contexts[q]);
189
1
  correct = 0;
190
1
  p = 0, q = 1;
191
11
  for (j = 0; j < test_set->rnum; 
j += batch_size * device_count10
)
192
10
  {
193
10
    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
194
50
    for (k = 0; k < device_count; 
k++40
)
195
40
    {
196
40
      input_fit_inputs[k] = input_fits[k][0];
197
40
      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
198
40
    }
199
10
    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
200
10
      .is_test = 1
201
10
    }, input_fit_inputs, device_count, outputs, device_count, 0, 0);
202
10
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs, device_count, 0);
203
10.0k
    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); 
k++10.0k
)
204
10.0k
    {
205
10.0k
      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
206
10.0k
      const int d = k / batch_size;
207
10.0k
      const int b = k % batch_size;
208
10.0k
      float max = -FLT_MAX;
209
10.0k
      int t = -1;
210
10.0k
      int fi;
211
110k
      for (fi = 0; fi < 10; 
fi++100k
)
212
100k
        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
213
30.1k
          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
214
10.0k
      if (categorized->c == t)
215
4.55k
        ++correct;
216
10.0k
    }
217
10
  }
218
1
  ccv_cnnp_dataframe_iter_free(iter);
219
1
  ccv_cnnp_dataframe_free(batch_train_data);
220
1
  ccv_cnnp_dataframe_free(raw_train_data);
221
1
  ccv_cnnp_dataframe_iter_free(test_iter);
222
1
  ccv_cnnp_dataframe_free(batch_test_data);
223
1
  ccv_cnnp_dataframe_free(raw_test_data);
224
1
  ccv_cnnp_model_free(cifar_10);
225
1
  ccv_nnc_stream_context_free(stream_contexts[0]);
226
1
  ccv_nnc_stream_context_free(stream_contexts[1]);
227
5
  for (i = 0; i < device_count; 
i++4
)
228
4
    ccv_nnc_tensor_free(cpu_outputs[i]);
229
1
  return correct;
230
1
}
231
232
TEST_CASE("cifar-10 with dawnnet to > 90% under 3 minutes")
233
1
{
234
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
235
1
      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
236
1
  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
237
1
  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
238
1
  if (!train || !test)
239
0
  {
240
0
    if (train)
241
0
      fclose(train);
242
0
    if (test)
243
0
      fclose(test);
244
0
    GUARD_ELSE_RETURN(0);
245
0
  }
246
1
  int i, j, k;
247
1
  unsigned char bytes[32 * 32 + 1];
248
1
  double mean[3] = {};
249
1
  const int train_count = 50000;
250
1
  const int test_count = 10000;
251
1
  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
252
50.0k
  for (k = 0; k < train_count; 
k++50.0k
)
253
50.0k
  {
254
50.0k
    fread(bytes, 32 * 32 + 1, 1, train);
255
50.0k
    double per_mean[3] = {};
256
50.0k
    int c = bytes[0];
257
50.0k
    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
258
1.65M
    for (i = 0; i < 32; 
i++1.60M
)
259
52.8M
      
for (j = 0; 1.60M
j < 32;
j++51.2M
)
260
51.2M
        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
261
50.0k
    fread(bytes, 32 * 32, 1, train);
262
1.65M
    for (i = 0; i < 32; 
i++1.60M
)
263
52.8M
      
for (j = 0; 1.60M
j < 32;
j++51.2M
)
264
51.2M
        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
265
50.0k
    fread(bytes, 32 * 32, 1, train);
266
1.65M
    for (i = 0; i < 32; 
i++1.60M
)
267
52.8M
      
for (j = 0; 1.60M
j < 32;
j++51.2M
)
268
51.2M
        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
269
50.0k
    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
270
50.0k
    ccv_array_push(categorizeds, &categorized);
271
50.0k
    mean[0] += per_mean[0] / (32 * 32);
272
50.0k
    mean[1] += per_mean[1] / (32 * 32);
273
50.0k
    mean[2] += per_mean[2] / (32 * 32);
274
50.0k
  }
275
1
  float meanf[3];
276
1
  meanf[0] = mean[0] / train_count;
277
1
  meanf[1] = mean[1] / train_count;
278
1
  meanf[2] = mean[2] / train_count;
279
1
  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
280
10.0k
  for (k = 0; k < test_count; 
k++10.0k
)
281
10.0k
  {
282
10.0k
    fread(bytes, 32 * 32 + 1, 1, test);
283
10.0k
    int c = bytes[0];
284
10.0k
    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
285
330k
    for (i = 0; i < 32; 
i++320k
)
286
10.5M
      
for (j = 0; 320k
j < 32;
j++10.2M
)
287
10.2M
        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
288
10.0k
    fread(bytes, 32 * 32, 1, test);
289
330k
    for (i = 0; i < 32; 
i++320k
)
290
10.5M
      
for (j = 0; 320k
j < 32;
j++10.2M
)
291
10.2M
        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
292
10.0k
    fread(bytes, 32 * 32, 1, test);
293
330k
    for (i = 0; i < 32; 
i++320k
)
294
10.5M
      
for (j = 0; 320k
j < 32;
j++10.2M
)
295
10.2M
        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
296
10.0k
    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
297
10.0k
    ccv_array_push(tests, &categorized);
298
10.0k
  }
299
1
  fclose(train);
300
1
  fclose(test);
301
1
  if (!ccv_is_coverage())
302
0
  {
303
0
    int correct = train_cifar_10(35, categorizeds, 256, meanf, tests);
304
0
    REQUIRE(correct > 9000, "accuracy %.2f after 35 epoch should be higher than 90%%", (float)correct / 10000);
305
0
  } else
306
1
    train_cifar_10(1, categorizeds, 256, meanf, tests);
307
1
}
308
309
static int train_cifar_10_fp16(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
310
1
{
311
1
  ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1);
312
1
  const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
313
1
  if (device_count < 1)
314
0
    return -1;
315
1
  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32);
316
1
  float learn_rate = 0.001;
317
1
  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD());
318
1
  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
319
1
  int i, j, k;
320
1
  ccv_nnc_tensor_t* cpu_outputs[device_count];
321
1
  ccv_nnc_tensor_t* cpu_outputs_16f[device_count];
322
5
  for (i = 0; i < device_count; 
i++4
)
323
4
  {
324
4
    cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0);
325
4
    ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]);
326
4
    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
327
4
  }
328
1
  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
329
1
  const ccv_cnnp_random_jitter_t random_jitter = {
330
1
    .resize = {
331
1
      .min = 32,
332
1
      .max = 32,
333
1
    },
334
1
    .size = {
335
1
      .rows = 32,
336
1
      .cols = 32,
337
1
    },
338
1
    .symmetric = 1,
339
1
    .normalize = {
340
1
      .mean = {
341
1
        mean[0], mean[1], mean[2],
342
1
      },
343
1
    },
344
1
    .offset = {
345
1
      .x = 4,
346
1
      .y = 4,
347
1
    },
348
1
    .seed = 1,
349
1
  };
350
1
  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0);
351
1
  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0);
352
1
  ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3);
353
1
  const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0);
354
1
  const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
355
1
  const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0);
356
1
  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0);
357
1
  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
358
1
  ccv_cnnp_dataframe_shuffle(raw_train_data);
359
1
  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
360
1
  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0);
361
1
  const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0);
362
1
  const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
363
1
  const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0);
364
1
  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
365
1
  int train_device_columns[device_count * 2];
366
1
  int test_device_columns[device_count * 2];
367
5
  for (i = 0; i < device_count; 
i++4
)
368
4
  {
369
4
    int stream_type = CCV_STREAM_CONTEXT_GPU;
370
4
    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
371
4
    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0);
372
4
    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10);
373
4
    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
374
4
    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0);
375
4
    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0);
376
4
    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0);
377
4
  }
378
1
  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
379
1
  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
380
1
  ccv_nnc_stream_context_t* stream_contexts[2];
381
1
  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
382
1
  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
383
1
  int p = 0, q = 1;
384
1
  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
385
1
  int correct = 0;
386
1
  int epoch = 0;
387
1
  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
388
1
  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
389
1
  ccv_nnc_tensor_t** input_fits[device_count * 2];
390
1
  ccv_nnc_tensor_t* input_fit_inputs[device_count];
391
1
  ccv_nnc_tensor_t* input_fit_fits[device_count];
392
1
  ccv_nnc_tensor_t* outputs[device_count];
393
50
  for (i = 0; epoch < epoch_limit; 
i++49
)
394
49
  {
395
49
    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
396
49
    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 
0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end)0
;
397
49
    learn_rate = ccv_max(learn_rate, 0.000001);
398
49
    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
399
49
    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0);
400
49
    sgd.info.sgd.decay = 0;
401
49
    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0
402
49
    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
403
49
    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
404
245
    for (j = 0; j < device_count; 
j++196
)
405
196
    {
406
196
      input_fit_inputs[j] = input_fits[j][0];
407
196
      input_fit_fits[j] = input_fits[j][1];
408
196
      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
409
196
    }
410
49
    ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]);
411
49
    // Prefetch the next round.
412
49
    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
413
49
    if ((i + 1) % epoch_end == 0)
414
1
    {
415
1
      ++epoch;
416
1
      // Reshuffle and reset cursor.
417
1
      ccv_cnnp_dataframe_shuffle(raw_train_data);
418
1
      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
419
1
    }
420
49
    int t;
421
49
    CCV_SWAP(p, q, t);
422
49
  }
423
1
  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
424
1
  ccv_nnc_stream_context_wait(stream_contexts[p]);
425
1
  ccv_nnc_stream_context_wait(stream_contexts[q]);
426
1
  correct = 0;
427
1
  p = 0, q = 1;
428
11
  for (j = 0; j < test_set->rnum; 
j += batch_size * device_count10
)
429
10
  {
430
10
    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
431
50
    for (k = 0; k < device_count; 
k++40
)
432
40
    {
433
40
      input_fit_inputs[k] = input_fits[k][0];
434
40
      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
435
40
    }
436
10
    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
437
10
      .is_test = 1
438
10
    }, input_fit_inputs, device_count, outputs, device_count, 0, 0);
439
10
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0);
440
10
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0);
441
10.0k
    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); 
k++10.0k
)
442
10.0k
    {
443
10.0k
      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
444
10.0k
      const int d = k / batch_size;
445
10.0k
      const int b = k % batch_size;
446
10.0k
      float max = -FLT_MAX;
447
10.0k
      int t = -1;
448
10.0k
      int fi;
449
110k
      for (fi = 0; fi < 10; 
fi++100k
)
450
100k
        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
451
31.4k
          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
452
10.0k
      if (categorized->c == t)
453
4.55k
        ++correct;
454
10.0k
    }
455
10
  }
456
1
  ccv_cnnp_dataframe_iter_free(iter);
457
1
  ccv_cnnp_dataframe_free(batch_train_data);
458
1
  ccv_cnnp_dataframe_free(raw_train_data);
459
1
  ccv_cnnp_dataframe_iter_free(test_iter);
460
1
  ccv_cnnp_dataframe_free(batch_test_data);
461
1
  ccv_cnnp_dataframe_free(raw_test_data);
462
1
  ccv_cnnp_model_free(cifar_10);
463
1
  ccv_nnc_stream_context_free(stream_contexts[0]);
464
1
  ccv_nnc_stream_context_free(stream_contexts[1]);
465
5
  for (i = 0; i < device_count; 
i++4
)
466
4
  {
467
4
    ccv_nnc_tensor_free(cpu_outputs[i]);
468
4
    ccv_nnc_tensor_free(cpu_outputs_16f[i]);
469
4
  }
470
1
  return correct;
471
1
}
472
473
TEST_CASE("cifar-10 with dawnnet to > 90% under 1 minutes (fp16)")
474
1
{
475
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
476
1
      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
477
1
  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
478
1
  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
479
1
  if (!train || !test)
480
0
  {
481
0
    if (train)
482
0
      fclose(train);
483
0
    if (test)
484
0
      fclose(test);
485
0
    GUARD_ELSE_RETURN(0);
486
0
  }
487
1
  int i, j, k;
488
1
  unsigned char bytes[32 * 32 + 1];
489
1
  double mean[3] = {};
490
1
  const int train_count = 50000;
491
1
  const int test_count = 10000;
492
1
  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
493
50.0k
  for (k = 0; k < train_count; 
k++50.0k
)
494
50.0k
  {
495
50.0k
    fread(bytes, 32 * 32 + 1, 1, train);
496
50.0k
    double per_mean[3] = {};
497
50.0k
    int c = bytes[0];
498
50.0k
    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
499
1.65M
    for (i = 0; i < 32; 
i++1.60M
)
500
52.8M
      
for (j = 0; 1.60M
j < 32;
j++51.2M
)
501
51.2M
        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
502
50.0k
    fread(bytes, 32 * 32, 1, train);
503
1.65M
    for (i = 0; i < 32; 
i++1.60M
)
504
52.8M
      
for (j = 0; 1.60M
j < 32;
j++51.2M
)
505
51.2M
        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
506
50.0k
    fread(bytes, 32 * 32, 1, train);
507
1.65M
    for (i = 0; i < 32; 
i++1.60M
)
508
52.8M
      
for (j = 0; 1.60M
j < 32;
j++51.2M
)
509
51.2M
        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
510
50.0k
    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
511
50.0k
    ccv_array_push(categorizeds, &categorized);
512
50.0k
    mean[0] += per_mean[0] / (32 * 32);
513
50.0k
    mean[1] += per_mean[1] / (32 * 32);
514
50.0k
    mean[2] += per_mean[2] / (32 * 32);
515
50.0k
  }
516
1
  float meanf[3];
517
1
  meanf[0] = mean[0] / train_count;
518
1
  meanf[1] = mean[1] / train_count;
519
1
  meanf[2] = mean[2] / train_count;
520
1
  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
521
10.0k
  for (k = 0; k < test_count; 
k++10.0k
)
522
10.0k
  {
523
10.0k
    fread(bytes, 32 * 32 + 1, 1, test);
524
10.0k
    int c = bytes[0];
525
10.0k
    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
526
330k
    for (i = 0; i < 32; 
i++320k
)
527
10.5M
      
for (j = 0; 320k
j < 32;
j++10.2M
)
528
10.2M
        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
529
10.0k
    fread(bytes, 32 * 32, 1, test);
530
330k
    for (i = 0; i < 32; 
i++320k
)
531
10.5M
      
for (j = 0; 320k
j < 32;
j++10.2M
)
532
10.2M
        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
533
10.0k
    fread(bytes, 32 * 32, 1, test);
534
330k
    for (i = 0; i < 32; 
i++320k
)
535
10.5M
      
for (j = 0; 320k
j < 32;
j++10.2M
)
536
10.2M
        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
537
10.0k
    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
538
10.0k
    ccv_array_push(tests, &categorized);
539
10.0k
  }
540
1
  fclose(train);
541
1
  fclose(test);
542
1
  if (!ccv_is_coverage())
543
0
  {
544
0
    int correct = train_cifar_10_fp16(35, categorizeds, 256, meanf, tests);
545
0
    REQUIRE(correct > 9000, "accuracy %.2f after 35 epoch should be higher than 90%%", (float)correct / 10000);
546
0
  } else
547
1
    train_cifar_10_fp16(1, categorizeds, 256, meanf, tests);
548
1
}
549
550
/**
 * Train the CIFAR-10 model built by _cifar_10_dawn() in half precision, running
 * each training step through a dynamic graph, then score it on the test set.
 *
 * Training images go through random jitter (which is given `mean` for
 * normalization) and are converted to 16F. Test images are only converted to
 * 16F here, so any normalization of the test set has to be done by the caller.
 *
 * @param epoch_limit Number of passes over training_set to run.
 * @param training_set Array of ccv_categorized_t used for training.
 * @param batch_size Number of samples per batch.
 * @param mean Per-channel mean handed to the random jitter normalization.
 * @param test_set Array of ccv_categorized_t used for evaluation.
 * @return Number of test samples whose highest-scoring class equals the label.
 */
static int train_cifar_10_fp16_dy(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
{
  // Build the model once, train on a copy, and release the original right away.
  ccv_cnnp_model_t* const cifar_10_0 = _cifar_10_dawn(0);
  ccv_cnnp_model_t* const cifar_10 = ccv_cnnp_model_copy(cifar_10_0);
  ccv_cnnp_model_free(cifar_10_0);
  const int device_count = 1;
  const ccv_nnc_tensor_param_t input_params = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32);
  float learn_rate = 0.001;
  ccv_cnnp_model_compile(cifar_10, &input_params, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / batch_size, 0.01, 0.9, 0.9), CMD_NOOP());
  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
  int i, j, k;
  // Training pipeline: matrix -> random jitter -> 16F conversion, plus a 16F one-hot label column.
  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
  const ccv_cnnp_random_jitter_t random_jitter = {
    .resize = {
      .min = 32,
      .max = 32,
    },
    .size = {
      .rows = 32,
      .cols = 32,
    },
    .symmetric = 1,
    .normalize = {
      .mean = {
        mean[0], mean[1], mean[2],
      },
    },
    .offset = {
      .x = 4,
      .y = 4,
    },
    .seed = 1,
  };
  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0);
  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0);
  ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3);
  const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0);
  const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
  const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0);
  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0);
  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
  ccv_cnnp_dataframe_shuffle(raw_train_data);
  // Test pipeline: matrix -> 16F conversion only (no jitter, no labels).
  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0);
  const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0);
  const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
  const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0);
  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
  // Column layout for both iterators: [0, device_count) are the GPU copies of the batch,
  // [device_count, 2 * device_count) are auxiliary batch_size x 10 tensors used as model outputs.
  int train_device_columns[device_count * 2];
  int test_device_columns[device_count * 2];
  for (i = 0; i < device_count; i++)
  {
    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0);
    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10);
    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0);
    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0);
    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0);
  }
  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
  // Two streams are alternated: p is used to fetch the current batch, q to run the step
  // and to prefetch the next batch; they swap roles at the end of every iteration.
  ccv_nnc_stream_context_t* stream_contexts[2];
  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
  int p = 0, q = 1;
  // Number of iterations per epoch (rounded up).
  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
  // The learning rate schedule is fixed (10 warm-up epochs, reaching 0 at epoch 35) regardless of epoch_limit.
  const int warmup_epochs = 10;
  const int schedule_epochs = 35;
  int epoch = 0;
  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
  ccv_nnc_tensor_t** input_fits[device_count * 2];
  ccv_nnc_tensor_t* input_fit_inputs[device_count];
  ccv_nnc_tensor_t* input_fit_fits[device_count];
  ccv_nnc_tensor_t* outputs[device_count];
  ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new();
  for (i = 0; epoch < epoch_limit; i++)
  {
    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
    learn_rate = (i + 1) < warmup_epochs * epoch_end ? 0.4 * (i + 1) / (warmup_epochs * epoch_end) : 0.4 * (schedule_epochs * epoch_end - (i + 1)) / ((schedule_epochs - warmup_epochs) * epoch_end);
    learn_rate = ccv_max(learn_rate, 0.000001);
    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0);
    sgd.info.sgd.decay = 0;
    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0
    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
    for (j = 0; j < device_count; j++)
    {
      input_fit_inputs[j] = input_fits[j][0];
      input_fit_fits[j] = input_fits[j][1];
      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
    }
    // Wait for the fetch issued on stream_contexts[p] above before its tensors are used on the other stream.
    ccv_nnc_stream_context_wait(stream_contexts[p]);
    // One training step on the dynamic graph: forward through the model, softmax
    // cross-entropy against the one-hot labels, backward, then apply the SGD update.
    ccv_nnc_tensor_variable_t const input = ccv_nnc_tensor_variable_new(graph);
    ccv_nnc_tensor_variable_set(graph, input, input_fit_inputs[0]);
    ccv_nnc_tensor_variable_t const output = ccv_nnc_tensor_variable_new(graph);
    ccv_nnc_tensor_variable_set(graph, output, outputs[0]);
    ccv_nnc_dynamic_graph_evaluate(graph, cifar_10, 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(output), 0, stream_contexts[q]);
    ccv_nnc_tensor_variable_t const fit = ccv_nnc_tensor_variable_new(graph);
    ccv_nnc_tensor_variable_set(graph, fit, input_fit_fits[0]);
    ccv_nnc_tensor_variable_t const softmax = ccv_nnc_tensor_variable_new(graph);
    ccv_nnc_dynamic_graph_exec(graph, CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_VARIABLE_LIST(output, fit), TENSOR_VARIABLE_LIST(0, softmax), 0, stream_contexts[q]);
    ccv_nnc_dynamic_graph_backward(graph, TENSOR_VARIABLE_LIST(softmax), 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(0), stream_contexts[q]);
    ccv_nnc_dynamic_graph_apply_gradients(graph, sgd, TENSOR_VARIABLE_LIST(), TENSOR_VARIABLE_LIST(), 0, 0, stream_contexts[q]);
    ccv_nnc_tensor_variable_free(graph, input);
    ccv_nnc_tensor_variable_free(graph, output);
    ccv_nnc_tensor_variable_free(graph, fit);
    ccv_nnc_tensor_variable_free(graph, softmax);
    // Prefetch the next round.
    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
    if ((i + 1) % epoch_end == 0)
    {
      ++epoch;
      // Reshuffle and reset cursor.
      ccv_cnnp_dataframe_shuffle(raw_train_data);
      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
    }
    int t;
    CCV_SWAP(p, q, t);
  }
  // Evaluation: drain both streams, then run the test set batch by batch.
  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
  ccv_nnc_stream_context_wait(stream_contexts[p]);
  ccv_nnc_stream_context_wait(stream_contexts[q]);
  int correct = 0;
  ccv_nnc_tensor_t* cpu_outputs[device_count];
  ccv_nnc_tensor_t* cpu_outputs_16f[device_count];
  for (i = 0; i < device_count; i++)
  {
    cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0);
    ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]);
    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
  }
  for (j = 0; j < test_set->rnum; j += batch_size * device_count)
  {
    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
    for (k = 0; k < device_count; k++)
    {
      input_fit_inputs[k] = input_fits[k][0];
      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
    }
    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
      .is_test = 1
    }, input_fit_inputs, device_count, outputs, device_count, 0, 0);
    // Copy the 16F results to the CPU-side 16F tensors, then widen to 32F for the arg-max below.
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0);
    // The last batch may be partial, hence the ccv_min against the remaining sample count.
    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
    {
      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
      const int d = k / batch_size; // which device's output tensor
      const int b = k % batch_size; // row within that tensor
      float max = -FLT_MAX;
      int t = -1;
      int fi;
      for (fi = 0; fi < 10; fi++)
        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
      if (categorized->c == t)
        ++correct;
    }
  }
  ccv_cnnp_dataframe_iter_free(iter);
  ccv_cnnp_dataframe_free(batch_train_data);
  ccv_cnnp_dataframe_free(raw_train_data);
  ccv_cnnp_dataframe_iter_free(test_iter);
  ccv_cnnp_dataframe_free(batch_test_data);
  ccv_cnnp_dataframe_free(raw_test_data);
  ccv_cnnp_model_free(cifar_10);
  ccv_nnc_dynamic_graph_free(graph);
  ccv_nnc_stream_context_free(stream_contexts[0]);
  ccv_nnc_stream_context_free(stream_contexts[1]);
  for (i = 0; i < device_count; i++)
  {
    ccv_nnc_tensor_free(cpu_outputs[i]);
    ccv_nnc_tensor_free(cpu_outputs_16f[i]);
  }
  return correct;
}
729
730
// Loads the CIFAR-10 binary batches from /fast/Data, trains with train_cifar_10_fp16_dy and,
// outside of coverage runs, requires more than 65% test accuracy after 10 epochs.
// The test is skipped when the cuDNN convolution backend or the data files are unavailable.
TEST_CASE("cifar-10 with dawnnet to > 65% after 10 epoch (fp16) use dynamic graph")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
  if (!train || !test)
  {
    if (train)
      fclose(train);
    if (test)
      fclose(test);
    GUARD_ELSE_RETURN(0);
  }
  int i, j, k, ch;
  // Each record is read as 1 label byte followed by three 32x32 channel planes.
  unsigned char bytes[32 * 32 + 1];
  double mean[3] = {};
  const int train_count = 50000;
  const int test_count = 10000;
  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
  for (k = 0; k < train_count; k++)
  {
    double per_mean[3] = {};
    int c = 0;
    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
    for (ch = 0; ch < 3; ch++)
    {
      // The first read of a record also carries the label byte in front of the plane.
      const int lead = (ch == 0) ? 1 : 0;
      fread(bytes, 32 * 32 + lead, 1, train);
      if (ch == 0)
        c = bytes[0];
      // Scale pixels to [0, 2], interleave the planes, and accumulate the per-image channel sum.
      for (i = 0; i < 32; i++)
        for (j = 0; j < 32; j++)
          per_mean[ch] += (a->data.f32[(j + i * 32) * 3 + ch] = bytes[j + i * 32 + lead] * 2. / 255.);
    }
    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
    ccv_array_push(categorizeds, &categorized);
    mean[0] += per_mean[0] / (32 * 32);
    mean[1] += per_mean[1] / (32 * 32);
    mean[2] += per_mean[2] / (32 * 32);
  }
  float meanf[3];
  meanf[0] = mean[0] / train_count;
  meanf[1] = mean[1] / train_count;
  meanf[2] = mean[2] / train_count;
  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
  for (k = 0; k < test_count; k++)
  {
    int c = 0;
    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
    for (ch = 0; ch < 3; ch++)
    {
      const int lead = (ch == 0) ? 1 : 0;
      fread(bytes, 32 * 32 + lead, 1, test);
      if (ch == 0)
        c = bytes[0];
      // Test images get the training-set mean subtracted up front.
      for (i = 0; i < 32; i++)
        for (j = 0; j < 32; j++)
          a->data.f32[(j + i * 32) * 3 + ch] = bytes[j + i * 32 + lead] * 2. / 255. - meanf[ch];
    }
    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
    ccv_array_push(tests, &categorized);
  }
  fclose(train);
  fclose(test);
  // Coverage runs only exercise a single epoch and skip the accuracy requirement.
  const int check_accuracy = !ccv_is_coverage();
  const int correct = train_cifar_10_fp16_dy(check_accuracy ? 10 : 1, categorizeds, 256, meanf, tests);
  // Release the datasets before REQUIRE, in case a failing REQUIRE leaves the test body early.
  for (k = 0; k < categorizeds->rnum; k++)
    ccv_matrix_free(((ccv_categorized_t*)ccv_array_get(categorizeds, k))->matrix);
  ccv_array_free(categorizeds);
  for (k = 0; k < tests->rnum; k++)
    ccv_matrix_free(((ccv_categorized_t*)ccv_array_get(tests, k))->matrix);
  ccv_array_free(tests);
  if (check_accuracy)
  {
    REQUIRE(correct > 6500, "accuracy %.2f after 10 epoch should be higher than 65%%", (float)correct / 10000);
  }
}
806
807
#include "case_main.h"