Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/cifar.tests.c
  Count|Source
       |#include "case.h"
       |#include "ccv_case.h"
       |#include "ccv_nnc_case.h"
       |#include <ccv.h>
       |#include <ccv_internal.h>
       |#include <nnc/ccv_nnc.h>
       |#include <nnc/ccv_nnc_easy.h>
       |#include <3rdparty/dsfmt/dSFMT.h>
       |
       |TEST_SETUP()
       |{
       |  ccv_nnc_init();
       |}
       |
       |static ccv_cnnp_model_t* _dawn_layer_new(const int filters, const int strides, const int residual)
      9|{
      9|  ccv_cnnp_model_io_t input = ccv_cnnp_input();
      9|  ccv_cnnp_model_t* conv = ccv_cnnp_sequential_new(MODEL_LIST(
      9|    ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0),
      9|    ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0),
      9|    ccv_cnnp_relu(0)
      9|  ), 1, 0);
      9|  ccv_cnnp_model_io_t output = ccv_cnnp_model_apply(conv, MODEL_IO_LIST(input));
      9|  ccv_cnnp_model_t* pool = ccv_cnnp_max_pool(DIM_ALLOC(strides, strides), HINT((strides, strides), (0, 0)), 0);
      9|  output = ccv_cnnp_model_apply(pool, MODEL_IO_LIST(output));
      9|  if (residual)
      6|  {
      6|    ccv_cnnp_model_io_t shortcut = output;
      6|    ccv_cnnp_model_t* res1 = ccv_cnnp_sequential_new(MODEL_LIST(
      6|      ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0),
      6|      ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0),
      6|      ccv_cnnp_relu(0)
      6|    ), 1, 0);
      6|    output = ccv_cnnp_model_apply(res1, MODEL_IO_LIST(output));
      6|    ccv_cnnp_model_t* res2 = ccv_cnnp_sequential_new(MODEL_LIST(
      6|      ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0),
      6|      ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0),
      6|      ccv_cnnp_relu(0)
      6|    ), 1, 0);
      6|    output = ccv_cnnp_model_apply(res2, MODEL_IO_LIST(output));
      6|    ccv_cnnp_model_t* const add = ccv_cnnp_sum(0);
      6|    output = ccv_cnnp_model_apply(add, MODEL_IO_LIST(output, shortcut));
      6|  }
      9|  return ccv_cnnp_model_new(MODEL_IO_LIST(input), MODEL_IO_LIST(output), 1, 0);
      9|}
       |
       |static ccv_cnnp_model_t* _cifar_10_dawn(const int softmax)
      3|{
      3|  ccv_cnnp_model_t* prep = ccv_cnnp_sequential_new(MODEL_LIST(
      3|    ccv_cnnp_convolution(1, 64, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0),
      3|    ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0),
      3|    ccv_cnnp_relu(0)
      3|  ), 1, 0);
      3|  ccv_cnnp_model_t* layer1 = _dawn_layer_new(128, 2, 1);
      3|  ccv_cnnp_model_t* layer2 = _dawn_layer_new(256, 2, 0);
      3|  ccv_cnnp_model_t* layer3 = _dawn_layer_new(512, 2, 1);
      3|  if (softmax)
      2|  {
      2|    return ccv_cnnp_sequential_new(MODEL_LIST(
      2|      prep,
      2|      layer1,
      2|      layer2,
      2|      layer3,
      2|      ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0),
      2|      ccv_cnnp_flatten(0),
      2|      ccv_cnnp_dense(10, 0, 0, 1, 0),
      2|      ccv_cnnp_softmax(0)
      2|    ), 1, 0);
      2|  } else {
      1|    return ccv_cnnp_sequential_new(MODEL_LIST(
      1|      prep,
      1|      layer1,
      1|      layer2,
      1|      layer3,
      1|      ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0),
      1|      ccv_cnnp_flatten(0),
      1|      ccv_cnnp_dense(10, 0, 0, 1, 0)
      1|    ), 1, 0);
      1|  }
      3|}
       |
       |static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
      1|{
      1|  ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1);
      1|  int device_map[4] = {3, 2, 1, 0};
      1|  ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 4);
      1|  const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
      1|  if (device_count < 1)
      0|    return -1;
      1|  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 32F, batch_size, 3, 32, 32);
      1|  float learn_rate = 0.001;
      1|  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD());
      1|  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
      1|  int i, j, k;
      1|  ccv_nnc_tensor_t* cpu_outputs[device_count];
      5|  for (i = 0; i < device_count; i++)
      4|  {
      4|    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
      4|    ccv_nnc_tensor_pin_memory(cpu_outputs[i]);
      4|  }
      1|  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
      1|  const ccv_cnnp_random_jitter_t random_jitter = {
      1|    .resize = {
      1|      .min = 32,
      1|      .max = 32,
      1|    },
      1|    .size = {
      1|      .rows = 32,
      1|      .cols = 32,
      1|    },
      1|    .symmetric = 1,
      1|    .normalize = {
      1|      .mean = {
      1|        mean[0], mean[1], mean[2],
      1|      },
      1|    },
      1|    .offset = {
      1|      .x = 4,
      1|      .y = 4,
      1|    },
      1|    .seed = 1,
      1|  };
      1|  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0);
      1|  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0);
      1|  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_32F, CCV_TENSOR_FORMAT_NCHW, 0);
      1|  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
      1|  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
      1|  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0);
      1|  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
      1|  int train_device_columns[device_count * 2];
      1|  int test_device_columns[device_count * 2];
      5|  for (i = 0; i < device_count; i++)
      4|  {
      4|    int stream_type = CCV_STREAM_CONTEXT_GPU;
      4|    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
      4|    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0);
      4|    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 32F, batch_size, 10);
      4|    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
      4|    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0);
      4|    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0);
      4|    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0);
      4|  }
      1|  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
      1|  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
      1|  ccv_nnc_stream_context_t* stream_contexts[2];
      1|  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      1|  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      1|  int p = 0, q = 1;
      1|  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
      1|  int correct = 0;
      1|  int epoch = 0;
      1|  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
      1|  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
      1|  ccv_nnc_tensor_t** input_fits[device_count * 2];
      1|  ccv_nnc_tensor_t* input_fit_inputs[device_count];
      1|  ccv_nnc_tensor_t* input_fit_fits[device_count];
      1|  ccv_nnc_tensor_t* outputs[device_count];
     50|  for (i = 0; epoch < epoch_limit; i++)
     49|  {
       |    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
     49|    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
     49|    learn_rate = ccv_max(learn_rate, 0.000001);
     49|    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
     49|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0);
     49|    sgd.info.sgd.decay = 0;
     49|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0
     49|    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
     49|    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
    245|    for (j = 0; j < device_count; j++)
    196|    {
    196|      input_fit_inputs[j] = input_fits[j][0];
    196|      input_fit_fits[j] = input_fits[j][1];
    196|      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
    196|    }
     49|    ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]);
       |    // Prefetch the next round.
     49|    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
     49|    if ((i + 1) % epoch_end == 0)
      1|    {
      1|      ++epoch;
       |      // Reshuffle and reset cursor.
      1|      ccv_cnnp_dataframe_shuffle(raw_train_data);
      1|      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
      1|    }
     49|    int t;
     49|    CCV_SWAP(p, q, t);
     49|  }
      1|  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
      1|  ccv_nnc_stream_context_wait(stream_contexts[p]);
      1|  ccv_nnc_stream_context_wait(stream_contexts[q]);
      1|  correct = 0;
      1|  p = 0, q = 1;
     11|  for (j = 0; j < test_set->rnum; j += batch_size * device_count)
     10|  {
     10|    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
     50|    for (k = 0; k < device_count; k++)
     40|    {
     40|      input_fit_inputs[k] = input_fits[k][0];
     40|      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
     40|    }
     10|    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
     10|      .is_test = 1
     10|    }, input_fit_inputs, device_count, outputs, device_count, 0, 0);
     10|    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs, device_count, 0);
  10.0k|    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
  10.0k|    {
  10.0k|      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
  10.0k|      const int d = k / batch_size;
  10.0k|      const int b = k % batch_size;
  10.0k|      float max = -FLT_MAX;
  10.0k|      int t = -1;
  10.0k|      int fi;
   110k|      for (fi = 0; fi < 10; fi++)
   100k|        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
  29.4k|          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
  10.0k|      if (categorized->c == t)
  4.55k|        ++correct;
  10.0k|    }
     10|  }
      1|  ccv_cnnp_dataframe_iter_free(iter);
      1|  ccv_cnnp_dataframe_free(batch_train_data);
      1|  ccv_cnnp_dataframe_free(raw_train_data);
      1|  ccv_cnnp_dataframe_iter_free(test_iter);
      1|  ccv_cnnp_dataframe_free(batch_test_data);
      1|  ccv_cnnp_dataframe_free(raw_test_data);
      1|  ccv_cnnp_model_free(cifar_10);
      1|  ccv_nnc_stream_context_free(stream_contexts[0]);
      1|  ccv_nnc_stream_context_free(stream_contexts[1]);
      5|  for (i = 0; i < device_count; i++)
      4|    ccv_nnc_tensor_free(cpu_outputs[i]);
      1|  ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0);
      1|  return correct;
      1|}
       |
       |TEST_CASE("cifar-10 with dawnnet to > 90% under 3 minutes")
      1|{
      1|  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
      1|      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
      1|  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
      1|  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
      1|  if (!train || !test)
      0|  {
      0|    if (train)
      0|      fclose(train);
      0|    if (test)
      0|      fclose(test);
      0|    GUARD_ELSE_RETURN(0);
      0|  }
      1|  int i, j, k;
      1|  unsigned char bytes[32 * 32 + 1];
      1|  double mean[3] = {};
      1|  const int train_count = 50000;
      1|  const int test_count = 10000;
      1|  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
  50.0k|  for (k = 0; k < train_count; k++)
  50.0k|  {
  50.0k|    fread(bytes, 32 * 32 + 1, 1, train);
  50.0k|    double per_mean[3] = {};
  50.0k|    int c = bytes[0];
  50.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
  50.0k|    fread(bytes, 32 * 32, 1, train);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
  50.0k|    fread(bytes, 32 * 32, 1, train);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
  50.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
  50.0k|    ccv_array_push(categorizeds, &categorized);
  50.0k|    mean[0] += per_mean[0] / (32 * 32);
  50.0k|    mean[1] += per_mean[1] / (32 * 32);
  50.0k|    mean[2] += per_mean[2] / (32 * 32);
  50.0k|  }
      1|  float meanf[3];
      1|  meanf[0] = mean[0] / train_count;
      1|  meanf[1] = mean[1] / train_count;
      1|  meanf[2] = mean[2] / train_count;
      1|  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
  10.0k|  for (k = 0; k < test_count; k++)
  10.0k|  {
  10.0k|    fread(bytes, 32 * 32 + 1, 1, test);
  10.0k|    int c = bytes[0];
  10.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
  10.0k|    fread(bytes, 32 * 32, 1, test);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
  10.0k|    fread(bytes, 32 * 32, 1, test);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
  10.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
  10.0k|    ccv_array_push(tests, &categorized);
  10.0k|  }
      1|  fclose(train);
      1|  fclose(test);
      1|  if (!ccv_is_coverage())
      0|  {
      0|    int correct = train_cifar_10(35, categorizeds, 256, meanf, tests);
      0|    REQUIRE(correct > 9000, "accuracy %.2f after 35 epoch should be higher than 90%%", (float)correct / 10000);
      0|  } else
      1|    train_cifar_10(1, categorizeds, 256, meanf, tests);
      1|}
       |
       |static int train_cifar_10_fp16(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
      1|{
      1|  ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1);
      1|  const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
      1|  if (device_count < 1)
      0|    return -1;
      1|  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32);
      1|  float learn_rate = 0.001;
      1|  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD());
      1|  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
      1|  int i, j, k;
      1|  ccv_nnc_tensor_t* cpu_outputs[device_count];
      1|  ccv_nnc_tensor_t* cpu_outputs_16f[device_count];
      5|  for (i = 0; i < device_count; i++)
      4|  {
      4|    cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0);
      4|    ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]);
      4|    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
      4|  }
      1|  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
      1|  const ccv_cnnp_random_jitter_t random_jitter = {
      1|    .resize = {
      1|      .min = 32,
      1|      .max = 32,
      1|    },
      1|    .size = {
      1|      .rows = 32,
      1|      .cols = 32,
      1|    },
      1|    .symmetric = 1,
      1|    .normalize = {
      1|      .mean = {
      1|        mean[0], mean[1], mean[2],
      1|      },
      1|    },
      1|    .offset = {
      1|      .x = 4,
      1|      .y = 4,
      1|    },
      1|    .seed = 1,
      1|  };
      1|  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0);
      1|  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0);
      1|  ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3);
      1|  const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0);
      1|  const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
      1|  const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0);
      1|  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0);
      1|  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
      1|  ccv_cnnp_dataframe_shuffle(raw_train_data);
      1|  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
      1|  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0);
      1|  const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0);
      1|  const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
      1|  const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0);
      1|  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
      1|  int train_device_columns[device_count * 2];
      1|  int test_device_columns[device_count * 2];
      5|  for (i = 0; i < device_count; i++)
      4|  {
      4|    int stream_type = CCV_STREAM_CONTEXT_GPU;
      4|    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
      4|    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0);
      4|    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10);
      4|    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
      4|    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0);
      4|    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0);
      4|    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0);
      4|  }
      1|  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
      1|  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
      1|  ccv_nnc_stream_context_t* stream_contexts[2];
      1|  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      1|  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      1|  int p = 0, q = 1;
      1|  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
      1|  int correct = 0;
      1|  int epoch = 0;
      1|  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
      1|  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
      1|  ccv_nnc_tensor_t** input_fits[device_count * 2];
      1|  ccv_nnc_tensor_t* input_fit_inputs[device_count];
      1|  ccv_nnc_tensor_t* input_fit_fits[device_count];
      1|  ccv_nnc_tensor_t* outputs[device_count];
     50|  for (i = 0; epoch < epoch_limit; i++)
     49|  {
       |    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
     49|    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
     49|    learn_rate = ccv_max(learn_rate, 0.000001);
     49|    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
     49|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0);
     49|    sgd.info.sgd.decay = 0;
     49|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0
     49|    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
     49|    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
    245|    for (j = 0; j < device_count; j++)
    196|    {
    196|      input_fit_inputs[j] = input_fits[j][0];
    196|      input_fit_fits[j] = input_fits[j][1];
    196|      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
    196|    }
     49|    ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]);
       |    // Prefetch the next round.
     49|    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
     49|    if ((i + 1) % epoch_end == 0)
      1|    {
      1|      ++epoch;
       |      // Reshuffle and reset cursor.
      1|      ccv_cnnp_dataframe_shuffle(raw_train_data);
      1|      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
      1|    }
     49|    int t;
     49|    CCV_SWAP(p, q, t);
     49|  }
      1|  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
      1|  ccv_nnc_stream_context_wait(stream_contexts[p]);
      1|  ccv_nnc_stream_context_wait(stream_contexts[q]);
      1|  correct = 0;
      1|  p = 0, q = 1;
     11|  for (j = 0; j < test_set->rnum; j += batch_size * device_count)
     10|  {
     10|    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
     50|    for (k = 0; k < device_count; k++)
     40|    {
     40|      input_fit_inputs[k] = input_fits[k][0];
     40|      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
     40|    }
     10|    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
     10|      .is_test = 1
     10|    }, input_fit_inputs, device_count, outputs, device_count, 0, 0);
     10|    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0);
     10|    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0);
  10.0k|    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
  10.0k|    {
  10.0k|      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
  10.0k|      const int d = k / batch_size;
  10.0k|      const int b = k % batch_size;
  10.0k|      float max = -FLT_MAX;
  10.0k|      int t = -1;
  10.0k|      int fi;
   110k|      for (fi = 0; fi < 10; fi++)
   100k|        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
  30.8k|          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
  10.0k|      if (categorized->c == t)
  4.55k|        ++correct;
  10.0k|    }
     10|  }
      1|  ccv_cnnp_dataframe_iter_free(iter);
      1|  ccv_cnnp_dataframe_free(batch_train_data);
      1|  ccv_cnnp_dataframe_free(raw_train_data);
      1|  ccv_cnnp_dataframe_iter_free(test_iter);
      1|  ccv_cnnp_dataframe_free(batch_test_data);
      1|  ccv_cnnp_dataframe_free(raw_test_data);
      1|  ccv_cnnp_model_free(cifar_10);
      1|  ccv_nnc_stream_context_free(stream_contexts[0]);
      1|  ccv_nnc_stream_context_free(stream_contexts[1]);
      5|  for (i = 0; i < device_count; i++)
      4|  {
      4|    ccv_nnc_tensor_free(cpu_outputs[i]);
      4|    ccv_nnc_tensor_free(cpu_outputs_16f[i]);
      4|  }
      1|  return correct;
      1|}
       |
       |TEST_CASE("cifar-10 with dawnnet to > 90% under 1 minutes (fp16)")
      1|{
      1|  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
      1|      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
      1|  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
      1|  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
      1|  if (!train || !test)
      0|  {
      0|    if (train)
      0|      fclose(train);
      0|    if (test)
      0|      fclose(test);
      0|    GUARD_ELSE_RETURN(0);
      0|  }
      1|  int i, j, k;
      1|  unsigned char bytes[32 * 32 + 1];
      1|  double mean[3] = {};
      1|  const int train_count = 50000;
      1|  const int test_count = 10000;
      1|  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
  50.0k|  for (k = 0; k < train_count; k++)
  50.0k|  {
  50.0k|    fread(bytes, 32 * 32 + 1, 1, train);
  50.0k|    double per_mean[3] = {};
  50.0k|    int c = bytes[0];
  50.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
  50.0k|    fread(bytes, 32 * 32, 1, train);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
  50.0k|    fread(bytes, 32 * 32, 1, train);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
  50.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
  50.0k|    ccv_array_push(categorizeds, &categorized);
  50.0k|    mean[0] += per_mean[0] / (32 * 32);
  50.0k|    mean[1] += per_mean[1] / (32 * 32);
  50.0k|    mean[2] += per_mean[2] / (32 * 32);
  50.0k|  }
      1|  float meanf[3];
      1|  meanf[0] = mean[0] / train_count;
      1|  meanf[1] = mean[1] / train_count;
      1|  meanf[2] = mean[2] / train_count;
      1|  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
  10.0k|  for (k = 0; k < test_count; k++)
  10.0k|  {
  10.0k|    fread(bytes, 32 * 32 + 1, 1, test);
  10.0k|    int c = bytes[0];
  10.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
  10.0k|    fread(bytes, 32 * 32, 1, test);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
  10.0k|    fread(bytes, 32 * 32, 1, test);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
  10.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
  10.0k|    ccv_array_push(tests, &categorized);
  10.0k|  }
      1|  fclose(train);
      1|  fclose(test);
      1|  if (!ccv_is_coverage())
      0|  {
      0|    int correct = train_cifar_10_fp16(35, categorizeds, 256, meanf, tests);
      0|    REQUIRE(correct > 9000, "accuracy %.2f after 35 epoch should be higher than 90%%", (float)correct / 10000);
      0|  } else
      1|    train_cifar_10_fp16(1, categorizeds, 256, meanf, tests);
      1|}
       |
       |static int train_cifar_10_fp16_dy(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
      1|{
      1|  ccv_cnnp_model_t* const cifar_10_0 = _cifar_10_dawn(0);
      1|  ccv_cnnp_model_t* const cifar_10 = ccv_cnnp_model_copy(cifar_10_0, 1);
      1|  ccv_cnnp_model_free(cifar_10_0);
      1|  const int device_count = 1;
      1|  const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32);
      1|  float learn_rate = 0.001;
      1|  ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / batch_size, 0.01, 0.9, 0.9), CMD_NOOP());
      1|  ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024);
      1|  int i, j, k;
      1|  ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set);
      1|  const ccv_cnnp_random_jitter_t random_jitter = {
      1|    .resize = {
      1|      .min = 32,
      1|      .max = 32,
      1|    },
      1|    .size = {
      1|      .rows = 32,
      1|      .cols = 32,
      1|    },
      1|    .symmetric = 1,
      1|    .normalize = {
      1|      .mean = {
      1|        mean[0], mean[1], mean[2],
      1|      },
      1|    },
      1|    .offset = {
      1|      .x = 4,
      1|      .y = 4,
      1|    },
      1|    .seed = 1,
      1|  };
      1|  const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0);
      1|  const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0);
      1|  ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3);
      1|  const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0);
      1|  const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
      1|  const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0);
      1|  const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0);
      1|  ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
      1|  ccv_cnnp_dataframe_shuffle(raw_train_data);
      1|  ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set);
      1|  const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0);
      1|  const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0);
      1|  const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0);
      1|  const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0);
      1|  ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW);
      1|  int train_device_columns[device_count * 2];
      1|  int test_device_columns[device_count * 2];
      2|  for (i = 0; i < device_count; i++)
      1|  {
      1|    int stream_type = CCV_STREAM_CONTEXT_GPU;
      1|    CCV_STREAM_SET_DEVICE_ID(stream_type, i);
      1|    train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0);
      1|    ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10);
      1|    CCV_TENSOR_SET_DEVICE_ID(params.type, i);
      1|    train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0);
      1|    test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0);
      1|    test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0);
      1|  }
      1|  ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2);
      1|  ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2);
      1|  ccv_nnc_stream_context_t* stream_contexts[2];
      1|  stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      1|  stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      1|  int p = 0, q = 1;
      1|  const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count);
      1|  int correct = 0;
      1|  int epoch = 0;
      1|  ccv_cnnp_model_set_data_parallel(cifar_10, device_count);
      1|  ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]);
      1|  ccv_nnc_tensor_t** input_fits[device_count * 2];
      1|  ccv_nnc_tensor_t* input_fit_inputs[device_count];
      1|  ccv_nnc_tensor_t* input_fit_fits[device_count];
      1|  ccv_nnc_tensor_t* outputs[device_count];
      1|  ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new();
    197|  for (i = 0; epoch < epoch_limit; i++)
    196|  {
       |    // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/
    196|    learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
    196|    learn_rate = ccv_max(learn_rate, 0.000001);
    196|    ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9);
    196|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0);
    196|    sgd.info.sgd.decay = 0;
    196|    ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0
    196|    ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]);
    196|    ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait the other context to finish, we use the same tensor_arena.
    392|    for (j = 0; j < device_count; j++)
    196|    {
    196|      input_fit_inputs[j] = input_fits[j][0];
    196|      input_fit_fits[j] = input_fits[j][1];
    196|      outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j];
    196|    }
    196|    ccv_nnc_stream_context_wait(stream_contexts[p]); // Need to wait the other context to finish, we use the same tensor_arena.
    196|    ccv_nnc_tensor_variable_t const input = ccv_nnc_tensor_variable_new(graph);
    196|    ccv_nnc_tensor_variable_set(graph, input, input_fit_inputs[0]);
    196|    ccv_nnc_tensor_variable_t const output = ccv_nnc_tensor_variable_new(graph);
    196|    ccv_nnc_tensor_variable_set(graph, output, outputs[0]);
    196|    ccv_nnc_dynamic_graph_evaluate(graph, cifar_10, 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(output), 0, stream_contexts[q]);
    196|    ccv_nnc_tensor_variable_t const fit = ccv_nnc_tensor_variable_new(graph);
    196|    ccv_nnc_tensor_variable_set(graph, fit, input_fit_fits[0]);
    196|    ccv_nnc_tensor_variable_t const softmax = ccv_nnc_tensor_variable_new(graph);
    196|    ccv_nnc_dynamic_graph_exec(graph, CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_VARIABLE_LIST(output, fit), TENSOR_VARIABLE_LIST(0, softmax), 0, stream_contexts[q]);
    196|    ccv_nnc_dynamic_graph_backward(graph, TENSOR_VARIABLE_LIST(softmax), 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(0), stream_contexts[q]);
    196|    ccv_nnc_dynamic_graph_apply_gradients(graph, sgd, TENSOR_VARIABLE_LIST(), TENSOR_VARIABLE_LIST(), 0, 0, stream_contexts[q]);
    196|    ccv_nnc_tensor_variable_free(graph, input);
    196|    ccv_nnc_tensor_variable_free(graph, output);
    196|    ccv_nnc_tensor_variable_free(graph, fit);
    196|    ccv_nnc_tensor_variable_free(graph, softmax);
       |    // Prefetch the next round.
    196|    ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]);
    196|    if ((i + 1) % epoch_end == 0)
      1|    {
      1|      ++epoch;
       |      // Reshuffle and reset cursor.
      1|      ccv_cnnp_dataframe_shuffle(raw_train_data);
      1|      ccv_cnnp_dataframe_iter_set_cursor(iter, 0);
      1|    }
    196|    int t;
    196|    CCV_SWAP(p, q, t);
    196|  }
      1|  ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0);
      1|  ccv_nnc_stream_context_wait(stream_contexts[p]);
      1|  ccv_nnc_stream_context_wait(stream_contexts[q]);
      1|  correct = 0;
      1|  ccv_nnc_tensor_t* cpu_outputs[device_count];
      1|  ccv_nnc_tensor_t* cpu_outputs_16f[device_count];
      2|  for (i = 0; i < device_count; i++)
      1|  {
      1|    cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0);
      1|    ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]);
      1|    cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0);
      1|  }
     41|  for (j = 0; j < test_set->rnum; j += batch_size * device_count)
     40|  {
     40|    ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0);
     80|    for (k = 0; k < device_count; k++)
     40|    {
     40|      input_fit_inputs[k] = input_fits[k][0];
     40|      outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k];
     40|    }
     40|    ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){
     40|      .is_test = 1
     40|    }, input_fit_inputs, device_count, outputs, device_count, 0, 0);
     40|    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0);
     40|    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0);
  10.0k|    for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
  10.0k|    {
  10.0k|      ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k);
  10.0k|      const int d = k / batch_size;
  10.0k|      const int b = k % batch_size;
  10.0k|      float max = -FLT_MAX;
  10.0k|      int t = -1;
  10.0k|      int fi;
   110k|      for (fi = 0; fi < 10; fi++)
   100k|        if (cpu_outputs[d]->data.f32[b * 10 + fi] > max)
  31.0k|          max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi;
  10.0k|      if (categorized->c == t)
  5.65k|        ++correct;
  10.0k|    }
     40|  }
      1|  ccv_cnnp_dataframe_iter_free(iter);
      1|  ccv_cnnp_dataframe_free(batch_train_data);
      1|  ccv_cnnp_dataframe_free(raw_train_data);
      1|  ccv_cnnp_dataframe_iter_free(test_iter);
      1|  ccv_cnnp_dataframe_free(batch_test_data);
      1|  ccv_cnnp_dataframe_free(raw_test_data);
      1|  ccv_cnnp_model_free(cifar_10);
      1|  ccv_nnc_dynamic_graph_free(graph);
      1|  ccv_nnc_stream_context_free(stream_contexts[0]);
      1|  ccv_nnc_stream_context_free(stream_contexts[1]);
      2|  for (i = 0; i < device_count; i++)
      1|  {
      1|    ccv_nnc_tensor_free(cpu_outputs[i]);
      1|    ccv_nnc_tensor_free(cpu_outputs_16f[i]);
      1|  }
      1|  return correct;
      1|}
       |
       |TEST_CASE("cifar-10 with dawnnet to > 65% after 10 epoch (fp16) use dynamic graph")
      1|{
      1|  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) &&
      1|      ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
      1|  FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb");
      1|  FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb");
      1|  if (!train || !test)
      0|  {
      0|    if (train)
      0|      fclose(train);
      0|    if (test)
      0|      fclose(test);
      0|    GUARD_ELSE_RETURN(0);
      0|  }
      1|  int i, j, k;
      1|  unsigned char bytes[32 * 32 + 1];
      1|  double mean[3] = {};
      1|  const int train_count = 50000;
      1|  const int test_count = 10000;
      1|  ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0);
  50.0k|  for (k = 0; k < train_count; k++)
  50.0k|  {
  50.0k|    fread(bytes, 32 * 32 + 1, 1, train);
  50.0k|    double per_mean[3] = {};
  50.0k|    int c = bytes[0];
  50.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.);
  50.0k|    fread(bytes, 32 * 32, 1, train);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.);
  50.0k|    fread(bytes, 32 * 32, 1, train);
  1.65M|    for (i = 0; i < 32; i++)
  52.8M|      for (j = 0; j < 32; j++)
  51.2M|        per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.);
  50.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
  50.0k|    ccv_array_push(categorizeds, &categorized);
  50.0k|    mean[0] += per_mean[0] / (32 * 32);
  50.0k|    mean[1] += per_mean[1] / (32 * 32);
  50.0k|    mean[2] += per_mean[2] / (32 * 32);
  50.0k|  }
      1|  float meanf[3];
      1|  meanf[0] = mean[0] / train_count;
      1|  meanf[1] = mean[1] / train_count;
      1|  meanf[2] = mean[2] / train_count;
      1|  ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0);
  10.0k|  for (k = 0; k < test_count; k++)
  10.0k|  {
  10.0k|    fread(bytes, 32 * 32 + 1, 1, test);
  10.0k|    int c = bytes[0];
  10.0k|    ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0];
  10.0k|    fread(bytes, 32 * 32, 1, test);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1];
  10.0k|    fread(bytes, 32 * 32, 1, test);
   330k|    for (i = 0; i < 32; i++)
  10.5M|      for (j = 0; j < 32; j++)
  10.2M|        a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2];
  10.0k|    ccv_categorized_t categorized = ccv_categorized(c, a, 0);
  10.0k|    ccv_array_push(tests, &categorized);
  10.0k|  }
      1|  fclose(train);
      1|  fclose(test);
      1|  if (!ccv_is_coverage())
      0|  {
      0|    int correct = train_cifar_10_fp16_dy(10, categorizeds, 256, meanf, tests);
      0|    REQUIRE(correct > 6500, "accuracy %.2f after 10 epoch should be higher than 65%%", (float)correct / 10000);
      0|  } else
      1|    train_cifar_10_fp16_dy(1, categorizeds, 256, meanf, tests);
      1|}
       |
       |#include "case_main.h"