/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/cifar.tests.c
Line | Count | Source |
1 | | #include "case.h" |
2 | | #include "ccv_case.h" |
3 | | #include "ccv_nnc_case.h" |
4 | | #include <ccv.h> |
5 | | #include <ccv_internal.h> |
6 | | #include <nnc/ccv_nnc.h> |
7 | | #include <nnc/ccv_nnc_easy.h> |
8 | | #include <3rdparty/dsfmt/dSFMT.h> |
9 | | |
10 | | TEST_SETUP() |
11 | | { |
12 | | ccv_nnc_init(); |
13 | | } |
14 | | |
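| | // _dawn_layer_new: one stage of the DAWN net. A 3x3 convolution (1px |
| | // padding) + batch norm + ReLU feeds a strides x strides max pool; when |
| | // residual is non-zero, two more conv-bn-relu blocks run on the pooled |
| | // output and are summed with it through an identity shortcut. |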
15 | | static ccv_cnnp_model_t* _dawn_layer_new(const int filters, const int strides, const int residual) |
16 | 9 | { |
17 | 9 | ccv_cnnp_model_io_t input = ccv_cnnp_input(); |
18 | 9 | ccv_cnnp_model_t* conv = ccv_cnnp_sequential_new(MODEL_LIST( |
19 | 9 | ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
20 | 9 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
21 | 9 | ccv_cnnp_relu(0) |
22 | 9 | ), 1, 0); |
23 | 9 | ccv_cnnp_model_io_t output = ccv_cnnp_model_apply(conv, MODEL_IO_LIST(input)); |
24 | 9 | ccv_cnnp_model_t* pool = ccv_cnnp_max_pool(DIM_ALLOC(strides, strides), HINT((strides, strides), (0, 0)), 0); |
25 | 9 | output = ccv_cnnp_model_apply(pool, MODEL_IO_LIST(output)); |
26 | 9 | if (residual) |
27 | 6 | { |
28 | 6 | ccv_cnnp_model_io_t shortcut = output; |
29 | 6 | ccv_cnnp_model_t* res1 = ccv_cnnp_sequential_new(MODEL_LIST( |
30 | 6 | ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
31 | 6 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
32 | 6 | ccv_cnnp_relu(0) |
33 | 6 | ), 1, 0); |
34 | 6 | output = ccv_cnnp_model_apply(res1, MODEL_IO_LIST(output)); |
35 | 6 | ccv_cnnp_model_t* res2 = ccv_cnnp_sequential_new(MODEL_LIST( |
36 | 6 | ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
37 | 6 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
38 | 6 | ccv_cnnp_relu(0) |
39 | 6 | ), 1, 0); |
40 | 6 | output = ccv_cnnp_model_apply(res2, MODEL_IO_LIST(output)); |
41 | 6 | ccv_cnnp_model_t* const add = ccv_cnnp_sum(0); |
42 | 6 | output = ccv_cnnp_model_apply(add, MODEL_IO_LIST(output, shortcut)); |
43 | 6 | } |
44 | 9 | return ccv_cnnp_model_new(MODEL_IO_LIST(input), MODEL_IO_LIST(output), 1, 0); |
45 | 9 | } |
46 | | |
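| | // _cifar_10_dawn: a 64-filter prep block followed by three stages at |
| | // 128/256/512 filters (residual on the first and third), then global max |
| | // pool, flatten, and a 10-way dense layer. The softmax flag picks between |
| | // an explicit softmax head (static-graph tests) and raw logits (the |
| | // dynamic-graph test applies softmax cross-entropy itself). This appears |
| | // to follow the Myrtle.ai DAWNBench CIFAR-10 recipe linked further down. |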
47 | | static ccv_cnnp_model_t* _cifar_10_dawn(const int softmax) |
48 | 3 | { |
49 | 3 | ccv_cnnp_model_t* prep = ccv_cnnp_sequential_new(MODEL_LIST( |
50 | 3 | ccv_cnnp_convolution(1, 64, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
51 | 3 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
52 | 3 | ccv_cnnp_relu(0) |
53 | 3 | ), 1, 0); |
54 | 3 | ccv_cnnp_model_t* layer1 = _dawn_layer_new(128, 2, 1); |
55 | 3 | ccv_cnnp_model_t* layer2 = _dawn_layer_new(256, 2, 0); |
56 | 3 | ccv_cnnp_model_t* layer3 = _dawn_layer_new(512, 2, 1); |
57 | 3 | if (softmax) |
58 | 2 | { |
59 | 2 | return ccv_cnnp_sequential_new(MODEL_LIST( |
60 | 2 | prep, |
61 | 2 | layer1, |
62 | 2 | layer2, |
63 | 2 | layer3, |
64 | 2 | ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0), |
65 | 2 | ccv_cnnp_flatten(0), |
66 | 2 | ccv_cnnp_dense(10, 0, 0, 1, 0), |
67 | 2 | ccv_cnnp_softmax(0) |
68 | 2 | ), 1, 0); |
69 | 2 | } else { |
70 | 1 | return ccv_cnnp_sequential_new(MODEL_LIST( |
71 | 1 | prep, |
72 | 1 | layer1, |
73 | 1 | layer2, |
74 | 1 | layer3, |
75 | 1 | ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0), |
76 | 1 | ccv_cnnp_flatten(0), |
77 | 1 | ccv_cnnp_dense(10, 0, 0, 1, 0) |
78 | 1 | ), 1, 0); |
79 | 1 | } |
80 | 3 | } |
81 | | |
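| | // train_cifar_10: fp32 training harness. Trains data-parallel across all |
| | // visible GPUs, double-buffering two stream contexts (p/q) so the next |
| | // batch is prefetched while the current one runs, then scores the test |
| | // set and returns the number of correctly classified images (-1 if no |
| | // GPU is available). |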
82 | | static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set) |
83 | 1 | { |
84 | 1 | ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1); |
85 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
86 | 1 | if (device_count < 1) |
87 | 0 | return -1; |
88 | 1 | const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 32F, batch_size, 3, 32, 32); |
89 | 1 | float learn_rate = 0.001; |
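| | // Assumed CMD_SGD_FORWARD argument order: nesterov, rate, scale, decay, |
| | // momentum, dampening; the 1. / (batch_size * device_count) scale |
| | // averages gradients over the effective (global) batch. |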
90 | 1 | ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD()); |
91 | 1 | ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024); |
92 | 1 | int i, j, k; |
93 | 1 | ccv_nnc_tensor_t* cpu_outputs[device_count]; |
94 | 5 | for (i = 0; i < device_count; i++) |
95 | 4 | { |
96 | 4 | cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0); |
97 | 4 | ccv_nnc_tensor_pin_memory(cpu_outputs[i]); |
98 | 4 | } |
99 | 1 | ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set); |
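| | // Augmentation below: 32x32 resize with up to 4px x/y offsets |
| | // (pad-and-crop style), random horizontal flips (symmetric), per-channel |
| | // mean subtraction, and a fixed seed for reproducibility. |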
100 | 1 | const ccv_cnnp_random_jitter_t random_jitter = { |
101 | 1 | .resize = { |
102 | 1 | .min = 32, |
103 | 1 | .max = 32, |
104 | 1 | }, |
105 | 1 | .size = { |
106 | 1 | .rows = 32, |
107 | 1 | .cols = 32, |
108 | 1 | }, |
109 | 1 | .symmetric = 1, |
110 | 1 | .normalize = { |
111 | 1 | .mean = { |
112 | 1 | mean[0], mean[1], mean[2], |
113 | 1 | }, |
114 | 1 | }, |
115 | 1 | .offset = { |
116 | 1 | .x = 4, |
117 | 1 | .y = 4, |
118 | 1 | }, |
119 | 1 | .seed = 1, |
120 | 1 | }; |
121 | 1 | const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
122 | 1 | const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0); |
123 | 1 | const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_32F, CCV_TENSOR_FORMAT_NCHW, 0); |
124 | 1 | ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
125 | 1 | ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set); |
126 | 1 | const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
127 | 1 | ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
128 | 1 | int train_device_columns[device_count * 2]; |
129 | 1 | int test_device_columns[device_count * 2]; |
130 | 5 | for (i = 0; i < device_count; i++) |
131 | 4 | { |
132 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
133 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
134 | 4 | train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0); |
135 | 4 | ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 32F, batch_size, 10); |
136 | 4 | CCV_TENSOR_SET_DEVICE_ID(params.type, i); |
137 | 4 | train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0); |
138 | 4 | test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0); |
139 | 4 | test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0); |
140 | 4 | } |
141 | 1 | ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2); |
142 | 1 | ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2); |
143 | 1 | ccv_nnc_stream_context_t* stream_contexts[2]; |
144 | 1 | stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
145 | 1 | stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
146 | 1 | int p = 0, q = 1; |
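| | // Minibatches per epoch: training set size divided by the effective |
| | // batch (batch_size * device_count), rounded up. |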
147 | 1 | const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count); |
148 | 1 | int correct = 0; |
149 | 1 | int epoch = 0; |
150 | 1 | ccv_cnnp_model_set_data_parallel(cifar_10, device_count); |
151 | 1 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]); |
152 | 1 | ccv_nnc_tensor_t** input_fits[device_count * 2]; |
153 | 1 | ccv_nnc_tensor_t* input_fit_inputs[device_count]; |
154 | 1 | ccv_nnc_tensor_t* input_fit_fits[device_count]; |
155 | 1 | ccv_nnc_tensor_t* outputs[device_count]; |
156 | 50 | for (i = 0; epoch < epoch_limit; i++) |
157 | 49 | { |
158 | | // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/ |
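| | // Ramps from ~0 to 0.4 over the first 10 epochs, then decays linearly |
| | // toward 0 at epoch 35; floored at 1e-6 below. |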
159 | 49 | learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end); |
160 | 49 | learn_rate = ccv_max(learn_rate, 0.000001); |
161 | 49 | ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9); |
162 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0); |
163 | 49 | sgd.info.sgd.decay = 0; |
164 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0 |
165 | 49 | ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]); |
166 | 49 | ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait for the other context to finish since we use the same tensor_arena. |
167 | 245 | for (j = 0; j < device_count; j++) |
168 | 196 | { |
169 | 196 | input_fit_inputs[j] = input_fits[j][0]; |
170 | 196 | input_fit_fits[j] = input_fits[j][1]; |
171 | 196 | outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j]; |
172 | 196 | } |
173 | 49 | ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]); |
174 | | // Prefetch the next round. |
175 | 49 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]); |
176 | 49 | if ((i + 1) % epoch_end == 0) |
177 | 1 | { |
178 | 1 | ++epoch; |
179 | | // Reshuffle and reset cursor. |
180 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
181 | 1 | ccv_cnnp_dataframe_iter_set_cursor(iter, 0); |
182 | 1 | } |
183 | 49 | int t; |
184 | 49 | CCV_SWAP(p, q, t); |
185 | 49 | } |
186 | 1 | ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0); |
187 | 1 | ccv_nnc_stream_context_wait(stream_contexts[p]); |
188 | 1 | ccv_nnc_stream_context_wait(stream_contexts[q]); |
189 | 1 | correct = 0; |
190 | 1 | p = 0, q = 1; |
191 | 11 | for (j = 0; j < test_set->rnum; j += batch_size * device_count) |
192 | 10 | { |
193 | 10 | ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0); |
194 | 50 | for (k = 0; k < device_count; k++) |
195 | 40 | { |
196 | 40 | input_fit_inputs[k] = input_fits[k][0]; |
197 | 40 | outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k]; |
198 | 40 | } |
199 | 10 | ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){ |
200 | 10 | .is_test = 1 |
201 | 10 | }, input_fit_inputs, device_count, outputs, device_count, 0, 0); |
202 | 10 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs, device_count, 0); |
203 | 10.0k | for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++) |
204 | 10.0k | { |
205 | 10.0k | ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k); |
206 | 10.0k | const int d = k / batch_size; |
207 | 10.0k | const int b = k % batch_size; |
208 | 10.0k | float max = -FLT_MAX; |
209 | 10.0k | int t = -1; |
210 | 10.0k | int fi; |
211 | 110k | for (fi = 0; fi < 10; fi++) |
212 | 100k | if (cpu_outputs[d]->data.f32[b * 10 + fi] > max) |
213 | 29.4k | max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi; |
214 | 10.0k | if (categorized->c == t) |
215 | 4.51k | ++correct; |
216 | 10.0k | } |
217 | 10 | } |
218 | 1 | ccv_cnnp_dataframe_iter_free(iter); |
219 | 1 | ccv_cnnp_dataframe_free(batch_train_data); |
220 | 1 | ccv_cnnp_dataframe_free(raw_train_data); |
221 | 1 | ccv_cnnp_dataframe_iter_free(test_iter); |
222 | 1 | ccv_cnnp_dataframe_free(batch_test_data); |
223 | 1 | ccv_cnnp_dataframe_free(raw_test_data); |
224 | 1 | ccv_cnnp_model_free(cifar_10); |
225 | 1 | ccv_nnc_stream_context_free(stream_contexts[0]); |
226 | 1 | ccv_nnc_stream_context_free(stream_contexts[1]); |
227 | 5 | for (i = 0; i < device_count; i++) |
228 | 4 | ccv_nnc_tensor_free(cpu_outputs[i]); |
229 | 1 | return correct; |
230 | 1 | } |
231 | | |
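| | // CIFAR-10 binary format: each record is 1 label byte followed by three |
| | // 1024-byte channel planes (R, G, B). The loader scales pixels by |
| | // 2 / 255 and accumulates per-channel means; test images are |
| | // mean-centered here, while training images are centered later by the |
| | // random-jitter normalizer. |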
232 | | TEST_CASE("cifar-10 with dawnnet to > 90% under 3 minutes") |
233 | 1 | { |
234 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) && |
235 | 1 | ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
236 | 1 | FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb"); |
237 | 1 | FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb"); |
238 | 1 | if (!train || !test) |
239 | 0 | { |
240 | 0 | if (train) |
241 | 0 | fclose(train); |
242 | 0 | if (test) |
243 | 0 | fclose(test); |
244 | 0 | GUARD_ELSE_RETURN(0); |
245 | 0 | } |
246 | 1 | int i, j, k; |
247 | 1 | unsigned char bytes[32 * 32 + 1]; |
248 | 1 | double mean[3] = {}; |
249 | 1 | const int train_count = 50000; |
250 | 1 | const int test_count = 10000; |
251 | 1 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0); |
252 | 50.0k | for (k = 0; k < train_count; k++) |
253 | 50.0k | { |
254 | 50.0k | fread(bytes, 32 * 32 + 1, 1, train); |
255 | 50.0k | double per_mean[3] = {}; |
256 | 50.0k | int c = bytes[0]; |
257 | 50.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
258 | 1.65M | for (i = 0; i < 32; i++) |
259 | 52.8M | for (j = 0; j < 32; j++) |
260 | 51.2M | per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.); |
261 | 50.0k | fread(bytes, 32 * 32, 1, train); |
262 | 1.65M | for (i = 0; i < 32; i++) |
263 | 52.8M | for (j = 0; j < 32; j++) |
264 | 51.2M | per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.); |
265 | 50.0k | fread(bytes, 32 * 32, 1, train); |
266 | 1.65M | for (i = 0; i < 32; i++) |
267 | 52.8M | for (j = 0; j < 32; j++) |
268 | 51.2M | per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.); |
269 | 50.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
270 | 50.0k | ccv_array_push(categorizeds, &categorized); |
271 | 50.0k | mean[0] += per_mean[0] / (32 * 32); |
272 | 50.0k | mean[1] += per_mean[1] / (32 * 32); |
273 | 50.0k | mean[2] += per_mean[2] / (32 * 32); |
274 | 50.0k | } |
275 | 1 | float meanf[3]; |
276 | 1 | meanf[0] = mean[0] / train_count; |
277 | 1 | meanf[1] = mean[1] / train_count; |
278 | 1 | meanf[2] = mean[2] / train_count; |
279 | 1 | ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0); |
280 | 10.0k | for (k = 0; k < test_count; k++) |
281 | 10.0k | { |
282 | 10.0k | fread(bytes, 32 * 32 + 1, 1, test); |
283 | 10.0k | int c = bytes[0]; |
284 | 10.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
285 | 330k | for (i = 0; i < 32; i++) |
286 | 10.5M | for (j = 0; j < 32; j++) |
287 | 10.2M | a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0]; |
288 | 10.0k | fread(bytes, 32 * 32, 1, test); |
289 | 330k | for (i = 0; i < 32; i++) |
290 | 10.5M | for (j = 0; j < 32; j++) |
291 | 10.2M | a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1]; |
292 | 10.0k | fread(bytes, 32 * 32, 1, test); |
293 | 330k | for (i = 0; i < 32; i++) |
294 | 10.5M | for (j = 0; j < 32; j++) |
295 | 10.2M | a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2]; |
296 | 10.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
297 | 10.0k | ccv_array_push(tests, &categorized); |
298 | 10.0k | } |
299 | 1 | fclose(train); |
300 | 1 | fclose(test); |
301 | 1 | if (!ccv_is_coverage()) |
302 | 0 | { |
303 | 0 | int correct = train_cifar_10(35, categorizeds, 256, meanf, tests); |
304 | 0 | REQUIRE(correct > 9000, "accuracy %.2f after 35 epochs should be higher than 90%%", (float)correct / 10000); |
305 | 0 | } else |
306 | 1 | train_cifar_10(1, categorizeds, 256, meanf, tests); |
307 | 1 | } |
308 | | |
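| | // train_cifar_10_fp16: same pipeline as train_cifar_10, but in half |
| | // precision: jittered images and one-hot labels are converted to 16F via |
| | // CMD_DATATYPE_CONVERSION_FORWARD before batching, the model is compiled |
| | // against a 16F input, and GPU outputs are copied to pinned 16F CPU |
| | // tensors then converted back to 32F before the argmax accuracy count. |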
309 | | static int train_cifar_10_fp16(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set) |
310 | 1 | { |
311 | 1 | ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1); |
312 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
313 | 1 | if (device_count < 1) |
314 | 0 | return -1; |
315 | 1 | const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32); |
316 | 1 | float learn_rate = 0.001; |
317 | 1 | ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD()); |
318 | 1 | ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024); |
319 | 1 | int i, j, k; |
320 | 1 | ccv_nnc_tensor_t* cpu_outputs[device_count]; |
321 | 1 | ccv_nnc_tensor_t* cpu_outputs_16f[device_count]; |
322 | 5 | for (i = 0; i < device_count; i++) |
323 | 4 | { |
324 | 4 | cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0); |
325 | 4 | ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]); |
326 | 4 | cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0); |
327 | 4 | } |
328 | 1 | ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set); |
329 | 1 | const ccv_cnnp_random_jitter_t random_jitter = { |
330 | 1 | .resize = { |
331 | 1 | .min = 32, |
332 | 1 | .max = 32, |
333 | 1 | }, |
334 | 1 | .size = { |
335 | 1 | .rows = 32, |
336 | 1 | .cols = 32, |
337 | 1 | }, |
338 | 1 | .symmetric = 1, |
339 | 1 | .normalize = { |
340 | 1 | .mean = { |
341 | 1 | mean[0], mean[1], mean[2], |
342 | 1 | }, |
343 | 1 | }, |
344 | 1 | .offset = { |
345 | 1 | .x = 4, |
346 | 1 | .y = 4, |
347 | 1 | }, |
348 | 1 | .seed = 1, |
349 | 1 | }; |
350 | 1 | const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
351 | 1 | const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0); |
352 | 1 | ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3); |
353 | 1 | const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0); |
354 | 1 | const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
355 | 1 | const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0); |
356 | 1 | const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0); |
357 | 1 | ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
358 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
359 | 1 | ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set); |
360 | 1 | const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
361 | 1 | const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0); |
362 | 1 | const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
363 | 1 | const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0); |
364 | 1 | ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
365 | 1 | int train_device_columns[device_count * 2]; |
366 | 1 | int test_device_columns[device_count * 2]; |
367 | 5 | for (i = 0; i < device_count; i++) |
368 | 4 | { |
369 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
370 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
371 | 4 | train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0); |
372 | 4 | ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10); |
373 | 4 | CCV_TENSOR_SET_DEVICE_ID(params.type, i); |
374 | 4 | train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0); |
375 | 4 | test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0); |
376 | 4 | test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0); |
377 | 4 | } |
378 | 1 | ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2); |
379 | 1 | ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2); |
380 | 1 | ccv_nnc_stream_context_t* stream_contexts[2]; |
381 | 1 | stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
382 | 1 | stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
383 | 1 | int p = 0, q = 1; |
384 | 1 | const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count); |
385 | 1 | int correct = 0; |
386 | 1 | int epoch = 0; |
387 | 1 | ccv_cnnp_model_set_data_parallel(cifar_10, device_count); |
388 | 1 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]); |
389 | 1 | ccv_nnc_tensor_t** input_fits[device_count * 2]; |
390 | 1 | ccv_nnc_tensor_t* input_fit_inputs[device_count]; |
391 | 1 | ccv_nnc_tensor_t* input_fit_fits[device_count]; |
392 | 1 | ccv_nnc_tensor_t* outputs[device_count]; |
393 | 50 | for (i = 0; epoch < epoch_limit; i++) |
394 | 49 | { |
395 | | // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/ |
396 | 49 | learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end); |
397 | 49 | learn_rate = ccv_max(learn_rate, 0.000001); |
398 | 49 | ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9); |
399 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0); |
400 | 49 | sgd.info.sgd.decay = 0; |
401 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0 |
402 | 49 | ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]); |
403 | 49 | ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait for the other context to finish since we use the same tensor_arena. |
404 | 245 | for (j = 0; j < device_count; j++) |
405 | 196 | { |
406 | 196 | input_fit_inputs[j] = input_fits[j][0]; |
407 | 196 | input_fit_fits[j] = input_fits[j][1]; |
408 | 196 | outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j]; |
409 | 196 | } |
410 | 49 | ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]); |
411 | | // Prefetch the next round. |
412 | 49 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]); |
413 | 49 | if ((i + 1) % epoch_end == 0) |
414 | 1 | { |
415 | 1 | ++epoch; |
416 | | // Reshuffle and reset cursor. |
417 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
418 | 1 | ccv_cnnp_dataframe_iter_set_cursor(iter, 0); |
419 | 1 | } |
420 | 49 | int t; |
421 | 49 | CCV_SWAP(p, q, t); |
422 | 49 | } |
423 | 1 | ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0); |
424 | 1 | ccv_nnc_stream_context_wait(stream_contexts[p]); |
425 | 1 | ccv_nnc_stream_context_wait(stream_contexts[q]); |
426 | 1 | correct = 0; |
427 | 1 | p = 0, q = 1; |
428 | 11 | for (j = 0; j < test_set->rnum; j += batch_size * device_count) |
429 | 10 | { |
430 | 10 | ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0); |
431 | 50 | for (k = 0; k < device_count; k++) |
432 | 40 | { |
433 | 40 | input_fit_inputs[k] = input_fits[k][0]; |
434 | 40 | outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k]; |
435 | 40 | } |
436 | 10 | ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){ |
437 | 10 | .is_test = 1 |
438 | 10 | }, input_fit_inputs, device_count, outputs, device_count, 0, 0); |
439 | 10 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0); |
440 | 10 | ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0); |
441 | 10.0k | for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++) |
442 | 10.0k | { |
443 | 10.0k | ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k); |
444 | 10.0k | const int d = k / batch_size; |
445 | 10.0k | const int b = k % batch_size; |
446 | 10.0k | float max = -FLT_MAX; |
447 | 10.0k | int t = -1; |
448 | 10.0k | int fi; |
449 | 110k | for (fi = 0; fi < 10; fi++) |
450 | 100k | if (cpu_outputs[d]->data.f32[b * 10 + fi] > max) |
451 | 30.7k | max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi; |
452 | 10.0k | if (categorized->c == t) |
453 | 4.58k | ++correct; |
454 | 10.0k | } |
455 | 10 | } |
456 | 1 | ccv_cnnp_dataframe_iter_free(iter); |
457 | 1 | ccv_cnnp_dataframe_free(batch_train_data); |
458 | 1 | ccv_cnnp_dataframe_free(raw_train_data); |
459 | 1 | ccv_cnnp_dataframe_iter_free(test_iter); |
460 | 1 | ccv_cnnp_dataframe_free(batch_test_data); |
461 | 1 | ccv_cnnp_dataframe_free(raw_test_data); |
462 | 1 | ccv_cnnp_model_free(cifar_10); |
463 | 1 | ccv_nnc_stream_context_free(stream_contexts[0]); |
464 | 1 | ccv_nnc_stream_context_free(stream_contexts[1]); |
465 | 5 | for (i = 0; i < device_count; i++) |
466 | 4 | { |
467 | 4 | ccv_nnc_tensor_free(cpu_outputs[i]); |
468 | 4 | ccv_nnc_tensor_free(cpu_outputs_16f[i]); |
469 | 4 | } |
470 | 1 | return correct; |
471 | 1 | } |
472 | | |
473 | | TEST_CASE("cifar-10 with dawnnet to > 90% under 1 minute (fp16)") |
474 | 1 | { |
475 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) && |
476 | 1 | ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
477 | 1 | FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb"); |
478 | 1 | FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb"); |
479 | 1 | if (!train || !test) |
480 | 0 | { |
481 | 0 | if (train) |
482 | 0 | fclose(train); |
483 | 0 | if (test) |
484 | 0 | fclose(test); |
485 | 0 | GUARD_ELSE_RETURN(0); |
486 | 0 | } |
487 | 1 | int i, j, k; |
488 | 1 | unsigned char bytes[32 * 32 + 1]; |
489 | 1 | double mean[3] = {}; |
490 | 1 | const int train_count = 50000; |
491 | 1 | const int test_count = 10000; |
492 | 1 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0); |
493 | 50.0k | for (k = 0; k < train_count; k++) |
494 | 50.0k | { |
495 | 50.0k | fread(bytes, 32 * 32 + 1, 1, train); |
496 | 50.0k | double per_mean[3] = {}; |
497 | 50.0k | int c = bytes[0]; |
498 | 50.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
499 | 1.65M | for (i = 0; i < 32; i++) |
500 | 52.8M | for (j = 0; j < 32; j++) |
501 | 51.2M | per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.); |
502 | 50.0k | fread(bytes, 32 * 32, 1, train); |
503 | 1.65M | for (i = 0; i < 32; i++) |
504 | 52.8M | for (j = 0; j < 32; j++) |
505 | 51.2M | per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.); |
506 | 50.0k | fread(bytes, 32 * 32, 1, train); |
507 | 1.65M | for (i = 0; i < 32; i++) |
508 | 52.8M | for (j = 0; j < 32; j++) |
509 | 51.2M | per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.); |
510 | 50.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
511 | 50.0k | ccv_array_push(categorizeds, &categorized); |
512 | 50.0k | mean[0] += per_mean[0] / (32 * 32); |
513 | 50.0k | mean[1] += per_mean[1] / (32 * 32); |
514 | 50.0k | mean[2] += per_mean[2] / (32 * 32); |
515 | 50.0k | } |
516 | 1 | float meanf[3]; |
517 | 1 | meanf[0] = mean[0] / train_count; |
518 | 1 | meanf[1] = mean[1] / train_count; |
519 | 1 | meanf[2] = mean[2] / train_count; |
520 | 1 | ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0); |
521 | 10.0k | for (k = 0; k < test_count; k++) |
522 | 10.0k | { |
523 | 10.0k | fread(bytes, 32 * 32 + 1, 1, test); |
524 | 10.0k | int c = bytes[0]; |
525 | 10.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
526 | 330k | for (i = 0; i < 32; i++) |
527 | 10.5M | for (j = 0; j < 32; j++) |
528 | 10.2M | a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0]; |
529 | 10.0k | fread(bytes, 32 * 32, 1, test); |
530 | 330k | for (i = 0; i < 32; i++) |
531 | 10.5M | for (j = 0; j < 32; j++) |
532 | 10.2M | a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1]; |
533 | 10.0k | fread(bytes, 32 * 32, 1, test); |
534 | 330k | for (i = 0; i < 32; i++) |
535 | 10.5M | for (j = 0; j < 32; j++) |
536 | 10.2M | a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2]; |
537 | 10.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
538 | 10.0k | ccv_array_push(tests, &categorized); |
539 | 10.0k | } |
540 | 1 | fclose(train); |
541 | 1 | fclose(test); |
542 | 1 | if (!ccv_is_coverage()) |
543 | 0 | { |
544 | 0 | int correct = train_cifar_10_fp16(35, categorizeds, 256, meanf, tests); |
545 | 0 | REQUIRE(correct > 9000, "accuracy %.2f after 35 epochs should be higher than 90%%", (float)correct / 10000); |
546 | 0 | } else |
547 | 1 | train_cifar_10_fp16(1, categorizeds, 256, meanf, tests); |
548 | 1 | } |
549 | | |
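| | // train_cifar_10_fp16_dy: dynamic-graph variant on a single device. The |
| | // model is built without the trailing softmax and compiled with CMD_NOOP() |
| | // as the loss; each step wraps the batch in tensor variables, evaluates |
| | // the model with ccv_nnc_dynamic_graph_evaluate, computes the loss with |
| | // CMD_SOFTMAX_CROSSENTROPY_FORWARD, backpropagates through |
| | // ccv_nnc_dynamic_graph_backward, and updates parameters with |
| | // ccv_nnc_dynamic_graph_apply_gradients. |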
550 | | static int train_cifar_10_fp16_dy(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set) |
551 | 1 | { |
552 | 1 | ccv_cnnp_model_t* const cifar_10_0 = _cifar_10_dawn(0); |
553 | 1 | ccv_cnnp_model_t* const cifar_10 = ccv_cnnp_model_copy(cifar_10_0, 1); |
554 | 1 | ccv_cnnp_model_free(cifar_10_0); |
555 | 1 | const int device_count = 1; |
556 | 1 | const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32); |
557 | 1 | float learn_rate = 0.001; |
558 | 1 | ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / batch_size, 0.01, 0.9, 0.9), CMD_NOOP()); |
559 | 1 | ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024); |
560 | 1 | int i, j, k; |
561 | 1 | ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set); |
562 | 1 | const ccv_cnnp_random_jitter_t random_jitter = { |
563 | 1 | .resize = { |
564 | 1 | .min = 32, |
565 | 1 | .max = 32, |
566 | 1 | }, |
567 | 1 | .size = { |
568 | 1 | .rows = 32, |
569 | 1 | .cols = 32, |
570 | 1 | }, |
571 | 1 | .symmetric = 1, |
572 | 1 | .normalize = { |
573 | 1 | .mean = { |
574 | 1 | mean[0], mean[1], mean[2], |
575 | 1 | }, |
576 | 1 | }, |
577 | 1 | .offset = { |
578 | 1 | .x = 4, |
579 | 1 | .y = 4, |
580 | 1 | }, |
581 | 1 | .seed = 1, |
582 | 1 | }; |
583 | 1 | const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
584 | 1 | const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0); |
585 | 1 | ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3); |
586 | 1 | const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0); |
587 | 1 | const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
588 | 1 | const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0); |
589 | 1 | const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0); |
590 | 1 | ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
591 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
592 | 1 | ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set); |
593 | 1 | const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
594 | 1 | const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0); |
595 | 1 | const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
596 | 1 | const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0); |
597 | 1 | ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
598 | 1 | int train_device_columns[device_count * 2]; |
599 | 1 | int test_device_columns[device_count * 2]; |
600 | 2 | for (i = 0; i < device_count; i++) |
601 | 1 | { |
602 | 1 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
603 | 1 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
604 | 1 | train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0); |
605 | 1 | ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10); |
606 | 1 | CCV_TENSOR_SET_DEVICE_ID(params.type, i); |
607 | 1 | train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0); |
608 | 1 | test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0); |
609 | 1 | test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0); |
610 | 1 | } |
611 | 1 | ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2); |
612 | 1 | ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2); |
613 | 1 | ccv_nnc_stream_context_t* stream_contexts[2]; |
614 | 1 | stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
615 | 1 | stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
616 | 1 | int p = 0, q = 1; |
617 | 1 | const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count); |
618 | 1 | int correct = 0; |
619 | 1 | int epoch = 0; |
620 | 1 | ccv_cnnp_model_set_data_parallel(cifar_10, device_count); |
621 | 1 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]); |
622 | 1 | ccv_nnc_tensor_t** input_fits[device_count * 2]; |
623 | 1 | ccv_nnc_tensor_t* input_fit_inputs[device_count]; |
624 | 1 | ccv_nnc_tensor_t* input_fit_fits[device_count]; |
625 | 1 | ccv_nnc_tensor_t* outputs[device_count]; |
626 | 1 | ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new(); |
627 | 197 | for (i = 0; epoch < epoch_limit; i++) |
628 | 196 | { |
629 | | // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/ |
630 | 196 | learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end); |
631 | 196 | learn_rate = ccv_max(learn_rate, 0.000001); |
632 | 196 | ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9); |
633 | 196 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0); |
634 | 196 | sgd.info.sgd.decay = 0; |
635 | 196 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Set bias (the second parameter) to decay 0 |
636 | 196 | ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]); |
637 | 196 | ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait for the other context to finish since we use the same tensor_arena. |
638 | 392 | for (j = 0; j < device_count; j++) |
639 | 196 | { |
640 | 196 | input_fit_inputs[j] = input_fits[j][0]; |
641 | 196 | input_fit_fits[j] = input_fits[j][1]; |
642 | 196 | outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j]; |
643 | 196 | } |
644 | 196 | ccv_nnc_stream_context_wait(stream_contexts[p]); // Need to wait for the other context to finish since we use the same tensor_arena. |
645 | 196 | ccv_nnc_tensor_variable_t const input = ccv_nnc_tensor_variable_new(graph); |
646 | 196 | ccv_nnc_tensor_variable_set(graph, input, input_fit_inputs[0]); |
647 | 196 | ccv_nnc_tensor_variable_t const output = ccv_nnc_tensor_variable_new(graph); |
648 | 196 | ccv_nnc_tensor_variable_set(graph, output, outputs[0]); |
649 | 196 | ccv_nnc_dynamic_graph_evaluate(graph, cifar_10, 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(output), 0, stream_contexts[q]); |
650 | 196 | ccv_nnc_tensor_variable_t const fit = ccv_nnc_tensor_variable_new(graph); |
651 | 196 | ccv_nnc_tensor_variable_set(graph, fit, input_fit_fits[0]); |
652 | 196 | ccv_nnc_tensor_variable_t const softmax = ccv_nnc_tensor_variable_new(graph); |
653 | 196 | ccv_nnc_dynamic_graph_exec(graph, CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_VARIABLE_LIST(output, fit), TENSOR_VARIABLE_LIST(0, softmax), 0, stream_contexts[q]); |
654 | 196 | ccv_nnc_dynamic_graph_backward(graph, TENSOR_VARIABLE_LIST(softmax), 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(0), stream_contexts[q]); |
655 | 196 | ccv_nnc_dynamic_graph_apply_gradients(graph, sgd, TENSOR_VARIABLE_LIST(), TENSOR_VARIABLE_LIST(), 0, 0, stream_contexts[q]); |
656 | 196 | ccv_nnc_tensor_variable_free(graph, input); |
657 | 196 | ccv_nnc_tensor_variable_free(graph, output); |
658 | 196 | ccv_nnc_tensor_variable_free(graph, fit); |
659 | 196 | ccv_nnc_tensor_variable_free(graph, softmax); |
660 | | // Prefetch the next round. |
661 | 196 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]); |
662 | 196 | if ((i + 1) % epoch_end == 0) |
663 | 1 | { |
664 | 1 | ++epoch; |
665 | | // Reshuffle and reset cursor. |
666 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
667 | 1 | ccv_cnnp_dataframe_iter_set_cursor(iter, 0); |
668 | 1 | } |
669 | 196 | int t; |
670 | 196 | CCV_SWAP(p, q, t); |
671 | 196 | } |
672 | 1 | ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0); |
673 | 1 | ccv_nnc_stream_context_wait(stream_contexts[p]); |
674 | 1 | ccv_nnc_stream_context_wait(stream_contexts[q]); |
675 | 1 | correct = 0; |
676 | 1 | ccv_nnc_tensor_t* cpu_outputs[device_count]; |
677 | 1 | ccv_nnc_tensor_t* cpu_outputs_16f[device_count]; |
678 | 2 | for (i = 0; i < device_count; i++) |
679 | 1 | { |
680 | 1 | cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0); |
681 | 1 | ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]); |
682 | 1 | cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0); |
683 | 1 | } |
684 | 41 | for (j = 0; j < test_set->rnum; j += batch_size * device_count) |
685 | 40 | { |
686 | 40 | ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0); |
687 | 80 | for (k = 0; k < device_count; k++) |
688 | 40 | { |
689 | 40 | input_fit_inputs[k] = input_fits[k][0]; |
690 | 40 | outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k]; |
691 | 40 | } |
692 | 40 | ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){ |
693 | 40 | .is_test = 1 |
694 | 40 | }, input_fit_inputs, device_count, outputs, device_count, 0, 0); |
695 | 40 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0); |
696 | 40 | ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0); |
697 | 10.0k | for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++10.0k ) |
698 | 10.0k | { |
699 | 10.0k | ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k); |
700 | 10.0k | const int d = k / batch_size; |
701 | 10.0k | const int b = k % batch_size; |
702 | 10.0k | float max = -FLT_MAX; |
703 | 10.0k | int t = -1; |
704 | 10.0k | int fi; |
705 | 110k | for (fi = 0; fi < 10; fi++) |
706 | 100k | if (cpu_outputs[d]->data.f32[b * 10 + fi] > max) |
707 | 30.8k | max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi; |
708 | 10.0k | if (categorized->c == t) |
709 | 5.64k | ++correct; |
710 | 10.0k | } |
711 | 40 | } |
712 | 1 | ccv_cnnp_dataframe_iter_free(iter); |
713 | 1 | ccv_cnnp_dataframe_free(batch_train_data); |
714 | 1 | ccv_cnnp_dataframe_free(raw_train_data); |
715 | 1 | ccv_cnnp_dataframe_iter_free(test_iter); |
716 | 1 | ccv_cnnp_dataframe_free(batch_test_data); |
717 | 1 | ccv_cnnp_dataframe_free(raw_test_data); |
718 | 1 | ccv_cnnp_model_free(cifar_10); |
719 | 1 | ccv_nnc_dynamic_graph_free(graph); |
720 | 1 | ccv_nnc_stream_context_free(stream_contexts[0]); |
721 | 1 | ccv_nnc_stream_context_free(stream_contexts[1]); |
722 | 2 | for (i = 0; i < device_count; i++) |
723 | 1 | { |
724 | 1 | ccv_nnc_tensor_free(cpu_outputs[i]); |
725 | 1 | ccv_nnc_tensor_free(cpu_outputs_16f[i]); |
726 | 1 | } |
727 | 1 | return correct; |
728 | 1 | } |
729 | | |
730 | | TEST_CASE("cifar-10 with dawnnet to > 65% after 10 epochs (fp16) using dynamic graph") |
731 | 1 | { |
732 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) && |
733 | 1 | ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
734 | 1 | FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb"); |
735 | 1 | FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb"); |
736 | 1 | if (!train || !test) |
737 | 0 | { |
738 | 0 | if (train) |
739 | 0 | fclose(train); |
740 | 0 | if (test) |
741 | 0 | fclose(test); |
742 | 0 | GUARD_ELSE_RETURN(0); |
743 | 0 | } |
744 | 1 | int i, j, k; |
745 | 1 | unsigned char bytes[32 * 32 + 1]; |
746 | 1 | double mean[3] = {}; |
747 | 1 | const int train_count = 50000; |
748 | 1 | const int test_count = 10000; |
749 | 1 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0); |
750 | 50.0k | for (k = 0; k < train_count; k++) |
751 | 50.0k | { |
752 | 50.0k | fread(bytes, 32 * 32 + 1, 1, train); |
753 | 50.0k | double per_mean[3] = {}; |
754 | 50.0k | int c = bytes[0]; |
755 | 50.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
756 | 1.65M | for (i = 0; i < 32; i++) |
757 | 52.8M | for (j = 0; j < 32; j++) |
758 | 51.2M | per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.); |
759 | 50.0k | fread(bytes, 32 * 32, 1, train); |
760 | 1.65M | for (i = 0; i < 32; i++) |
761 | 52.8M | for (j = 0; j < 32; j++) |
762 | 51.2M | per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.); |
763 | 50.0k | fread(bytes, 32 * 32, 1, train); |
764 | 1.65M | for (i = 0; i < 32; i++) |
765 | 52.8M | for (j = 0; j < 32; j++) |
766 | 51.2M | per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.); |
767 | 50.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
768 | 50.0k | ccv_array_push(categorizeds, &categorized); |
769 | 50.0k | mean[0] += per_mean[0] / (32 * 32); |
770 | 50.0k | mean[1] += per_mean[1] / (32 * 32); |
771 | 50.0k | mean[2] += per_mean[2] / (32 * 32); |
772 | 50.0k | } |
773 | 1 | float meanf[3]; |
774 | 1 | meanf[0] = mean[0] / train_count; |
775 | 1 | meanf[1] = mean[1] / train_count; |
776 | 1 | meanf[2] = mean[2] / train_count; |
777 | 1 | ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0); |
778 | 10.0k | for (k = 0; k < test_count; k++) |
779 | 10.0k | { |
780 | 10.0k | fread(bytes, 32 * 32 + 1, 1, test); |
781 | 10.0k | int c = bytes[0]; |
782 | 10.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
783 | 330k | for (i = 0; i < 32; i++) |
784 | 10.5M | for (j = 0; j < 32; j++) |
785 | 10.2M | a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0]; |
786 | 10.0k | fread(bytes, 32 * 32, 1, test); |
787 | 330k | for (i = 0; i < 32; i++) |
788 | 10.5M | for (j = 0; j < 32; j++) |
789 | 10.2M | a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1]; |
790 | 10.0k | fread(bytes, 32 * 32, 1, test); |
791 | 330k | for (i = 0; i < 32; i++) |
792 | 10.5M | for (j = 0; j < 32; j++) |
793 | 10.2M | a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2]; |
794 | 10.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
795 | 10.0k | ccv_array_push(tests, &categorized); |
796 | 10.0k | } |
797 | 1 | fclose(train); |
798 | 1 | fclose(test); |
799 | 1 | if (!ccv_is_coverage()) |
800 | 0 | { |
801 | 0 | int correct = train_cifar_10_fp16_dy(10, categorizeds, 256, meanf, tests); |
802 | 0 | REQUIRE(correct > 6500, "accuracy %.2f after 10 epochs should be higher than 65%%", (float)correct / 10000); |
803 | 0 | } else |
804 | 1 | train_cifar_10_fp16_dy(1, categorizeds, 256, meanf, tests); |
805 | 1 | } |
806 | | |
807 | | #include "case_main.h" |