/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/cifar.tests.c
Line | Count | Source |
1 | | #include "case.h" |
2 | | #include "ccv_case.h" |
3 | | #include "ccv_nnc_case.h" |
4 | | #include <ccv.h> |
5 | | #include <ccv_internal.h> |
6 | | #include <nnc/ccv_nnc.h> |
7 | | #include <nnc/ccv_nnc_easy.h> |
8 | | #include <3rdparty/dsfmt/dSFMT.h> |
9 | | |
10 | | TEST_SETUP() |
11 | | { |
12 | | ccv_nnc_init(); |
13 | | } |
14 | | |
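 | | // A dawn layer: a 3x3 conv + batch norm + ReLU block followed by max pooling;
 | | // when residual is set, two more conv-bn-relu blocks are applied and their
 | | // output is summed with the pooled shortcut.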
15 | | static ccv_cnnp_model_t* _dawn_layer_new(const int filters, const int strides, const int residual) |
16 | 9 | { |
17 | 9 | ccv_cnnp_model_io_t input = ccv_cnnp_input(); |
18 | 9 | ccv_cnnp_model_t* conv = ccv_cnnp_sequential_new(MODEL_LIST( |
19 | 9 | ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
20 | 9 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
21 | 9 | ccv_cnnp_relu(0) |
22 | 9 | ), 1, 0); |
23 | 9 | ccv_cnnp_model_io_t output = ccv_cnnp_model_apply(conv, MODEL_IO_LIST(input)); |
24 | 9 | ccv_cnnp_model_t* pool = ccv_cnnp_max_pool(DIM_ALLOC(strides, strides), HINT((strides, strides), (0, 0)), 0); |
25 | 9 | output = ccv_cnnp_model_apply(pool, MODEL_IO_LIST(output)); |
26 | 9 | if (residual) |
27 | 6 | { |
28 | 6 | ccv_cnnp_model_io_t shortcut = output; |
29 | 6 | ccv_cnnp_model_t* res1 = ccv_cnnp_sequential_new(MODEL_LIST( |
30 | 6 | ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
31 | 6 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
32 | 6 | ccv_cnnp_relu(0) |
33 | 6 | ), 1, 0); |
34 | 6 | output = ccv_cnnp_model_apply(res1, MODEL_IO_LIST(output)); |
35 | 6 | ccv_cnnp_model_t* res2 = ccv_cnnp_sequential_new(MODEL_LIST( |
36 | 6 | ccv_cnnp_convolution(1, filters, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
37 | 6 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
38 | 6 | ccv_cnnp_relu(0) |
39 | 6 | ), 1, 0); |
40 | 6 | output = ccv_cnnp_model_apply(res2, MODEL_IO_LIST(output)); |
41 | 6 | ccv_cnnp_model_t* const add = ccv_cnnp_sum(0); |
42 | 6 | output = ccv_cnnp_model_apply(add, MODEL_IO_LIST(output, shortcut)); |
43 | 6 | } |
44 | 9 | return ccv_cnnp_model_new(MODEL_IO_LIST(input), MODEL_IO_LIST(output), 1, 0); |
45 | 9 | } |
46 | | |
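 | | // DAWNBench-style CIFAR-10 network: a conv-bn-relu prep block, three dawn
 | | // layers (128/256/512 filters, residual on the first and third), a final max
 | | // pool, flatten, and a 10-way dense layer, with an optional trailing softmax.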
47 | | static ccv_cnnp_model_t* _cifar_10_dawn(const int softmax) |
48 | 3 | { |
49 | 3 | ccv_cnnp_model_t* prep = ccv_cnnp_sequential_new(MODEL_LIST( |
50 | 3 | ccv_cnnp_convolution(1, 64, DIM_ALLOC(3, 3), DIM_ALLOC(), 0, HINT((1, 1), (1, 1)), 0, 1, 0), |
51 | 3 | ccv_cnnp_batch_norm(0.9, 1e-4, 1, 0), |
52 | 3 | ccv_cnnp_relu(0) |
53 | 3 | ), 1, 0); |
54 | 3 | ccv_cnnp_model_t* layer1 = _dawn_layer_new(128, 2, 1); |
55 | 3 | ccv_cnnp_model_t* layer2 = _dawn_layer_new(256, 2, 0); |
56 | 3 | ccv_cnnp_model_t* layer3 = _dawn_layer_new(512, 2, 1); |
57 | 3 | if (softmax) |
58 | 2 | { |
59 | 2 | return ccv_cnnp_sequential_new(MODEL_LIST( |
60 | 2 | prep, |
61 | 2 | layer1, |
62 | 2 | layer2, |
63 | 2 | layer3, |
64 | 2 | ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0), |
65 | 2 | ccv_cnnp_flatten(0), |
66 | 2 | ccv_cnnp_dense(10, 0, 0, 1, 0), |
67 | 2 | ccv_cnnp_softmax(0) |
68 | 2 | ), 1, 0); |
69 | 2 | } else { |
70 | 1 | return ccv_cnnp_sequential_new(MODEL_LIST( |
71 | 1 | prep, |
72 | 1 | layer1, |
73 | 1 | layer2, |
74 | 1 | layer3, |
75 | 1 | ccv_cnnp_max_pool(DIM_ALLOC(0, 0), ccv_nnc_no_hint, 0), |
76 | 1 | ccv_cnnp_flatten(0), |
77 | 1 | ccv_cnnp_dense(10, 0, 0, 1, 0) |
78 | 1 | ), 1, 0); |
79 | 1 | } |
80 | 3 | } |
81 | | |
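 | | // Trains the model data-parallel across all visible GPUs with SGD plus random
 | | // jitter augmentation, then returns the number of test images classified
 | | // correctly.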
82 | | static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set) |
83 | 1 | { |
84 | 1 | ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1); |
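 | | // Reverse the logical-to-physical GPU mapping to exercise device permutation;
 | | // it is reset to the default ordering before returning.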
85 | 1 | int device_map[4] = {3, 2, 1, 0}; |
86 | 1 | ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 4); |
87 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
88 | 1 | if (device_count < 1) |
89 | 0 | return -1; |
90 | 1 | const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 32F, batch_size, 3, 32, 32); |
91 | 1 | float learn_rate = 0.001; |
92 | 1 | ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD()); |
93 | 1 | ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024); |
94 | 1 | int i, j, k; |
95 | 1 | ccv_nnc_tensor_t* cpu_outputs[device_count]; |
96 | 5 | for (i = 0; i < device_count; i++)
97 | 4 | { |
98 | 4 | cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0); |
99 | 4 | ccv_nnc_tensor_pin_memory(cpu_outputs[i]); |
100 | 4 | } |
101 | 1 | ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set); |
102 | 1 | const ccv_cnnp_random_jitter_t random_jitter = { |
103 | 1 | .resize = { |
104 | 1 | .min = 32, |
105 | 1 | .max = 32, |
106 | 1 | }, |
107 | 1 | .size = { |
108 | 1 | .rows = 32, |
109 | 1 | .cols = 32, |
110 | 1 | }, |
111 | 1 | .symmetric = 1, |
112 | 1 | .normalize = { |
113 | 1 | .mean = { |
114 | 1 | mean[0], mean[1], mean[2], |
115 | 1 | }, |
116 | 1 | }, |
117 | 1 | .offset = { |
118 | 1 | .x = 4, |
119 | 1 | .y = 4, |
120 | 1 | }, |
121 | 1 | .seed = 1, |
122 | 1 | }; |
123 | 1 | const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
124 | 1 | const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0); |
125 | 1 | const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_32F, CCV_TENSOR_FORMAT_NCHW, 0); |
126 | 1 | ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
127 | 1 | ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set); |
128 | 1 | const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
129 | 1 | ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
130 | 1 | int train_device_columns[device_count * 2]; |
131 | 1 | int test_device_columns[device_count * 2]; |
132 | 5 | for (i = 0; i < device_count; i++)
133 | 4 | { |
134 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
135 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
136 | 4 | train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0); |
137 | 4 | ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 32F, batch_size, 10); |
138 | 4 | CCV_TENSOR_SET_DEVICE_ID(params.type, i); |
139 | 4 | train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0); |
140 | 4 | test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0); |
141 | 4 | test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0); |
142 | 4 | } |
143 | 1 | ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2); |
144 | 1 | ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2); |
145 | 1 | ccv_nnc_stream_context_t* stream_contexts[2]; |
146 | 1 | stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
147 | 1 | stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
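 | | // Two stream contexts are swapped (p <-> q) each iteration so prefetching the
 | | // next batch overlaps with training on the current one.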
148 | 1 | int p = 0, q = 1; |
149 | 1 | const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count); |
150 | 1 | int correct = 0; |
151 | 1 | int epoch = 0; |
152 | 1 | ccv_cnnp_model_set_data_parallel(cifar_10, device_count); |
153 | 1 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]); |
154 | 1 | ccv_nnc_tensor_t** input_fits[device_count * 2]; |
155 | 1 | ccv_nnc_tensor_t* input_fit_inputs[device_count]; |
156 | 1 | ccv_nnc_tensor_t* input_fit_fits[device_count]; |
157 | 1 | ccv_nnc_tensor_t* outputs[device_count]; |
158 | 50 | for (i = 0; epoch < epoch_limit; i++)
159 | 49 | { |
160 | | // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/ |
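 | | // For this run: ceil(50000 / (256 * device_count)) iterations per epoch (49
 | | // with 4 GPUs); the rate ramps linearly to 0.4 over the first 10 epochs, then
 | | // decays linearly toward 0 at epoch 35, floored at 1e-6 below.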
161 | 49 | learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
162 | 49 | learn_rate = ccv_max(learn_rate, 0.000001); |
163 | 49 | ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9); |
164 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0); |
165 | 49 | sgd.info.sgd.decay = 0; |
166 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Zero out weight decay for the bias parameters.
167 | 49 | ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]); |
168 | 49 | ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait for the other context to finish since we use the same tensor_arena.
169 | 245 | for (j = 0; j < device_count; j++)
170 | 196 | { |
171 | 196 | input_fit_inputs[j] = input_fits[j][0]; |
172 | 196 | input_fit_fits[j] = input_fits[j][1]; |
173 | 196 | outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j]; |
174 | 196 | } |
175 | 49 | ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]); |
176 | | // Prefetch the next round. |
177 | 49 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]); |
178 | 49 | if ((i + 1) % epoch_end == 0) |
179 | 1 | { |
180 | 1 | ++epoch; |
181 | | // Reshuffle and reset cursor. |
182 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
183 | 1 | ccv_cnnp_dataframe_iter_set_cursor(iter, 0); |
184 | 1 | } |
185 | 49 | int t; |
186 | 49 | CCV_SWAP(p, q, t); |
187 | 49 | } |
188 | 1 | ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0); |
189 | 1 | ccv_nnc_stream_context_wait(stream_contexts[p]); |
190 | 1 | ccv_nnc_stream_context_wait(stream_contexts[q]); |
191 | 1 | correct = 0; |
192 | 1 | p = 0, q = 1; |
193 | 11 | for (j = 0; j < test_set->rnum; j += batch_size * device_count)
194 | 10 | { |
195 | 10 | ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0); |
196 | 50 | for (k = 0; k < device_count; k++)
197 | 40 | { |
198 | 40 | input_fit_inputs[k] = input_fits[k][0]; |
199 | 40 | outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k]; |
200 | 40 | } |
201 | 10 | ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){ |
202 | 10 | .is_test = 1 |
203 | 10 | }, input_fit_inputs, device_count, outputs, device_count, 0, 0); |
204 | 10 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs, device_count, 0); |
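 | | // Take the argmax over the 10 class scores for each test image and count how
 | | // many match the ground-truth category.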
205 | 10.0k | for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
206 | 10.0k | { |
207 | 10.0k | ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k); |
208 | 10.0k | const int d = k / batch_size; |
209 | 10.0k | const int b = k % batch_size; |
210 | 10.0k | float max = -FLT_MAX; |
211 | 10.0k | int t = -1; |
212 | 10.0k | int fi; |
213 | 110k | for (fi = 0; fi < 10; fi++)
214 | 100k | if (cpu_outputs[d]->data.f32[b * 10 + fi] > max) |
215 | 29.4k | max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi; |
216 | 10.0k | if (categorized->c == t) |
217 | 4.55k | ++correct; |
218 | 10.0k | } |
219 | 10 | } |
220 | 1 | ccv_cnnp_dataframe_iter_free(iter); |
221 | 1 | ccv_cnnp_dataframe_free(batch_train_data); |
222 | 1 | ccv_cnnp_dataframe_free(raw_train_data); |
223 | 1 | ccv_cnnp_dataframe_iter_free(test_iter); |
224 | 1 | ccv_cnnp_dataframe_free(batch_test_data); |
225 | 1 | ccv_cnnp_dataframe_free(raw_test_data); |
226 | 1 | ccv_cnnp_model_free(cifar_10); |
227 | 1 | ccv_nnc_stream_context_free(stream_contexts[0]); |
228 | 1 | ccv_nnc_stream_context_free(stream_contexts[1]); |
229 | 5 | for (i = 0; i < device_count; i++)
230 | 4 | ccv_nnc_tensor_free(cpu_outputs[i]); |
231 | 1 | ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0); |
232 | 1 | return correct; |
233 | 1 | } |
234 | | |
235 | | TEST_CASE("cifar-10 with dawnnet to > 90% under 3 minutes") |
236 | 1 | { |
237 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) && |
238 | 1 | ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
239 | 1 | FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb"); |
240 | 1 | FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb"); |
241 | 1 | if (!train || !test) |
242 | 0 | { |
243 | 0 | if (train) |
244 | 0 | fclose(train); |
245 | 0 | if (test) |
246 | 0 | fclose(test); |
247 | 0 | GUARD_ELSE_RETURN(0); |
248 | 0 | } |
249 | 1 | int i, j, k; |
250 | 1 | unsigned char bytes[32 * 32 + 1]; |
251 | 1 | double mean[3] = {}; |
252 | 1 | const int train_count = 50000; |
253 | 1 | const int test_count = 10000; |
254 | 1 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0); |
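 | | // CIFAR-10 binary format: each record is 1 label byte followed by 3 x 1024
 | | // channel planes (R, G, B); pixels are scaled by 2/255 and per-channel means
 | | // are accumulated for normalization later.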
255 | 50.0k | for (k = 0; k < train_count; k++)
256 | 50.0k | { |
257 | 50.0k | fread(bytes, 32 * 32 + 1, 1, train); |
258 | 50.0k | double per_mean[3] = {}; |
259 | 50.0k | int c = bytes[0]; |
260 | 50.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
261 | 1.65M | for (i = 0; i < 32; i++)
262 | 52.8M | for (j = 0; j < 32; j++)
263 | 51.2M | per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.); |
264 | 50.0k | fread(bytes, 32 * 32, 1, train); |
265 | 1.65M | for (i = 0; i < 32; i++)
266 | 52.8M | for (j = 0; j < 32; j++)
267 | 51.2M | per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.); |
268 | 50.0k | fread(bytes, 32 * 32, 1, train); |
269 | 1.65M | for (i = 0; i < 32; i++)
270 | 52.8M | for (j = 0; j < 32; j++)
271 | 51.2M | per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.); |
272 | 50.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
273 | 50.0k | ccv_array_push(categorizeds, &categorized); |
274 | 50.0k | mean[0] += per_mean[0] / (32 * 32); |
275 | 50.0k | mean[1] += per_mean[1] / (32 * 32); |
276 | 50.0k | mean[2] += per_mean[2] / (32 * 32); |
277 | 50.0k | } |
278 | 1 | float meanf[3]; |
279 | 1 | meanf[0] = mean[0] / train_count; |
280 | 1 | meanf[1] = mean[1] / train_count; |
281 | 1 | meanf[2] = mean[2] / train_count; |
282 | 1 | ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0); |
283 | 10.0k | for (k = 0; k < test_count; k++)
284 | 10.0k | { |
285 | 10.0k | fread(bytes, 32 * 32 + 1, 1, test); |
286 | 330k | for (i = 0; i < 32; i++)
287 | 10.5M | for (j = 0; j < 32; j++)
288 | 330k | for (i = 0; i < 32; i++320k ) |
289 | 10.5M | for (j = 0; 320k j < 32; j++10.2M ) |
290 | 10.2M | a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0]; |
291 | 10.0k | fread(bytes, 32 * 32, 1, test); |
292 | 330k | for (i = 0; i < 32; i++)
293 | 10.5M | for (j = 0; j < 32; j++)
294 | 10.2M | a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1]; |
295 | 10.0k | fread(bytes, 32 * 32, 1, test); |
296 | 330k | for (i = 0; i < 32; i++)
297 | 10.5M | for (j = 0; j < 32; j++)
298 | 10.2M | a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2]; |
299 | 10.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
300 | 10.0k | ccv_array_push(tests, &categorized); |
301 | 10.0k | } |
302 | 1 | fclose(train); |
303 | 1 | fclose(test); |
304 | 1 | if (!ccv_is_coverage()) |
305 | 0 | { |
306 | 0 | int correct = train_cifar_10(35, categorizeds, 256, meanf, tests); |
307 | 0 | REQUIRE(correct > 9000, "accuracy %.2f after 35 epochs should be higher than 90%%", (float)correct / 10000);
308 | 0 | } else |
309 | 1 | train_cifar_10(1, categorizeds, 256, meanf, tests); |
310 | 1 | } |
311 | | |
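 | | // Same training loop as train_cifar_10, but images and one-hot labels are
 | | // converted to FP16 in the dataframe pipeline and the model is compiled for
 | | // half-precision inputs; test outputs are converted back to FP32 before argmax.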
312 | | static int train_cifar_10_fp16(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set) |
313 | 1 | { |
314 | 1 | ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1); |
315 | 1 | const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
316 | 1 | if (device_count < 1) |
317 | 0 | return -1; |
318 | 1 | const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32); |
319 | 1 | float learn_rate = 0.001; |
320 | 1 | ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9), CMD_CATEGORICAL_CROSSENTROPY_FORWARD()); |
321 | 1 | ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024); |
322 | 1 | int i, j, k; |
323 | 1 | ccv_nnc_tensor_t* cpu_outputs[device_count]; |
324 | 1 | ccv_nnc_tensor_t* cpu_outputs_16f[device_count]; |
325 | 5 | for (i = 0; i < device_count; i++)
326 | 4 | { |
327 | 4 | cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0); |
328 | 4 | ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]); |
329 | 4 | cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0); |
330 | 4 | } |
331 | 1 | ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set); |
332 | 1 | const ccv_cnnp_random_jitter_t random_jitter = { |
333 | 1 | .resize = { |
334 | 1 | .min = 32, |
335 | 1 | .max = 32, |
336 | 1 | }, |
337 | 1 | .size = { |
338 | 1 | .rows = 32, |
339 | 1 | .cols = 32, |
340 | 1 | }, |
341 | 1 | .symmetric = 1, |
342 | 1 | .normalize = { |
343 | 1 | .mean = { |
344 | 1 | mean[0], mean[1], mean[2], |
345 | 1 | }, |
346 | 1 | }, |
347 | 1 | .offset = { |
348 | 1 | .x = 4, |
349 | 1 | .y = 4, |
350 | 1 | }, |
351 | 1 | .seed = 1, |
352 | 1 | }; |
353 | 1 | const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
354 | 1 | const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0); |
355 | 1 | ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3); |
356 | 1 | const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0); |
357 | 1 | const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
358 | 1 | const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0); |
359 | 1 | const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0); |
360 | 1 | ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
361 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
362 | 1 | ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set); |
363 | 1 | const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
364 | 1 | const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0); |
365 | 1 | const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
366 | 1 | const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0); |
367 | 1 | ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
368 | 1 | int train_device_columns[device_count * 2]; |
369 | 1 | int test_device_columns[device_count * 2]; |
370 | 5 | for (i = 0; i < device_count; i++)
371 | 4 | { |
372 | 4 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
373 | 4 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
374 | 4 | train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0); |
375 | 4 | ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10); |
376 | 4 | CCV_TENSOR_SET_DEVICE_ID(params.type, i); |
377 | 4 | train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0); |
378 | 4 | test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0); |
379 | 4 | test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0); |
380 | 4 | } |
381 | 1 | ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2); |
382 | 1 | ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2); |
383 | 1 | ccv_nnc_stream_context_t* stream_contexts[2]; |
384 | 1 | stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
385 | 1 | stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
386 | 1 | int p = 0, q = 1; |
387 | 1 | const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count); |
388 | 1 | int correct = 0; |
389 | 1 | int epoch = 0; |
390 | 1 | ccv_cnnp_model_set_data_parallel(cifar_10, device_count); |
391 | 1 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]); |
392 | 1 | ccv_nnc_tensor_t** input_fits[device_count * 2]; |
393 | 1 | ccv_nnc_tensor_t* input_fit_inputs[device_count]; |
394 | 1 | ccv_nnc_tensor_t* input_fit_fits[device_count]; |
395 | 1 | ccv_nnc_tensor_t* outputs[device_count]; |
396 | 50 | for (i = 0; epoch < epoch_limit; i++)
397 | 49 | { |
398 | | // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/ |
399 | 49 | learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
400 | 49 | learn_rate = ccv_max(learn_rate, 0.000001); |
401 | 49 | ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9); |
402 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0); |
403 | 49 | sgd.info.sgd.decay = 0; |
404 | 49 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Zero out weight decay for the bias parameters.
405 | 49 | ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]); |
406 | 49 | ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait for the other context to finish since we use the same tensor_arena.
407 | 245 | for (j = 0; j < device_count; j++)
408 | 196 | { |
409 | 196 | input_fit_inputs[j] = input_fits[j][0]; |
410 | 196 | input_fit_fits[j] = input_fits[j][1]; |
411 | 196 | outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j]; |
412 | 196 | } |
413 | 49 | ccv_cnnp_model_fit(cifar_10, input_fit_inputs, device_count, input_fit_fits, device_count, outputs, device_count, 0, stream_contexts[p]); |
414 | | // Prefetch the next round. |
415 | 49 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]); |
416 | 49 | if ((i + 1) % epoch_end == 0) |
417 | 1 | { |
418 | 1 | ++epoch; |
419 | | // Reshuffle and reset cursor. |
420 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
421 | 1 | ccv_cnnp_dataframe_iter_set_cursor(iter, 0); |
422 | 1 | } |
423 | 49 | int t; |
424 | 49 | CCV_SWAP(p, q, t); |
425 | 49 | } |
426 | 1 | ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0); |
427 | 1 | ccv_nnc_stream_context_wait(stream_contexts[p]); |
428 | 1 | ccv_nnc_stream_context_wait(stream_contexts[q]); |
429 | 1 | correct = 0; |
430 | 1 | p = 0, q = 1; |
431 | 11 | for (j = 0; j < test_set->rnum; j += batch_size * device_count)
432 | 10 | { |
433 | 10 | ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0); |
434 | 50 | for (k = 0; k < device_count; k++)
435 | 40 | { |
436 | 40 | input_fit_inputs[k] = input_fits[k][0]; |
437 | 40 | outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k]; |
438 | 40 | } |
439 | 10 | ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){ |
440 | 10 | .is_test = 1 |
441 | 10 | }, input_fit_inputs, device_count, outputs, device_count, 0, 0); |
442 | 10 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0); |
443 | 10 | ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0); |
444 | 10.0k | for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
445 | 10.0k | { |
446 | 10.0k | ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k); |
447 | 10.0k | const int d = k / batch_size; |
448 | 10.0k | const int b = k % batch_size; |
449 | 10.0k | float max = -FLT_MAX; |
450 | 10.0k | int t = -1; |
451 | 10.0k | int fi; |
452 | 110k | for (fi = 0; fi < 10; fi++)
453 | 100k | if (cpu_outputs[d]->data.f32[b * 10 + fi] > max) |
454 | 30.8k | max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi; |
455 | 10.0k | if (categorized->c == t) |
456 | 4.55k | ++correct; |
457 | 10.0k | } |
458 | 10 | } |
459 | 1 | ccv_cnnp_dataframe_iter_free(iter); |
460 | 1 | ccv_cnnp_dataframe_free(batch_train_data); |
461 | 1 | ccv_cnnp_dataframe_free(raw_train_data); |
462 | 1 | ccv_cnnp_dataframe_iter_free(test_iter); |
463 | 1 | ccv_cnnp_dataframe_free(batch_test_data); |
464 | 1 | ccv_cnnp_dataframe_free(raw_test_data); |
465 | 1 | ccv_cnnp_model_free(cifar_10); |
466 | 1 | ccv_nnc_stream_context_free(stream_contexts[0]); |
467 | 1 | ccv_nnc_stream_context_free(stream_contexts[1]); |
468 | 5 | for (i = 0; i < device_count; i++)
469 | 4 | { |
470 | 4 | ccv_nnc_tensor_free(cpu_outputs[i]); |
471 | 4 | ccv_nnc_tensor_free(cpu_outputs_16f[i]); |
472 | 4 | } |
473 | 1 | return correct; |
474 | 1 | } |
475 | | |
476 | | TEST_CASE("cifar-10 with dawnnet to > 90% under 1 minute (fp16)")
477 | 1 | { |
478 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) && |
479 | 1 | ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
480 | 1 | FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb"); |
481 | 1 | FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb"); |
482 | 1 | if (!train || !test) |
483 | 0 | { |
484 | 0 | if (train) |
485 | 0 | fclose(train); |
486 | 0 | if (test) |
487 | 0 | fclose(test); |
488 | 0 | GUARD_ELSE_RETURN(0); |
489 | 0 | } |
490 | 1 | int i, j, k; |
491 | 1 | unsigned char bytes[32 * 32 + 1]; |
492 | 1 | double mean[3] = {}; |
493 | 1 | const int train_count = 50000; |
494 | 1 | const int test_count = 10000; |
495 | 1 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0); |
496 | 50.0k | for (k = 0; k < train_count; k++)
497 | 50.0k | { |
498 | 50.0k | fread(bytes, 32 * 32 + 1, 1, train); |
499 | 50.0k | double per_mean[3] = {}; |
500 | 50.0k | int c = bytes[0]; |
501 | 50.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
502 | 1.65M | for (i = 0; i < 32; i++)
503 | 52.8M | for (j = 0; j < 32; j++)
504 | 51.2M | per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.); |
505 | 50.0k | fread(bytes, 32 * 32, 1, train); |
506 | 1.65M | for (i = 0; i < 32; i++)
507 | 52.8M | for (j = 0; j < 32; j++)
508 | 51.2M | per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.); |
509 | 50.0k | fread(bytes, 32 * 32, 1, train); |
510 | 1.65M | for (i = 0; i < 32; i++)
511 | 52.8M | for (j = 0; j < 32; j++)
512 | 51.2M | per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.); |
513 | 50.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
514 | 50.0k | ccv_array_push(categorizeds, &categorized); |
515 | 50.0k | mean[0] += per_mean[0] / (32 * 32); |
516 | 50.0k | mean[1] += per_mean[1] / (32 * 32); |
517 | 50.0k | mean[2] += per_mean[2] / (32 * 32); |
518 | 50.0k | } |
519 | 1 | float meanf[3]; |
520 | 1 | meanf[0] = mean[0] / train_count; |
521 | 1 | meanf[1] = mean[1] / train_count; |
522 | 1 | meanf[2] = mean[2] / train_count; |
523 | 1 | ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0); |
524 | 10.0k | for (k = 0; k < test_count; k++)
525 | 10.0k | { |
526 | 10.0k | fread(bytes, 32 * 32 + 1, 1, test); |
527 | 10.0k | int c = bytes[0]; |
528 | 10.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
529 | 330k | for (i = 0; i < 32; i++)
530 | 10.5M | for (j = 0; j < 32; j++)
531 | 10.2M | a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0]; |
532 | 10.0k | fread(bytes, 32 * 32, 1, test); |
533 | 330k | for (i = 0; i < 32; i++)
534 | 10.5M | for (j = 0; j < 32; j++)
535 | 10.2M | a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1]; |
536 | 10.0k | fread(bytes, 32 * 32, 1, test); |
537 | 330k | for (i = 0; i < 32; i++)
538 | 10.5M | for (j = 0; j < 32; j++)
539 | 10.2M | a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2]; |
540 | 10.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
541 | 10.0k | ccv_array_push(tests, &categorized); |
542 | 10.0k | } |
543 | 1 | fclose(train); |
544 | 1 | fclose(test); |
545 | 1 | if (!ccv_is_coverage()) |
546 | 0 | { |
547 | 0 | int correct = train_cifar_10_fp16(35, categorizeds, 256, meanf, tests); |
548 | 0 | REQUIRE(correct > 9000, "accuracy %.2f after 35 epochs should be higher than 90%%", (float)correct / 10000);
549 | 0 | } else |
550 | 1 | train_cifar_10_fp16(1, categorizeds, 256, meanf, tests); |
551 | 1 | } |
552 | | |
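 | | // Single-GPU FP16 variant driven through the dynamic graph API: the model is
 | | // compiled with CMD_NOOP() as the loss, and each step explicitly runs forward,
 | | // softmax cross-entropy, backward, and apply-gradients on tensor variables.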
553 | | static int train_cifar_10_fp16_dy(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set) |
554 | 1 | { |
555 | 1 | ccv_cnnp_model_t* const cifar_10_0 = _cifar_10_dawn(0); |
556 | 1 | ccv_cnnp_model_t* const cifar_10 = ccv_cnnp_model_copy(cifar_10_0, 1); |
557 | 1 | ccv_cnnp_model_free(cifar_10_0); |
558 | 1 | const int device_count = 1; |
559 | 1 | const ccv_nnc_tensor_param_t input = GPU_TENSOR_NCHW(000, 16F, batch_size, 3, 32, 32); |
560 | 1 | float learn_rate = 0.001; |
561 | 1 | ccv_cnnp_model_compile(cifar_10, &input, 1, CMD_SGD_FORWARD(0, learn_rate, 1. / batch_size, 0.01, 0.9, 0.9), CMD_NOOP()); |
562 | 1 | ccv_cnnp_model_set_workspace_size(cifar_10, 2llu * 1024 * 1024 * 1024); |
563 | 1 | int i, j, k; |
564 | 1 | ccv_cnnp_dataframe_t* const raw_train_data = ccv_cnnp_dataframe_from_array_new(training_set); |
565 | 1 | const ccv_cnnp_random_jitter_t random_jitter = { |
566 | 1 | .resize = { |
567 | 1 | .min = 32, |
568 | 1 | .max = 32, |
569 | 1 | }, |
570 | 1 | .size = { |
571 | 1 | .rows = 32, |
572 | 1 | .cols = 32, |
573 | 1 | }, |
574 | 1 | .symmetric = 1, |
575 | 1 | .normalize = { |
576 | 1 | .mean = { |
577 | 1 | mean[0], mean[1], mean[2], |
578 | 1 | }, |
579 | 1 | }, |
580 | 1 | .offset = { |
581 | 1 | .x = 4, |
582 | 1 | .y = 4, |
583 | 1 | }, |
584 | 1 | .seed = 1, |
585 | 1 | }; |
586 | 1 | const int images = ccv_cnnp_dataframe_extract_value(raw_train_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
587 | 1 | const int jitter_images = ccv_cnnp_dataframe_image_random_jitter(raw_train_data, images, CCV_32F, random_jitter, 0); |
588 | 1 | ccv_nnc_tensor_param_t images_16f_params = CPU_TENSOR_NHWC(16F, 32, 32, 3); |
589 | 1 | const int jitter_images_in = ccv_cnnp_dataframe_make_tuple(raw_train_data, COLUMN_ID_LIST(jitter_images), 0); |
590 | 1 | const int jitter_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_train_data, jitter_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
591 | 1 | const int jitter_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_train_data, jitter_images_out_16f, 0, 0); |
592 | 1 | const int one_hot = ccv_cnnp_dataframe_one_hot(raw_train_data, 0, offsetof(ccv_categorized_t, c), 10, 1, 0, CCV_16F, CCV_TENSOR_FORMAT_NCHW, 0); |
593 | 1 | ccv_cnnp_dataframe_t* const batch_train_data = ccv_cnnp_dataframe_combine_new(raw_train_data, COLUMN_ID_LIST(jitter_images_16f, one_hot), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
594 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
595 | 1 | ccv_cnnp_dataframe_t* const raw_test_data = ccv_cnnp_dataframe_from_array_new(test_set); |
596 | 1 | const int test_images = ccv_cnnp_dataframe_extract_value(raw_test_data, 0, offsetof(ccv_categorized_t, matrix), 0); |
597 | 1 | const int test_images_in = ccv_cnnp_dataframe_make_tuple(raw_test_data, COLUMN_ID_LIST(test_images), 0); |
598 | 1 | const int test_images_out_16f = ccv_cnnp_dataframe_cmd_exec(raw_test_data, test_images_in, CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, 0, 1, &images_16f_params, 1, 0, 0); |
599 | 1 | const int test_images_16f = ccv_cnnp_dataframe_extract_tuple(raw_test_data, test_images_out_16f, 0, 0); |
600 | 1 | ccv_cnnp_dataframe_t* const batch_test_data = ccv_cnnp_dataframe_combine_new(raw_test_data, COLUMN_ID_LIST(test_images_16f), batch_size, device_count, CCV_TENSOR_FORMAT_NCHW); |
601 | 1 | int train_device_columns[device_count * 2]; |
602 | 1 | int test_device_columns[device_count * 2]; |
603 | 2 | for (i = 0; i < device_count; i++)
604 | 1 | { |
605 | 1 | int stream_type = CCV_STREAM_CONTEXT_GPU; |
606 | 1 | CCV_STREAM_SET_DEVICE_ID(stream_type, i); |
607 | 1 | train_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_train_data, 0, i * 2, 2, i, 0); |
608 | 1 | ccv_nnc_tensor_param_t params = GPU_TENSOR_NCHW(000, 16F, batch_size, 10); |
609 | 1 | CCV_TENSOR_SET_DEVICE_ID(params.type, i); |
610 | 1 | train_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_train_data, params, 0); |
611 | 1 | test_device_columns[i] = ccv_cnnp_dataframe_copy_to_gpu(batch_test_data, 0, i, 1, i, 0); |
612 | 1 | test_device_columns[device_count + i] = ccv_cnnp_dataframe_add_aux(batch_test_data, params, 0); |
613 | 1 | } |
614 | 1 | ccv_cnnp_dataframe_iter_t* const test_iter = ccv_cnnp_dataframe_iter_new(batch_test_data, test_device_columns, device_count * 2); |
615 | 1 | ccv_cnnp_dataframe_iter_t* const iter = ccv_cnnp_dataframe_iter_new(batch_train_data, train_device_columns, device_count * 2); |
616 | 1 | ccv_nnc_stream_context_t* stream_contexts[2]; |
617 | 1 | stream_contexts[0] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
618 | 1 | stream_contexts[1] = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU); |
619 | 1 | int p = 0, q = 1; |
620 | 1 | const int epoch_end = (training_set->rnum + batch_size * device_count - 1) / (batch_size * device_count); |
621 | 1 | int correct = 0; |
622 | 1 | int epoch = 0; |
623 | 1 | ccv_cnnp_model_set_data_parallel(cifar_10, device_count); |
624 | 1 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[p]); |
625 | 1 | ccv_nnc_tensor_t** input_fits[device_count * 2]; |
626 | 1 | ccv_nnc_tensor_t* input_fit_inputs[device_count]; |
627 | 1 | ccv_nnc_tensor_t* input_fit_fits[device_count]; |
628 | 1 | ccv_nnc_tensor_t* outputs[device_count]; |
629 | 1 | ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new(); |
630 | 197 | for (i = 0; epoch < epoch_limit; i++)
631 | 196 | { |
632 | | // Piece-wise linear learning rate: https://www.myrtle.ai/2018/09/24/how_to_train_your_resnet_3/ |
633 | 196 | learn_rate = (i + 1) < 10 * epoch_end ? 0.4 * (i + 1) / (10 * epoch_end) : 0.4 * (35 * epoch_end - (i + 1)) / ((35 - 10) * epoch_end);
634 | 196 | learn_rate = ccv_max(learn_rate, 0.000001); |
635 | 196 | ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, learn_rate, 1. / (batch_size * device_count), 0.01, 0.9, 0.9); |
636 | 196 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 1, 0, 0); |
637 | 196 | sgd.info.sgd.decay = 0; |
638 | 196 | ccv_cnnp_model_set_minimizer(cifar_10, sgd, 0, MODEL_IO_LIST(ccv_cnnp_model_parameters(cifar_10, CCV_CNNP_PARAMETER_SELECT_BIAS, ALL_PARAMETERS))); // Zero out weight decay for the bias parameters.
639 | 196 | ccv_cnnp_dataframe_iter_next(iter, (void**)input_fits, device_count * 2, stream_contexts[p]); |
640 | 196 | ccv_nnc_stream_context_wait(stream_contexts[q]); // Need to wait for the other context to finish since we use the same tensor_arena.
641 | 392 | for (j = 0; j < device_count; j++)
642 | 196 | { |
643 | 196 | input_fit_inputs[j] = input_fits[j][0]; |
644 | 196 | input_fit_fits[j] = input_fits[j][1]; |
645 | 196 | outputs[j] = (ccv_nnc_tensor_t*)input_fits[device_count + j]; |
646 | 196 | } |
647 | 196 | ccv_nnc_stream_context_wait(stream_contexts[p]); // Need to wait for the other context to finish since we use the same tensor_arena.
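 | | // Wrap the current batch in tensor variables, run the model forward, compute
 | | // softmax cross-entropy against the one-hot fit, backpropagate, and apply SGD.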
648 | 196 | ccv_nnc_tensor_variable_t const input = ccv_nnc_tensor_variable_new(graph); |
649 | 196 | ccv_nnc_tensor_variable_set(graph, input, input_fit_inputs[0]); |
650 | 196 | ccv_nnc_tensor_variable_t const output = ccv_nnc_tensor_variable_new(graph); |
651 | 196 | ccv_nnc_tensor_variable_set(graph, output, outputs[0]); |
652 | 196 | ccv_nnc_dynamic_graph_evaluate(graph, cifar_10, 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(output), 0, stream_contexts[q]); |
653 | 196 | ccv_nnc_tensor_variable_t const fit = ccv_nnc_tensor_variable_new(graph); |
654 | 196 | ccv_nnc_tensor_variable_set(graph, fit, input_fit_fits[0]); |
655 | 196 | ccv_nnc_tensor_variable_t const softmax = ccv_nnc_tensor_variable_new(graph); |
656 | 196 | ccv_nnc_dynamic_graph_exec(graph, CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_VARIABLE_LIST(output, fit), TENSOR_VARIABLE_LIST(0, softmax), 0, stream_contexts[q]); |
657 | 196 | ccv_nnc_dynamic_graph_backward(graph, TENSOR_VARIABLE_LIST(softmax), 0, TENSOR_VARIABLE_LIST(input), TENSOR_VARIABLE_LIST(0), stream_contexts[q]); |
658 | 196 | ccv_nnc_dynamic_graph_apply_gradients(graph, sgd, TENSOR_VARIABLE_LIST(), TENSOR_VARIABLE_LIST(), 0, 0, stream_contexts[q]); |
659 | 196 | ccv_nnc_tensor_variable_free(graph, input); |
660 | 196 | ccv_nnc_tensor_variable_free(graph, output); |
661 | 196 | ccv_nnc_tensor_variable_free(graph, fit); |
662 | 196 | ccv_nnc_tensor_variable_free(graph, softmax); |
663 | | // Prefetch the next round. |
664 | 196 | ccv_cnnp_dataframe_iter_prefetch(iter, 1, stream_contexts[q]); |
665 | 196 | if ((i + 1) % epoch_end == 0) |
666 | 1 | { |
667 | 1 | ++epoch; |
668 | | // Reshuffle and reset cursor. |
669 | 1 | ccv_cnnp_dataframe_shuffle(raw_train_data); |
670 | 1 | ccv_cnnp_dataframe_iter_set_cursor(iter, 0); |
671 | 1 | } |
672 | 196 | int t; |
673 | 196 | CCV_SWAP(p, q, t); |
674 | 196 | } |
675 | 1 | ccv_cnnp_dataframe_iter_set_cursor(test_iter, 0); |
676 | 1 | ccv_nnc_stream_context_wait(stream_contexts[p]); |
677 | 1 | ccv_nnc_stream_context_wait(stream_contexts[q]); |
678 | 1 | correct = 0; |
679 | 1 | ccv_nnc_tensor_t* cpu_outputs[device_count]; |
680 | 1 | ccv_nnc_tensor_t* cpu_outputs_16f[device_count]; |
681 | 2 | for (i = 0; i < device_count; i++)
682 | 1 | { |
683 | 1 | cpu_outputs_16f[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, batch_size, 10), 0); |
684 | 1 | ccv_nnc_tensor_pin_memory(cpu_outputs_16f[i]); |
685 | 1 | cpu_outputs[i] = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, batch_size, 10), 0); |
686 | 1 | } |
687 | 41 | for (j = 0; j < test_set->rnum; j += batch_size * device_count)
688 | 40 | { |
689 | 40 | ccv_cnnp_dataframe_iter_next(test_iter, (void**)input_fits, device_count * 2, 0); |
690 | 80 | for (k = 0; k < device_count; k++)
691 | 40 | { |
692 | 40 | input_fit_inputs[k] = input_fits[k][0]; |
693 | 40 | outputs[k] = (ccv_nnc_tensor_t*)input_fits[device_count + k]; |
694 | 40 | } |
695 | 40 | ccv_cnnp_model_evaluate(cifar_10, (ccv_cnnp_evaluate_param_t){ |
696 | 40 | .is_test = 1 |
697 | 40 | }, input_fit_inputs, device_count, outputs, device_count, 0, 0); |
698 | 40 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, outputs, device_count, cpu_outputs_16f, device_count, 0); |
699 | 40 | ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, cpu_outputs_16f, device_count, cpu_outputs, device_count, 0); |
700 | 10.0k | for (k = 0; k < ccv_min(test_set->rnum - j, batch_size * device_count); k++)
701 | 10.0k | { |
702 | 10.0k | ccv_categorized_t* const categorized = (ccv_categorized_t*)ccv_array_get(test_set, j + k); |
703 | 10.0k | const int d = k / batch_size; |
704 | 10.0k | const int b = k % batch_size; |
705 | 10.0k | float max = -FLT_MAX; |
706 | 10.0k | int t = -1; |
707 | 10.0k | int fi; |
708 | 110k | for (fi = 0; fi < 10; fi++)
709 | 100k | if (cpu_outputs[d]->data.f32[b * 10 + fi] > max) |
710 | 31.0k | max = cpu_outputs[d]->data.f32[b * 10 + fi], t = fi; |
711 | 10.0k | if (categorized->c == t) |
712 | 5.65k | ++correct; |
713 | 10.0k | } |
714 | 40 | } |
715 | 1 | ccv_cnnp_dataframe_iter_free(iter); |
716 | 1 | ccv_cnnp_dataframe_free(batch_train_data); |
717 | 1 | ccv_cnnp_dataframe_free(raw_train_data); |
718 | 1 | ccv_cnnp_dataframe_iter_free(test_iter); |
719 | 1 | ccv_cnnp_dataframe_free(batch_test_data); |
720 | 1 | ccv_cnnp_dataframe_free(raw_test_data); |
721 | 1 | ccv_cnnp_model_free(cifar_10); |
722 | 1 | ccv_nnc_dynamic_graph_free(graph); |
723 | 1 | ccv_nnc_stream_context_free(stream_contexts[0]); |
724 | 1 | ccv_nnc_stream_context_free(stream_contexts[1]); |
725 | 2 | for (i = 0; i < device_count; i++)
726 | 1 | { |
727 | 1 | ccv_nnc_tensor_free(cpu_outputs[i]); |
728 | 1 | ccv_nnc_tensor_free(cpu_outputs_16f[i]); |
729 | 1 | } |
730 | 1 | return correct; |
731 | 1 | } |
732 | | |
733 | | TEST_CASE("cifar-10 with dawnnet to > 65% after 10 epochs (fp16) using dynamic graph")
734 | 1 | { |
735 | 1 | GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN) && |
736 | 1 | ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN)); |
737 | 1 | FILE* train = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/data_batch.bin", "rb"); |
738 | 1 | FILE* test = fopen("/fast/Data/cifar-10/cifar-10-batches-bin/test_batch.bin", "rb"); |
739 | 1 | if (!train || !test) |
740 | 0 | { |
741 | 0 | if (train) |
742 | 0 | fclose(train); |
743 | 0 | if (test) |
744 | 0 | fclose(test); |
745 | 0 | GUARD_ELSE_RETURN(0); |
746 | 0 | } |
747 | 1 | int i, j, k; |
748 | 1 | unsigned char bytes[32 * 32 + 1]; |
749 | 1 | double mean[3] = {}; |
750 | 1 | const int train_count = 50000; |
751 | 1 | const int test_count = 10000; |
752 | 1 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), train_count, 0); |
753 | 50.0k | for (k = 0; k < train_count; k++)
754 | 50.0k | { |
755 | 50.0k | fread(bytes, 32 * 32 + 1, 1, train); |
756 | 50.0k | double per_mean[3] = {}; |
757 | 50.0k | int c = bytes[0]; |
758 | 50.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
759 | 1.65M | for (i = 0; i < 32; i++)
760 | 52.8M | for (j = 0; j < 32; j++)
761 | 51.2M | per_mean[0] += (a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255.); |
762 | 50.0k | fread(bytes, 32 * 32, 1, train); |
763 | 1.65M | for (i = 0; i < 32; i++)
764 | 52.8M | for (j = 0; j < 32; j++)
765 | 51.2M | per_mean[1] += (a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255.); |
766 | 50.0k | fread(bytes, 32 * 32, 1, train); |
767 | 1.65M | for (i = 0; i < 32; i++)
768 | 52.8M | for (j = 0; j < 32; j++)
769 | 51.2M | per_mean[2] += (a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255.); |
770 | 50.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
771 | 50.0k | ccv_array_push(categorizeds, &categorized); |
772 | 50.0k | mean[0] += per_mean[0] / (32 * 32); |
773 | 50.0k | mean[1] += per_mean[1] / (32 * 32); |
774 | 50.0k | mean[2] += per_mean[2] / (32 * 32); |
775 | 50.0k | } |
776 | 1 | float meanf[3]; |
777 | 1 | meanf[0] = mean[0] / train_count; |
778 | 1 | meanf[1] = mean[1] / train_count; |
779 | 1 | meanf[2] = mean[2] / train_count; |
780 | 1 | ccv_array_t* tests = ccv_array_new(sizeof(ccv_categorized_t), test_count, 0); |
781 | 10.0k | for (k = 0; k < test_count; k++)
782 | 10.0k | { |
783 | 10.0k | fread(bytes, 32 * 32 + 1, 1, test); |
784 | 10.0k | int c = bytes[0]; |
785 | 10.0k | ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); |
786 | 330k | for (i = 0; i < 32; i++)
787 | 10.5M | for (j = 0; j < 32; j++)
788 | 10.2M | a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1] * 2. / 255. - meanf[0]; |
789 | 10.0k | fread(bytes, 32 * 32, 1, test); |
790 | 330k | for (i = 0; i < 32; i++)
791 | 10.5M | for (j = 0; j < 32; j++)
792 | 10.2M | a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32] * 2. / 255. - meanf[1]; |
793 | 10.0k | fread(bytes, 32 * 32, 1, test); |
794 | 330k | for (i = 0; i < 32; i++)
795 | 10.5M | for (j = 0; j < 32; j++)
796 | 10.2M | a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32] * 2. / 255. - meanf[2]; |
797 | 10.0k | ccv_categorized_t categorized = ccv_categorized(c, a, 0); |
798 | 10.0k | ccv_array_push(tests, &categorized); |
799 | 10.0k | } |
800 | 1 | fclose(train); |
801 | 1 | fclose(test); |
802 | 1 | if (!ccv_is_coverage()) |
803 | 0 | { |
804 | 0 | int correct = train_cifar_10_fp16_dy(10, categorizeds, 256, meanf, tests); |
805 | 0 | REQUIRE(correct > 6500, "accuracy %.2f after 10 epochs should be higher than 65%%", (float)correct / 10000);
806 | 0 | } else |
807 | 1 | train_cifar_10_fp16_dy(1, categorizeds, 256, meanf, tests); |
808 | 1 | } |
809 | | |
810 | | #include "case_main.h" |