/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/softmax_loss/ccv_nnc_softmax_crossentropy_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_softmax_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 316 | { |
15 | 316 | assert(input_size == 2); |
16 | 316 | const ccv_nnc_tensor_t* a = inputs[0]; |
17 | 316 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
18 | 316 | const ccv_nnc_tensor_t* b = inputs[1]; |
19 | 316 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
20 | 316 | assert(output_size == 2); |
21 | 316 | ccv_nnc_tensor_t* c = outputs[0]; |
22 | 316 | assert(!c || !CCV_IS_TENSOR_VIEW(c)); |
23 | 316 | ccv_nnc_tensor_t* d = outputs[1]; |
24 | 316 | assert(CCV_IS_TENSOR_CONTIGUOUS(d)); |
25 | 316 | const int axis_count = ccv_nnc_tensor_nd(a->info.dim); |
26 | 316 | const int batch_size = axis_count < 2 ? 1 : a->info.dim[0]; |
27 | 316 | const int count = ccv_nnc_tensor_count(a->info) / batch_size; |
28 | 316 | int i; |
29 | 316 | if (c) |
30 | 315 | { |
31 | 315 | assert(ccv_nnc_tensor_count(c->info) == batch_size); |
32 | 315 | if (b->info.datatype == CCV_32F) |
33 | 312 | { |
34 | | // If it has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is |
35 | | // the channel count. Otherwise, the range is 1 (and the only axis is the batch size). |
36 | 312 | const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1); |
37 | 312 | if (range == 1) |
38 | 309 | { |
39 | 927 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
40 | 618 | { assert(a->info.dim[i] == d->info.dim[i]); } |
41 | 309 | const float trim0 = cmd.info.label_smoothing.trim0; |
42 | 309 | const float trim1 = cmd.info.label_smoothing.trim1; |
43 | 309 | if (trim0 == 0 && trim1 == 1) |
44 | 305 | { |
45 | 341 | parallel_for(i, batch_size) { |
46 | 341 | int j; |
47 | 341 | float* const ap = a->data.f32 + i * count; |
48 | 341 | float* const dp = d->data.f32 + i * count; |
49 | 341 | double maxval = ap[0]; |
50 | 7.01k | for (j = 1; j < count; j++) |
51 | 6.66k | if (ap[j] > maxval) |
52 | 649 | maxval = ap[j]; |
53 | 341 | const int label = (int)(b->data.f32[i] + 0.5); |
54 | 341 | assert(label >= 0 && label < count); |
55 | 341 | c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy. |
56 | 341 | double sumval = 0; |
57 | 7.35k | for (j = 0; j < count; j++) |
58 | 7.01k | sumval += (dp[j] = expf(ap[j] - maxval)); |
59 | 341 | sumval = 1.0 / sumval; |
60 | 7.35k | for (j = 0; j < count; j++) |
61 | 7.01k | dp[j] *= sumval; |
62 | 341 | } parallel_endfor |
63 | 305 | } else { |
64 | 40 | parallel_for(i, batch_size) { |
65 | 40 | int j; |
66 | 40 | float* const ap = a->data.f32 + i * count; |
67 | 40 | float* const dp = d->data.f32 + i * count; |
68 | 40 | double maxval = ap[0]; |
69 | 4.00k | for (j = 1; j < count; j++) |
70 | 3.96k | if (ap[j] > maxval) |
71 | 140 | maxval = ap[j]; |
72 | 40 | const int label = (int)(b->data.f32[i] + 0.5); |
73 | 40 | assert(label >= 0 && label < count); |
74 | 40 | float p = 0; |
75 | 2.02k | for (j = 0; j < label; j++) |
76 | 1.98k | p += trim0 * (maxval - ap[j]); |
77 | 40 | p += trim1 * (maxval - ap[label]); |
78 | 2.02k | for (j = label + 1; j < count; j++) |
79 | 1.98k | p += trim0 * (maxval - ap[j]); |
80 | 40 | c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy. |
81 | 40 | double sumval = 0; |
82 | 4.04k | for (j = 0; j < count; j++) |
83 | 4.00k | sumval += (dp[j] = expf(ap[j] - maxval)); |
84 | 40 | sumval = 1.0 / sumval; |
85 | 4.04k | for (j = 0; j < count; j++) |
86 | 4.00k | dp[j] *= sumval; |
87 | 40 | } parallel_endfor |
88 | 4 | } |
89 | 309 | } else { |
90 | 3 | assert(range == count); |
91 | 4 | parallel_for(i, batch_size) { |
92 | 4 | int j; |
93 | 4 | float* const ap = a->data.f32 + i * count; |
94 | 4 | float* const bp = b->data.f32 + i * count; |
95 | 4 | float* const dp = d->data.f32 + i * count; |
96 | 4 | double maxval = ap[0]; |
97 | 26 | for (j = 1; j < count; j++) |
98 | 22 | if (ap[j] > maxval) |
99 | 3 | maxval = ap[j]; |
100 | 4 | float p = 0; |
101 | 30 | for (j = 0; j < count; j++) |
102 | 26 | p += bp[j] * (maxval - ap[j]); |
103 | 4 | c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy. |
104 | 4 | double sumval = 0; |
105 | 30 | for (j = 0; j < count; j++) |
106 | 26 | sumval += (dp[j] = expf(ap[j] - maxval)); |
107 | 4 | sumval = 1.0 / sumval; |
108 | 30 | for (j = 0; j < count; j++) |
109 | 26 | dp[j] *= sumval; |
110 | 4 | } parallel_endfor |
111 | 3 | } |
112 | 312 | } else if (b->info.datatype == CCV_32S) { |
113 | 9 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
114 | 6 | { assert(a->info.dim[i] == d->info.dim[i]); } |
115 | 3 | const float trim0 = cmd.info.label_smoothing.trim0; |
116 | 3 | const float trim1 = cmd.info.label_smoothing.trim1; |
117 | 3 | if (trim0 == 0 && trim1 == 1) |
118 | 2 | { |
119 | 4 | parallel_for(i, batch_size) { |
120 | 4 | int j; |
121 | 4 | float* const ap = a->data.f32 + i * count; |
122 | 4 | float* const dp = d->data.f32 + i * count; |
123 | 4 | double maxval = ap[0]; |
124 | 12 | for (j = 1; j < count; j++) |
125 | 8 | if (ap[j] > maxval) |
126 | 2 | maxval = ap[j]; |
127 | 4 | const int label = b->data.i32[i]; |
128 | 4 | assert(label >= 0 && label < count); |
129 | 4 | c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy. |
130 | 4 | double sumval = 0; |
131 | 16 | for (j = 0; j < count; j++) |
132 | 12 | sumval += (dp[j] = expf(ap[j] - maxval)); |
133 | 4 | sumval = 1.0 / sumval; |
134 | 16 | for (j = 0; j < count; j++) |
135 | 12 | dp[j] *= sumval; |
136 | 4 | } parallel_endfor |
137 | 2 | } else { |
138 | 2 | parallel_for(i, batch_size) { |
139 | 2 | int j; |
140 | 2 | float* const ap = a->data.f32 + i * count; |
141 | 2 | float* const dp = d->data.f32 + i * count; |
142 | 2 | double maxval = ap[0]; |
143 | 6 | for (j = 1; j < count; j++) |
144 | 4 | if (ap[j] > maxval) |
145 | 2 | maxval = ap[j]; |
146 | 2 | const int label = b->data.i32[i]; |
147 | 2 | assert(label >= 0 && label < count); |
148 | 2 | float p = 0; |
149 | 5 | for (j = 0; j < label; j++) |
150 | 3 | p += trim0 * (maxval - ap[j]); |
151 | 2 | p += trim1 * (maxval - ap[label]); |
152 | 3 | for (j = label + 1; j < count; j++) |
153 | 1 | p += trim0 * (maxval - ap[j]); |
154 | 2 | c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy. |
155 | 2 | double sumval = 0; |
156 | 8 | for (j = 0; j < count; j++) |
157 | 6 | sumval += (dp[j] = expf(ap[j] - maxval)); |
158 | 2 | sumval = 1.0 / sumval; |
159 | 8 | for (j = 0; j < count; j++) |
160 | 6 | dp[j] *= sumval; |
161 | 2 | } parallel_endfor |
162 | 1 | } |
163 | 3 | } |
164 | 315 | } else { |
165 | | // No loss calculation, just vanilla softmax. |
166 | 2 | parallel_for(i, batch_size) { |
167 | 2 | int j; |
168 | 2 | float* const ap = a->data.f32 + i * count; |
169 | 2 | float* const dp = d->data.f32 + i * count; |
170 | 2 | double maxval = ap[0]; |
171 | 6 | for (j = 1; j < count; j++) |
172 | 4 | if (ap[j] > maxval) |
173 | 1 | maxval = ap[j]; |
174 | 2 | double sumval = 0; |
175 | 8 | for (j = 0; j < count; j++) |
176 | 6 | sumval += (dp[j] = expf(ap[j] - maxval)); |
177 | 2 | sumval = 1.0 / sumval; |
178 | 8 | for (j = 0; j < count; j++) |
179 | 6 | dp[j] *= sumval; |
180 | 2 | } parallel_endfor |
181 | 1 | } |
182 | 316 | return CCV_NNC_EXEC_SUCCESS; |
183 | 316 | } |
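
A note on the forward pass: stripped of batching, dispatch, and the label-smoothing branches, each row above reduces to the standalone sketch below (the helper name is hypothetical, for illustration only). It fills d with softmax(a) using the usual max-subtraction trick so expf cannot overflow, and returns the maxval - a[label] term the kernel stores into c; with label smoothing, the stored loss generalizes to trim0 * sum over j != label of (maxval - a[j]) plus trim1 * (maxval - a[label]).

	#include <math.h>

	/* Minimal single-row restatement of the forward kernel above; the name
	   softmax_xent_row is hypothetical, not part of the library. */
	static float softmax_xent_row(const float* a, float* d, const int count, const int label)
	{
		int j;
		double maxval = a[0];
		for (j = 1; j < count; j++) /* find the row max so expf never overflows */
			if (a[j] > maxval)
				maxval = a[j];
		double sumval = 0;
		for (j = 0; j < count; j++) /* d[j] = exp(a[j] - max); accumulate the normalizer */
			sumval += (d[j] = expf(a[j] - maxval));
		sumval = 1.0 / sumval;
		for (j = 0; j < count; j++) /* normalize: d now holds softmax(a) */
			d[j] *= sumval;
		return (float)(maxval - a[label]); /* the loss term the kernel stores into c */
	}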
184 | | |
185 | | static int _ccv_nnc_softmax_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
186 | 308 | { |
187 | 308 | assert(input_size >= 6); |
188 | 308 | assert(output_size >= 1); |
189 | 308 | const ccv_nnc_tensor_t* g = inputs[0]; |
190 | 308 | assert(!g || !CCV_IS_TENSOR_VIEW(g)); |
191 | 308 | const ccv_nnc_tensor_t* b = inputs[3]; |
192 | 308 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
193 | 308 | const ccv_nnc_tensor_t* d = inputs[5]; |
194 | 308 | assert(CCV_IS_TENSOR_CONTIGUOUS(d)); |
195 | 308 | ccv_nnc_tensor_t* h = outputs[0]; |
196 | 308 | assert(CCV_IS_TENSOR_CONTIGUOUS(h)); |
197 | 308 | const int axis_count = ccv_nnc_tensor_nd(d->info.dim); |
198 | 308 | const int batch_size = axis_count < 2 ? 1 : d->info.dim[0]; |
199 | 308 | const int count = ccv_nnc_tensor_count(d->info) / batch_size; |
200 | 308 | int i; |
201 | 308 | if (g) |
202 | 107 | { |
203 | 107 | if (b->info.datatype == CCV_32F) |
204 | 105 | { |
205 | | // If it has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is |
206 | | // the channel count. Otherwise, the range is 1 (and the only axis is the batch size). |
207 | 105 | const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1); |
208 | 105 | if (range == 1) |
209 | 104 | { |
210 | 312 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++) |
211 | 208 | { assert(d->info.dim[i] == h->info.dim[i]); } |
212 | 104 | const float trim0 = cmd.info.label_smoothing.trim0; |
213 | 104 | const float trim1 = cmd.info.label_smoothing.trim1; |
214 | 104 | if (trim0 == 0 && trim1 == 1) |
215 | 102 | { |
216 | 120 | parallel_for(i, batch_size) { |
217 | 120 | int j; |
218 | 120 | const float gp = g->data.f32[i]; |
219 | 120 | const int label = (int)(b->data.f32[i] + 0.5); |
220 | 120 | float* const dp = d->data.f32 + i * count; |
221 | 120 | float* const hp = h->data.f32 + i * count; |
222 | 3.12k | for (j = 0; j < count; j++) |
223 | 3.00k | hp[j] = gp * dp[j]; |
224 | 120 | hp[label] -= gp; |
225 | 120 | } parallel_endfor |
226 | 102 | } else { |
227 | 20 | parallel_for(i, batch_size) { |
228 | 20 | int j; |
229 | 20 | const float gp = g->data.f32[i]; |
230 | 20 | const int label = (int)(b->data.f32[i] + 0.5); |
231 | 20 | float* const dp = d->data.f32 + i * count; |
232 | 20 | float* const hp = h->data.f32 + i * count; |
233 | 1.01k | for (j = 0; j < label; j++) |
234 | 990 | hp[j] = gp * (dp[j] - trim0); |
235 | 20 | hp[label] = gp * (dp[label] - trim1); |
236 | 1.01k | for (j = label + 1; j < count; j++) |
237 | 990 | hp[j] = gp * (dp[j] - trim0); |
238 | 20 | } parallel_endfor |
239 | 2 | } |
240 | 104 | } else { |
241 | 1 | assert(range == count); |
242 | 2 | parallel_for(i, batch_size) { |
243 | 2 | int j; |
244 | 2 | const float gp = g->data.f32[i]; |
245 | 2 | float* const dp = d->data.f32 + i * count; |
246 | 2 | float* const hp = h->data.f32 + i * count; |
247 | 2 | float* const bp = b->data.f32 + i * count; |
248 | 8 | for (j = 0; j < count; j++) |
249 | 6 | hp[j] = gp * (dp[j] - bp[j]); |
250 | 2 | } parallel_endfor |
251 | 1 | } |
252 | 105 | } else if (b->info.datatype == CCV_32S) { |
253 | 6 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++) |
254 | 4 | { assert(d->info.dim[i] == h->info.dim[i]); } |
255 | 2 | const float trim0 = cmd.info.label_smoothing.trim0; |
256 | 2 | const float trim1 = cmd.info.label_smoothing.trim1; |
257 | 2 | if (trim0 == 0 && trim1 == 1) |
258 | 1 | { |
259 | 2 | parallel_for(i, batch_size) { |
260 | 2 | int j; |
261 | 2 | const float gp = g->data.f32[i]; |
262 | 2 | const int label = b->data.i32[i]; |
263 | 2 | float* const dp = d->data.f32 + i * count; |
264 | 2 | float* const hp = h->data.f32 + i * count; |
265 | 8 | for (j = 0; j < count; j++) |
266 | 6 | hp[j] = gp * dp[j]; |
267 | 2 | hp[label] -= gp; |
268 | 2 | } parallel_endfor |
269 | 1 | } else { |
270 | 2 | parallel_for(i, batch_size) { |
271 | 2 | int j; |
272 | 2 | const float gp = g->data.f32[i]; |
273 | 2 | const int label = b->data.i32[i]; |
274 | 2 | float* const dp = d->data.f32 + i * count; |
275 | 2 | float* const hp = h->data.f32 + i * count; |
276 | 5 | for (j = 0; j < label; j++) |
277 | 3 | hp[j] = gp * (dp[j] - trim0); |
278 | 2 | hp[label] = gp * (dp[label] - trim1); |
279 | 3 | for (j = label + 1; j < count; j++) |
280 | 1 | hp[j] = gp * (dp[j] - trim0); |
281 | 2 | } parallel_endfor |
282 | 1 | } |
283 | 2 | } |
284 | 201 | } else { |
285 | 201 | if (h->data.f32 != d->data.f32) // If not inplace replacement. |
286 | 201 | memcpy(h->data.f32, d->data.f32, sizeof(float) * count * batch_size); |
287 | 201 | if (b->info.datatype == CCV_32F) |
288 | 200 | { |
289 | | // If it has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is |
290 | | // the channel count. Otherwise, the range is 1 (and the only axis is the batch size). |
291 | 200 | const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1); |
292 | 200 | if (range == 1) |
293 | 200 | { |
294 | 200 | const float trim0 = cmd.info.label_smoothing.trim0; |
295 | 200 | const float trim1 = cmd.info.label_smoothing.trim1; |
296 | 200 | if (trim0 == 0 && trim1 == 1) |
297 | 200 | { |
298 | 600 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++) |
299 | 400 | { assert(d->info.dim[i] == h->info.dim[i]); } |
300 | 200 | parallel_for(i, batch_size) { |
301 | 200 | const int label = (int)(b->data.f32[i] + 0.5); |
302 | 200 | float* const hp = h->data.f32 + i * count; |
303 | 200 | hp[label] -= 1.; |
304 | 200 | } parallel_endfor |
305 | 200 | } else { |
306 | 0 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++) |
307 | 0 | { assert(d->info.dim[i] == h->info.dim[i]); } |
308 | 0 | parallel_for(i, batch_size) { |
309 | 0 | int j; |
310 | 0 | const int label = (int)(b->data.f32[i] + 0.5); |
311 | 0 | float* const hp = h->data.f32 + i * count; |
312 | 0 | for (j = 0; j < label; j++) |
313 | 0 | hp[j] -= trim0; |
314 | 0 | hp[label] -= trim1; |
315 | 0 | for (j = label + 1; j < count; j++) |
316 | 0 | hp[j] -= trim0; |
317 | 0 | } parallel_endfor |
318 | 0 | } |
319 | 200 | } else { |
320 | 0 | assert(range == count); |
321 | 0 | parallel_for(i, batch_size) { |
322 | 0 | int j; |
323 | 0 | float* const hp = h->data.f32 + i * count; |
324 | 0 | float* const bp = b->data.f32 + i * count; |
325 | 0 | for (j = 0; j < count; j++) |
326 | 0 | hp[j] -= bp[j]; |
327 | 0 | } parallel_endfor |
328 | 0 | } |
329 | 200 | } else if (b->info.datatype == CCV_32S) { |
330 | 3 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++) |
331 | 2 | { assert(d->info.dim[i] == h->info.dim[i]); } |
332 | 1 | const float trim0 = cmd.info.label_smoothing.trim0; |
333 | 1 | const float trim1 = cmd.info.label_smoothing.trim1; |
334 | 1 | if (trim0 == 0 && trim1 == 1) |
335 | 1 | { |
336 | 2 | parallel_for(i, batch_size) { |
337 | 2 | const int label = b->data.i32[i]; |
338 | 2 | float* const hp = h->data.f32 + i * count; |
339 | 2 | hp[label] -= 1.; |
340 | 2 | } parallel_endfor |
341 | 1 | } else { |
342 | 0 | parallel_for(i, batch_size) { |
343 | 0 | int j; |
344 | 0 | const int label = b->data.i32[i]; |
345 | 0 | float* const hp = h->data.f32 + i * count; |
346 | 0 | for (j = 0; j < label; j++) |
347 | 0 | hp[j] -= trim0; |
348 | 0 | hp[label] -= trim1; |
349 | 0 | for (j = label + 1; j < count; j++) |
350 | 0 | hp[j] -= trim0; |
351 | 0 | } parallel_endfor |
352 | 0 | } |
353 | 1 | } |
354 | 201 | } |
355 | 308 | return CCV_NNC_EXEC_SUCCESS; |
356 | 308 | } |
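
A note on the backward pass: it relies on the standard identity that for softmax followed by cross entropy against an index label, the gradient with respect to the logits is softmax(a) - onehot(label), scaled by the incoming gradient g; the trim0/trim1 branches subtract the smoothed targets instead, and the range == count branches subtract the dense target distribution b. A minimal single-row sketch (hypothetical helper name):

	/* h = g * (d - onehot(label)), where d is the softmax output saved by the
	   forward pass; softmax_xent_grad_row is hypothetical, for illustration only. */
	static void softmax_xent_grad_row(const float* d, float* h, const int count, const int label, const float g)
	{
		int j;
		for (j = 0; j < count; j++)
			h[j] = g * d[j]; /* g * softmax(a) */
		h[label] -= g; /* subtract g at the labeled class */
	}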
357 | | |
358 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
359 | 1 | { |
360 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
361 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
362 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
363 | 1 | registry->algorithms = 1; |
364 | 1 | registry->exec = _ccv_nnc_softmax_crossentropy_forw; |
365 | 1 | } |
366 | | |
367 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
368 | 1 | { |
369 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
370 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
371 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
372 | 1 | registry->algorithms = 1; |
373 | 1 | registry->exec = _ccv_nnc_softmax_crossentropy_back; |
374 | 1 | } |
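
For reference, driving this backend from user code looks roughly like the sketch below, modeled on the invocation pattern in ccv's nnc test suite; treat the tensor-parameter macros and shapes here as assumptions rather than verified API.

	/* Assumed macro spellings: logits a (8x10 float), float labels b (8);
	   outputs are the per-row loss c and the softmax d. */
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 10), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
	ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 10), 0);
	ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);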