/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/loss/ccv_nnc_categorical_crossentropy_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_categorical_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 11 | { |
15 | 11 | assert(input_size == 2); |
16 | 11 | const ccv_nnc_tensor_t* a = inputs[0]; |
17 | 11 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
18 | 11 | const ccv_nnc_tensor_t* b = inputs[1]; |
19 | 11 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
20 | 11 | assert(output_size == 1); |
21 | 11 | ccv_nnc_tensor_t* c = outputs[0]; |
22 | 11 | assert(CCV_IS_TENSOR_CONTIGUOUS(c)); |
23 | 11 | const int axis_count = ccv_nnc_tensor_nd(a->info.dim); |
24 | 11 | const int batch_size = axis_count < 2 ? 1 : a->info.dim[0]; |
25 | 11 | const int count = ccv_nnc_tensor_count(a->info) / batch_size; |
26 | 11 | int i; |
27 | 11 | if (b->info.datatype == CCV_32F) |
28 | 9 | { |
29 | | // If b has more than one axis, the range is the channel count. Otherwise, if the batch size is 1, the range is |
30 | | // the channel count; otherwise, the range is 1 (and the only axis is the batch size). |
31 | 9 | const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1); |
32 | 9 | if (range == 1) |
33 | 8 | { |
34 | 16 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && b->info.dim[i] > 0; i++) |
35 | 8 | { assert(b->info.dim[i] == c->info.dim[i]); } |
36 | 8 | const float trim0 = cmd.info.label_smoothing.trim0; |
37 | 8 | const float trim1 = cmd.info.label_smoothing.trim1; |
38 | 8 | if (trim0 == 0 && trim1 == 1) |
39 | 4 | { |
40 | 40 | parallel_for(i, batch_size) { |
41 | 40 | const int label = (int)(b->data.f32[i] + 0.5); |
42 | 40 | assert(label >= 0 && label < count); |
43 | 40 | const float p = a->data.f32[i * count + label]; |
44 | 40 | c->data.f32[i] = -logf(p); |
45 | 40 | } parallel_endfor |
46 | 4 | } else { |
47 | 40 | parallel_for(i, batch_size) { |
48 | 40 | const int label = (int)(b->data.f32[i] + 0.5); |
49 | 40 | assert(label >= 0 && label < count); |
50 | 40 | int j; |
51 | 40 | float p = 0; |
52 | 40 | float* const ap = a->data.f32 + i * count; |
53 | 2.02k | for (j = 0; j < label; j++) |
54 | 1.98k | p += -trim0 * logf(ap[j]); |
55 | 40 | p += -trim1 * logf(ap[label]); |
56 | 2.02k | for (j = label + 1; j < count; j++) |
57 | 1.98k | p += -trim0 * logf(ap[j]); |
58 | 40 | c->data.f32[i] = p; |
59 | 40 | } parallel_endfor |
60 | 4 | } |
61 | 8 | } else { |
62 | 1 | assert(range == count); |
63 | 2 | parallel_for(i, batch_size) { |
64 | 2 | int j; |
65 | 2 | float p = 0; |
66 | 2 | float* const bp = b->data.f32 + i * count; |
67 | 2 | float* const ap = a->data.f32 + i * count; |
68 | 8 | for (j = 0; j < count; j++) |
69 | 6 | p += -bp[j] * logf(ap[j]); |
70 | 2 | c->data.f32[i] = p; |
71 | 2 | } parallel_endfor |
72 | 1 | } |
73 | 9 | } else if (b->info.datatype == CCV_32S) { |
74 | 5 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && b->info.dim[i] > 0; i++) |
75 | 3 | { assert(b->info.dim[i] == c->info.dim[i]); } |
76 | 2 | const float trim0 = cmd.info.label_smoothing.trim0; |
77 | 2 | const float trim1 = cmd.info.label_smoothing.trim1; |
78 | 2 | if (trim0 == 0 && trim1 == 1) |
79 | 1 | { |
80 | 2 | parallel_for(i, batch_size) { |
81 | 2 | const int label = b->data.i32[i]; |
82 | 2 | assert(label >= 0 && label < count); |
83 | 2 | const float p = a->data.f32[i * count + label]; |
84 | 2 | c->data.f32[i] = -logf(p); |
85 | 2 | } parallel_endfor |
86 | 1 | } else { |
87 | 2 | parallel_for(i, batch_size) { |
88 | 2 | const int label = b->data.i32[i]; |
89 | 2 | assert(label >= 0 && label < count); |
90 | 2 | int j; |
91 | 2 | float p = 0; |
92 | 2 | float* const ap = a->data.f32 + i * count; |
93 | 5 | for (j = 0; j < label; j++) |
94 | 3 | p += -trim0 * logf(ap[j]); |
95 | 2 | p += -trim1 * logf(ap[label]); |
96 | 3 | for (j = label + 1; j < count; j++) |
97 | 1 | p += -trim0 * logf(ap[j]); |
98 | 2 | c->data.f32[i] = p; |
99 | 2 | } parallel_endfor |
100 | 1 | } |
101 | 2 | } |
102 | 11 | return CCV_NNC_EXEC_SUCCESS; |
103 | 11 | } |
104 | | |
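The forward kernel above reduces, per sample, to the label-smoothed cross-entropy loss -trim0 * sum_{j != label} logf(a[j]) - trim1 * logf(a[label]), which collapses to -logf(a[label]) when trim0 == 0 and trim1 == 1 (the fast path). The following is a minimal standalone sketch of that per-sample computation; the function and variable names are illustrative, not part of the library:

#include <math.h>
#include <stdio.h>

/* Per-sample categorical cross-entropy with label smoothing: trim0 weights
 * the count - 1 off-target classes, trim1 weights the target class. With
 * trim0 == 0, trim1 == 1 this is plain -logf(p[label]). */
static float cross_entropy_smoothed(const float* const p, const int count, const int label, const float trim0, const float trim1)
{
	float loss = 0;
	int j;
	for (j = 0; j < label; j++)
		loss += -trim0 * logf(p[j]);
	loss += -trim1 * logf(p[label]);
	for (j = label + 1; j < count; j++)
		loss += -trim0 * logf(p[j]);
	return loss;
}

int main(void)
{
	const float p[4] = { 0.1, 0.7, 0.1, 0.1 }; /* assumed softmax output for one sample */
	printf("%f\n", cross_entropy_smoothed(p, 4, 1, 0, 1)); /* -logf(0.7), about 0.3567 */
	printf("%f\n", cross_entropy_smoothed(p, 4, 1, 0.025, 0.925)); /* smoothed variant */
	return 0;
}
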
105 | | static int _ccv_nnc_categorical_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
106 | 8 | { |
107 | 8 | assert(input_size >= 3); |
108 | 8 | assert(output_size >= 1); |
109 | 8 | const ccv_nnc_tensor_t* g = inputs[0]; |
110 | 8 | assert(!g || !CCV_IS_TENSOR_VIEW(g)); |
111 | 8 | const ccv_nnc_tensor_t* a = inputs[1]; |
112 | 8 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
113 | 8 | const ccv_nnc_tensor_t* b = inputs[2]; |
114 | 8 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
115 | 8 | ccv_nnc_tensor_t* h = outputs[0]; |
116 | 8 | assert(CCV_IS_TENSOR_CONTIGUOUS(h)); |
117 | 8 | const int axis_count = ccv_nnc_tensor_nd(a->info.dim); |
118 | 8 | const int batch_size = axis_count < 2 ? 1 : a->info.dim[0]; |
119 | 8 | const int count = ccv_nnc_tensor_count(a->info) / batch_size; |
120 | 8 | int i; |
121 | 8 | if (g) |
122 | 8 | { |
123 | 8 | if (b->info.datatype == CCV_32F) |
124 | 5 | { |
125 | | // If b has more than one axis, the range is the channel count. Otherwise, if the batch size is 1, the range is |
126 | | // the channel count; otherwise, the range is 1 (and the only axis is the batch size). |
127 | 5 | const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1); |
128 | 5 | if (range == 1) |
129 | 4 | { |
130 | 12 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
131 | 8 | { assert(a->info.dim[i] == h->info.dim[i]); } |
132 | 4 | const float trim0 = cmd.info.label_smoothing.trim0; |
133 | 4 | const float trim1 = cmd.info.label_smoothing.trim1; |
134 | 4 | if (trim0 == 0 && trim1 == 1) |
135 | 2 | { |
136 | 20 | parallel_for(i, batch_size) { |
137 | 20 | int j; |
138 | 20 | const float gp = g->data.f32[i]; |
139 | 20 | const int label = (int)(b->data.f32[i] + 0.5); |
140 | 20 | float* const hp = h->data.f32 + i * count; |
141 | 2.02k | for (j = 0; j < count; j++) |
142 | 2.00k | hp[j] = 0; |
143 | 20 | const float p = a->data.f32[i * count + label]; |
144 | 20 | hp[label] = -gp / p; |
145 | 20 | } parallel_endfor |
146 | 2 | } else { |
147 | 20 | parallel_for(i, batch_size) { |
148 | 20 | int j; |
149 | 20 | const float gp = g->data.f32[i]; |
150 | 20 | const int label = (int)(b->data.f32[i] + 0.5); |
151 | 20 | float* const hp = h->data.f32 + i * count; |
152 | 20 | float* const ap = a->data.f32 + i * count; |
153 | 1.01k | for (j = 0; j < label; j++) |
154 | 990 | hp[j] = -gp * trim0 / ap[j]; |
155 | 20 | hp[label] = -gp * trim1 / ap[label]; |
156 | 1.01k | for (j = label + 1; j < count; j++) |
157 | 990 | hp[j] = -gp * trim0 / ap[j]; |
158 | 20 | } parallel_endfor |
159 | 2 | } |
160 | 4 | } else { |
161 | 1 | assert(range == count); |
162 | 2 | parallel_for(i, batch_size) { |
163 | 2 | int j; |
164 | 2 | const float gp = g->data.f32[i]; |
165 | 2 | float* const hp = h->data.f32 + i * count; |
166 | 2 | float* const ap = a->data.f32 + i * count; |
167 | 2 | float* const bp = b->data.f32 + i * count; |
168 | 8 | for (j = 0; j < count; j++) |
169 | 6 | hp[j] = -gp * bp[j] / ap[j]; |
170 | 2 | } parallel_endfor |
171 | 1 | } |
172 | 5 | } else if (b->info.datatype == CCV_32S) { |
173 | 9 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
174 | 6 | { assert(a->info.dim[i] == h->info.dim[i]); } |
175 | 3 | const float trim0 = cmd.info.label_smoothing.trim0; |
176 | 3 | const float trim1 = cmd.info.label_smoothing.trim1; |
177 | 3 | if (trim0 == 0 && trim1 == 1) |
178 | 2 | { |
179 | 4 | parallel_for2 (i, batch_size) { |
180 | 4 | int j; |
181 | 4 | const float gp = g->data.f32[i]; |
182 | 4 | const int label = b->data.i32[i]; |
183 | 4 | float* const hp = h->data.f32 + i * count; |
184 | 16 | for (j = 0; j < count; j++) |
185 | 12 | hp[j] = 0; |
186 | 4 | const float p = a->data.f32[i * count + label]; |
187 | 4 | hp[label] = -gp / p; |
188 | 4 | } parallel_endfor |
189 | 2 | } else { |
190 | 2 | parallel_for(i, batch_size) { |
191 | 2 | int j; |
192 | 2 | const float gp = g->data.f32[i]; |
193 | 2 | const int label = b->data.i32[i]; |
194 | 2 | float* const hp = h->data.f32 + i * count; |
195 | 2 | float* const ap = a->data.f32 + i * count; |
196 | 5 | for (j = 0; j < label; j++) |
197 | 3 | hp[j] = -gp * trim0 / ap[j]; |
198 | 2 | hp[label] = -gp * trim1 / ap[label]; |
199 | 3 | for (j = label + 1; j < count; j++) |
200 | 1 | hp[j] = -gp * trim0 / ap[j]; |
201 | 2 | } parallel_endfor |
202 | 1 | } |
203 | 3 | } |
204 | 8 | } else { |
205 | 0 | if (b->info.datatype == CCV_32F) |
206 | 0 | { |
207 | | // If b has more than one axis, the range is the channel count. Otherwise, if the batch size is 1, the range is |
208 | | // the channel count; otherwise, the range is 1 (and the only axis is the batch size). |
209 | 0 | const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1); |
210 | 0 | if (range == 1) |
211 | 0 | { |
212 | 0 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
213 | 0 | { assert(a->info.dim[i] == h->info.dim[i]); } |
214 | 0 | const float trim0 = cmd.info.label_smoothing.trim0; |
215 | 0 | const float trim1 = cmd.info.label_smoothing.trim1; |
216 | 0 | if (trim0 == 0 && trim1 == 1) |
217 | 0 | { |
218 | 0 | parallel_for(i, batch_size) { |
219 | 0 | int j; |
220 | 0 | const int label = (int)(b->data.f32[i] + 0.5); |
221 | 0 | float* const hp = h->data.f32 + i * count; |
222 | 0 | for (j = 0; j < count; j++) |
223 | 0 | hp[j] = 0; |
224 | 0 | const float p = a->data.f32[i * count + label]; |
225 | 0 | hp[label] = -1. / p; |
226 | 0 | } parallel_endfor |
227 | 0 | } else { |
228 | 0 | parallel_for(i, batch_size) { |
229 | 0 | int j; |
230 | 0 | const int label = (int)(b->data.f32[i] + 0.5); |
231 | 0 | float* const hp = h->data.f32 + i * count; |
232 | 0 | float* const ap = a->data.f32 + i * count; |
233 | 0 | for (j = 0; j < label; j++) |
234 | 0 | hp[j] = -trim0 / ap[j]; |
235 | 0 | hp[label] = -trim1 / ap[label]; |
236 | 0 | for (j = label + 1; j < count; j++) |
237 | 0 | hp[j] = -trim0 / ap[j]; |
238 | 0 | } parallel_endfor |
239 | 0 | } |
240 | 0 | } else { |
241 | 0 | assert(range == count); |
242 | 0 | parallel_for(i, batch_size) { |
243 | 0 | int j; |
244 | 0 | float* const hp = h->data.f32 + i * count; |
245 | 0 | float* const ap = a->data.f32 + i * count; |
246 | 0 | float* const bp = b->data.f32 + i * count; |
247 | 0 | for (j = 0; j < count; j++) |
248 | 0 | hp[j] = -bp[j] / ap[j]; |
249 | 0 | } parallel_endfor |
250 | 0 | } |
251 | 0 | } else if (b->info.datatype == CCV_32S) { |
252 | 0 | const float trim0 = cmd.info.label_smoothing.trim0; |
253 | 0 | const float trim1 = cmd.info.label_smoothing.trim1; |
254 | 0 | if (trim0 == 0 && trim1 == 1) |
255 | 0 | { |
256 | 0 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
257 | 0 | { assert(a->info.dim[i] == h->info.dim[i]); } |
258 | 0 | parallel_for(i, batch_size) { |
259 | 0 | int j; |
260 | 0 | const int label = b->data.i32[i]; |
261 | 0 | float* const hp = h->data.f32 + i * count; |
262 | 0 | for (j = 0; j < count; j++) |
263 | 0 | hp[j] = 0; |
264 | 0 | const float p = a->data.f32[i * count + label]; |
265 | 0 | hp[label] = -1. / p; |
266 | 0 | } parallel_endfor |
267 | 0 | } else { |
268 | 0 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++) |
269 | 0 | { assert(a->info.dim[i] == h->info.dim[i]); } |
270 | 0 | parallel_for(i, batch_size) { |
271 | 0 | int j; |
272 | 0 | const int label = b->data.i32[i]; |
273 | 0 | float* const hp = h->data.f32 + i * count; |
274 | 0 | float* const ap = a->data.f32 + i * count; |
275 | 0 | for (j = 0; j < label; j++) |
276 | 0 | hp[j] = -trim0 / ap[j]; |
277 | 0 | hp[label] = -trim1 / ap[label]; |
278 | 0 | for (j = label + 1; j < count; j++) |
279 | 0 | hp[j] = -trim0 / ap[j]; |
280 | 0 | } parallel_endfor |
281 | 0 | } |
282 | 0 | } |
283 | 0 | } |
284 | 8 | return CCV_NNC_EXEC_SUCCESS; |
285 | 8 | } |
286 | | |
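Differentiating the per-sample loss gives dL/da[j] = -w[j] / a[j], where w[j] is trim1 at the label and trim0 elsewhere, scaled by the incoming gradient g (the branch above that runs without g is the same computation with g fixed to 1). A standalone sketch of one gradient row, again with illustrative names only:

#include <stdio.h>

/* Gradient of -sum_j w[j] * logf(a[j]) with respect to a, scaled by g.
 * With trim0 == 0 every off-target entry is zero, matching the fast path
 * in the kernel above that zeroes the row and fills only the label. */
static void cross_entropy_grad(const float* const a, float* const h, const int count, const int label, const float trim0, const float trim1, const float g)
{
	int j;
	for (j = 0; j < count; j++)
		h[j] = -g * trim0 / a[j];
	h[label] = -g * trim1 / a[label];
}

int main(void)
{
	const float a[4] = { 0.1, 0.7, 0.1, 0.1 };
	float h[4];
	cross_entropy_grad(a, h, 4, 1, 0, 1, 1);
	printf("%f %f %f %f\n", h[0], h[1], h[2], h[3]); /* only h[1] = -1 / 0.7 is non-zero */
	return 0;
}
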
287 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
288 | 1 | { |
289 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
290 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
291 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
292 | 1 | registry->algorithms = 1; |
293 | 1 | registry->exec = _ccv_nnc_categorical_crossentropy_forw; |
294 | 1 | } |
295 | | |
296 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
297 | 1 | { |
298 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
299 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
300 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
301 | 1 | registry->algorithms = 1; |
302 | 1 | registry->exec = _ccv_nnc_categorical_crossentropy_back; |
303 | 1 | } |
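
For reference, a sketch of how this backend is typically reached through the command API, in the style of ccv's nnc tests; the tensor and command macros (CPU_TENSOR_NHWC, CMD_CATEGORICAL_CROSSENTROPY_FORWARD, TENSOR_LIST) are assumptions drawn from the library's headers and should be verified there:

#include <stdio.h>
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
	/* 2 samples x 4 classes of probabilities (assumed already softmaxed). */
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0); /* integer labels */
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0); /* per-sample losses */
	int i;
	for (i = 0; i < 8; i++)
		a->data.f32[i] = 0.25; /* uniform rows keep the expected loss obvious */
	b->data.i32[0] = 1;
	b->data.i32[1] = 3;
	ccv_nnc_cmd_exec(CMD_CATEGORICAL_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	printf("%f %f\n", c->data.f32[0], c->data.f32[1]); /* both -logf(0.25), about 1.3863 */
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}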