Coverage Report

Created: 2025-02-24 17:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/softmax_loss/ccv_nnc_softmax_crossentropy_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
static int _ccv_nnc_softmax_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
14
316
{
15
316
  assert(input_size == 2);
16
316
  const ccv_nnc_tensor_t* a = inputs[0];
17
316
  assert(CCV_IS_TENSOR_CONTIGUOUS(a));
18
316
  const ccv_nnc_tensor_t* b = inputs[1];
19
316
  assert(CCV_IS_TENSOR_CONTIGUOUS(b));
20
316
  assert(output_size == 2);
21
316
  ccv_nnc_tensor_t* c = outputs[0];
22
316
  assert(!c || !CCV_IS_TENSOR_VIEW(c));
23
316
  ccv_nnc_tensor_t* d = outputs[1];
24
316
  assert(CCV_IS_TENSOR_CONTIGUOUS(d));
25
316
  const int axis_count = ccv_nnc_tensor_nd(a->info.dim);
26
316
  const int batch_size = axis_count < 2 ? 1 : a->info.dim[0];  (branch region "1": count 0)
27
316
  const int count = ccv_nnc_tensor_count(a->info) / batch_size;
28
316
  int i;
29
316
  if (c)
30
315
  {
31
315
    assert(ccv_nnc_tensor_count(c->info) == batch_size);
32
315
    if (b->info.datatype == CCV_32F)
33
312
    {
34
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
35
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
36
312
      const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);  (region counts: ccv_nnc_tensor_get_c(b->info): 3, else-group: 309, batch_size == 1: 309, b->info.dim[0]: 301, "1": 8)
37
312
      if (range == 1)
38
309
      {
39
927
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; 
i++618
)
40
618
          { assert(a->info.dim[i] == d->info.dim[i]); }
41
309
        const float trim0 = cmd.info.label_smoothing.trim0;
42
309
        const float trim1 = cmd.info.label_smoothing.trim1;
43
309
        if (trim0 == 0 && trim1 == 1)  (region "trim1 == 1": count 305)
44
305
        {
45
341
          
parallel_for305
(i, batch_size) {
46
341
            int j;
47
341
            float* const ap = a->data.f32 + i * count;
48
341
            float* const dp = d->data.f32 + i * count;
49
341
            double maxval = ap[0];
50
7.01k
            for (j = 1; j < count; 
j++6.66k
)
51
6.66k
              if (ap[j] > maxval)
52
649
                maxval = ap[j];
53
341
            const int label = (int)(b->data.f32[i] + 0.5);
54
341
            assert(label >= 0 && label < count);
55
341
            c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
56
341
            double sumval = 0;
57
7.35k
            for (j = 0; j < count; 
j++7.01k
)
58
7.01k
              sumval += (dp[j] = expf(ap[j] - maxval));
59
341
            sumval = 1.0 / sumval;
60
7.35k
            for (j = 0; j < count; 
j++7.01k
)
61
7.01k
              dp[j] *= sumval;
62
341
          } parallel_endfor
63
305
        } else {
64
40
          
parallel_for4
(i, batch_size) {
65
40
            int j;
66
40
            float* const ap = a->data.f32 + i * count;
67
40
            float* const dp = d->data.f32 + i * count;
68
40
            double maxval = ap[0];
69
4.00k
            for (j = 1; j < count; 
j++3.96k
)
70
3.96k
              if (ap[j] > maxval)
71
140
                maxval = ap[j];
72
40
            const int label = (int)(b->data.f32[i] + 0.5);
73
40
            assert(label >= 0 && label < count);
74
40
            float p = 0;
75
2.02k
            for (j = 0; j < label; 
j++1.98k
)
76
1.98k
              p += trim0 * (maxval - ap[j]);
77
40
            p += trim1 * (maxval - ap[label]);
78
2.02k
            for (j = label + 1; j < count; 
j++1.98k
)
79
1.98k
              p += trim0 * (maxval - ap[j]);
80
40
            c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
81
40
            double sumval = 0;
82
4.04k
            for (j = 0; j < count; 
j++4.00k
)
83
4.00k
              sumval += (dp[j] = expf(ap[j] - maxval));
84
40
            sumval = 1.0 / sumval;
85
4.04k
            for (j = 0; j < count; 
j++4.00k
)
86
4.00k
              dp[j] *= sumval;
87
40
          } parallel_endfor
88
4
        }
89
309
      } else {
90
3
        assert(range == count);
91
4
        
parallel_for3
(i, batch_size) {
92
4
          int j;
93
4
          float* const ap = a->data.f32 + i * count;
94
4
          float* const bp = b->data.f32 + i * count;
95
4
          float* const dp = d->data.f32 + i * count;
96
4
          double maxval = ap[0];
97
26
          for (j = 1; j < count; 
j++22
)
98
22
            if (ap[j] > maxval)
99
3
              maxval = ap[j];
100
4
          float p = 0;
101
30
          for (j = 0; j < count; 
j++26
)
102
26
            p += bp[j] * (maxval - ap[j]);
103
4
          c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
104
4
          double sumval = 0;
105
30
          for (j = 0; j < count; 
j++26
)
106
26
            sumval += (dp[j] = expf(ap[j] - maxval));
107
4
          sumval = 1.0 / sumval;
108
30
          for (j = 0; j < count; 
j++26
)
109
26
            dp[j] *= sumval;
110
4
        } parallel_endfor
111
3
      }
112
312
    } else if (b->info.datatype == CCV_32S) {  (region "b->info.datatype == CCV_32S": count 3)
113
9
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; 
i++6
)
114
6
        { assert(a->info.dim[i] == d->info.dim[i]); }
115
3
      const float trim0 = cmd.info.label_smoothing.trim0;
116
3
      const float trim1 = cmd.info.label_smoothing.trim1;
117
3
      if (trim0 == 0 && trim1 == 1)  (region "trim1 == 1": count 2)
118
2
      {
119
4
        
parallel_for2
(i, batch_size) {
120
4
          int j;
121
4
          float* const ap = a->data.f32 + i * count;
122
4
          float* const dp = d->data.f32 + i * count;
123
4
          double maxval = ap[0];
124
12
          for (j = 1; j < count; 
j++8
)
125
8
            if (ap[j] > maxval)
126
2
              maxval = ap[j];
127
4
          const int label = b->data.i32[i];
128
4
          assert(label >= 0 && label < count);
129
4
          c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
130
4
          double sumval = 0;
131
16
          for (j = 0; j < count; 
j++12
)
132
12
            sumval += (dp[j] = expf(ap[j] - maxval));
133
4
          sumval = 1.0 / sumval;
134
16
          for (j = 0; j < count; 
j++12
)
135
12
            dp[j] *= sumval;
136
4
        } parallel_endfor
137
2
      } else {
138
2
        
parallel_for1
(i, batch_size) {
139
2
          int j;
140
2
          float* const ap = a->data.f32 + i * count;
141
2
          float* const dp = d->data.f32 + i * count;
142
2
          double maxval = ap[0];
143
6
          for (j = 1; j < count; 
j++4
)
144
4
            if (ap[j] > maxval)
145
2
              maxval = ap[j];
146
2
          const int label = b->data.i32[i];
147
2
          assert(label >= 0 && label < count);
148
2
          float p = 0;
149
5
          for (j = 0; j < label; 
j++3
)
150
3
            p += trim0 * (maxval - ap[j]);
151
2
          p += trim1 * (maxval - ap[label]);
152
3
          for (j = label + 1; j < count; 
j++1
)
153
1
            p += trim0 * (maxval - ap[j]);
154
2
          c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
155
2
          double sumval = 0;
156
8
          for (j = 0; j < count; 
j++6
)
157
6
            sumval += (dp[j] = expf(ap[j] - maxval));
158
2
          sumval = 1.0 / sumval;
159
8
          for (j = 0; j < count; 
j++6
)
160
6
            dp[j] *= sumval;
161
2
        } parallel_endfor
162
1
      }
163
3
    }
164
315
  } else {
165
    // No loss calculation, just vanilla softmax.
166
2
    
parallel_for1
(i, batch_size) {
167
2
      int j;
168
2
      float* const ap = a->data.f32 + i * count;
169
2
      float* const dp = d->data.f32 + i * count;
170
2
      double maxval = ap[0];
171
6
      for (j = 1; j < count; 
j++4
)
172
4
        if (ap[j] > maxval)
173
1
          maxval = ap[j];
174
2
      double sumval = 0;
175
8
      for (j = 0; j < count; 
j++6
)
176
6
        sumval += (dp[j] = expf(ap[j] - maxval));
177
2
      sumval = 1.0 / sumval;
178
8
      for (j = 0; j < count; 
j++6
)
179
6
        dp[j] *= sumval;
180
2
    } parallel_endfor
181
1
  }
182
316
  return CCV_NNC_EXEC_SUCCESS;
183
316
}
184
185
static int _ccv_nnc_softmax_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
186
308
{
187
308
  assert(input_size >= 6);
188
308
  assert(output_size >= 1);
189
308
  const ccv_nnc_tensor_t* g = inputs[0];
190
308
  assert(!g || !CCV_IS_TENSOR_VIEW(g));
191
308
  const ccv_nnc_tensor_t* b = inputs[3];
192
308
  assert(CCV_IS_TENSOR_CONTIGUOUS(b));
193
308
  const ccv_nnc_tensor_t* d = inputs[5];
194
308
  assert(CCV_IS_TENSOR_CONTIGUOUS(d));
195
308
  ccv_nnc_tensor_t* h = outputs[0];
196
308
  assert(CCV_IS_TENSOR_CONTIGUOUS(h));
197
308
  const int axis_count = ccv_nnc_tensor_nd(d->info.dim);
198
308
  const int batch_size = axis_count < 2 ? 
10
: d->info.dim[0];
199
308
  const int count = ccv_nnc_tensor_count(d->info) / batch_size;
200
308
  int i;
201
308
  if (g)
202
107
  {
203
107
    if (b->info.datatype == CCV_32F)
204
105
    {
205
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
206
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
207
105
      const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);  (region counts: ccv_nnc_tensor_get_c(b->info): 1, else-group: 104, batch_size == 1: 104, b->info.dim[0]: 100, "1": 4)
208
105
      if (range == 1)
209
104
      {
210
312
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++208
)
211
208
          { assert(d->info.dim[i] == h->info.dim[i]); }
212
104
        const float trim0 = cmd.info.label_smoothing.trim0;
213
104
        const float trim1 = cmd.info.label_smoothing.trim1;
214
104
        if (trim0 == 0 && trim1 == 1)  (region "trim1 == 1": count 102)
215
102
        {
216
120
          
parallel_for102
(i, batch_size) {
217
120
            int j;
218
120
            const float gp = g->data.f32[i];
219
120
            const int label = (int)(b->data.f32[i] + 0.5);
220
120
            float* const dp = d->data.f32 + i * count;
221
120
            float* const hp = h->data.f32 + i * count;
222
3.12k
            for (j = 0; j < count; 
j++3.00k
)
223
3.00k
              hp[j] = gp * dp[j];
224
120
            hp[label] -= gp;
225
120
          } parallel_endfor
226
102
        } else {
227
20
          
parallel_for2
(i, batch_size) {
228
20
            int j;
229
20
            const float gp = g->data.f32[i];
230
20
            const int label = (int)(b->data.f32[i] + 0.5);
231
20
            float* const dp = d->data.f32 + i * count;
232
20
            float* const hp = h->data.f32 + i * count;
233
1.01k
            for (j = 0; j < label; 
j++990
)
234
990
              hp[j] = gp * (dp[j] - trim0);
235
20
            hp[label] = gp * (dp[label] - trim1);
236
1.01k
            for (j = label + 1; j < count; 
j++990
)
237
990
              hp[j] = gp * (dp[j] - trim0);
238
20
          } parallel_endfor
239
2
        }
240
104
      } else {
241
1
        assert(range == count);
242
2
        
parallel_for1
(i, batch_size) {
243
2
          int j;
244
2
          const float gp = g->data.f32[i];
245
2
          float* const dp = d->data.f32 + i * count;
246
2
          float* const hp = h->data.f32 + i * count;
247
2
          float* const bp = b->data.f32 + i * count;
248
8
          for (j = 0; j < count; 
j++6
)
249
6
            hp[j] = gp * (dp[j] - bp[j]);
250
2
        } parallel_endfor
251
1
      }
252
105
    } else if (b->info.datatype == CCV_32S) {  (region "b->info.datatype == CCV_32S": count 2)
253
6
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++4
)
254
4
        { assert(d->info.dim[i] == h->info.dim[i]); }
255
2
      const float trim0 = cmd.info.label_smoothing.trim0;
256
2
      const float trim1 = cmd.info.label_smoothing.trim1;
257
2
      if (trim0 == 0 && trim1 == 1)  (region "trim1 == 1": count 1)
258
1
      {
259
2
        
parallel_for1
(i, batch_size) {
260
2
          int j;
261
2
          const float gp = g->data.f32[i];
262
2
          const int label = b->data.i32[i];
263
2
          float* const dp = d->data.f32 + i * count;
264
2
          float* const hp = h->data.f32 + i * count;
265
8
          for (j = 0; j < count; 
j++6
)
266
6
            hp[j] = gp * dp[j];
267
2
          hp[label] -= gp;
268
2
        } parallel_endfor
269
1
      } else {
270
2
        
parallel_for1
(i, batch_size) {
271
2
          int j;
272
2
          const float gp = g->data.f32[i];
273
2
          const int label = b->data.i32[i];
274
2
          float* const dp = d->data.f32 + i * count;
275
2
          float* const hp = h->data.f32 + i * count;
276
5
          for (j = 0; j < label; 
j++3
)
277
3
            hp[j] = gp * (dp[j] - trim0);
278
2
          hp[label] = gp * (dp[label] - trim1);
279
3
          for (j = label + 1; j < count; 
j++1
)
280
1
            hp[j] = gp * (dp[j] - trim0);
281
2
        } parallel_endfor
282
1
      }
283
2
    }
284
201
  } else {
285
201
    if (h->data.f32 != d->data.f32) // If not inplace replacement.
286
201
      memcpy(h->data.f32, d->data.f32, sizeof(float) * count * batch_size);
287
201
    if (b->info.datatype == CCV_32F)
288
200
    {
289
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
290
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
291
200
      const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);  (region counts: ccv_nnc_tensor_get_c(b->info): 0, "1": 0)
292
200
      if (range == 1)
293
200
      {
294
200
        const float trim0 = cmd.info.label_smoothing.trim0;
295
200
        const float trim1 = cmd.info.label_smoothing.trim1;
296
200
        if (trim0 == 0 && trim1 == 1)
297
200
        {
298
600
          for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++400
)
299
400
            { assert(d->info.dim[i] == h->info.dim[i]); }
300
200
          parallel_for(i, batch_size) {
301
200
            const int label = (int)(b->data.f32[i] + 0.5);
302
200
            float* const hp = h->data.f32 + i * count;
303
200
            hp[label] -= 1.;
304
200
          } parallel_endfor
305
200
        } else {
306
0
          for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
307
0
            { assert(d->info.dim[i] == h->info.dim[i]); }
308
0
          parallel_for(i, batch_size) {
309
0
            int j;
310
0
            const int label = (int)(b->data.f32[i] + 0.5);
311
0
            float* const hp = h->data.f32 + i * count;
312
0
            for (j = 0; j < label; j++)
313
0
              hp[j] -= trim0;
314
0
            hp[label] -= trim1;
315
0
            for (j = label + 1; j < count; j++)
316
0
              hp[j] -= trim0;
317
0
          } parallel_endfor
318
0
        }
319
200
      } else {
320
0
        assert(range == count);
321
0
        parallel_for(i, batch_size) {
322
0
          int j;
323
0
          float* const hp = h->data.f32 + i * count;
324
0
          float* const bp = b->data.f32 + i * count;
325
0
          for (j = 0; j < count; j++)
326
0
            hp[j] -= bp[j];
327
0
        } parallel_endfor
328
0
      }
329
200
    } else if (b->info.datatype == CCV_32S) {  (region "b->info.datatype == CCV_32S": count 1)
330
3
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++2
)
331
2
        { assert(d->info.dim[i] == h->info.dim[i]); }
332
1
      const float trim0 = cmd.info.label_smoothing.trim0;
333
1
      const float trim1 = cmd.info.label_smoothing.trim1;
334
1
      if (trim0 == 0 && trim1 == 1)
335
1
      {
336
2
        
parallel_for1
(i, batch_size) {
337
2
          const int label = b->data.i32[i];
338
2
          float* const hp = h->data.f32 + i * count;
339
2
          hp[label] -= 1.;
340
2
        } parallel_endfor
341
1
      } else {
342
0
        parallel_for(i, batch_size) {
343
0
          int j;
344
0
          const int label = b->data.i32[i];
345
0
          float* const hp = h->data.f32 + i * count;
346
0
          for (j = 0; j < label; j++)
347
0
            hp[j] -= trim0;
348
0
          hp[label] -= trim1;
349
0
          for (j = label + 1; j < count; j++)
350
0
            hp[j] -= trim0;
351
0
        } parallel_endfor
352
0
      }
353
1
    }
354
201
  }
355
308
  return CCV_NNC_EXEC_SUCCESS;
356
308
}
357
358
REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
359
1
{
360
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
361
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
362
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
363
1
  registry->algorithms = 1;
364
1
  registry->exec = _ccv_nnc_softmax_crossentropy_forw;
365
1
}
366
367
REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
368
1
{
369
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
370
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
371
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
372
1
  registry->algorithms = 1;
373
1
  registry->exec = _ccv_nnc_softmax_crossentropy_back;
374
1
}