Coverage Report

Created: 2022-07-27 23:53

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/softmax_loss/ccv_nnc_softmax_crossentropy_cpu_ref.c
Every line of the forward pass was executed in this run; the only lines never executed are in the backward pass's branch for a missing incoming gradient, specifically its label-smoothing and dense per-class target paths.
#include "ccv.h"
#include "ccv_internal.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include "nnc/ccv_nnc_internal.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif

static int _ccv_nnc_softmax_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size == 2);
  const ccv_nnc_tensor_t* a = inputs[0];
  assert(CCV_IS_TENSOR_CONTIGUOUS(a));
  const ccv_nnc_tensor_t* b = inputs[1];
  assert(CCV_IS_TENSOR_CONTIGUOUS(b));
  assert(output_size == 2);
  ccv_nnc_tensor_t* c = outputs[0];
  assert(!c || !CCV_IS_TENSOR_VIEW(c));
  ccv_nnc_tensor_t* d = outputs[1];
  assert(CCV_IS_TENSOR_CONTIGUOUS(d));
  const int axis_count = ccv_nnc_tensor_nd(a->info.dim);
  const int batch_size = axis_count < 2 ? 1 : a->info.dim[0];
  const int count = ccv_nnc_tensor_count(a->info) / batch_size;
  int i;
  if (c)
  {
    assert(ccv_nnc_tensor_count(c->info) == batch_size);
    if (b->info.datatype == CCV_32F)
    {
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
      const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
      if (range == 1)
      {
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
          { assert(a->info.dim[i] == d->info.dim[i]); }
        const float trim0 = cmd.info.label_smoothing.trim0;
        const float trim1 = cmd.info.label_smoothing.trim1;
        if (trim0 == 0 && trim1 == 1)
        {
          parallel_for(i, batch_size) {
            int j;
            float* const ap = a->data.f32 + i * count;
            float* const dp = d->data.f32 + i * count;
            double maxval = ap[0];
            for (j = 1; j < count; j++)
              if (ap[j] > maxval)
                maxval = ap[j];
            const int label = (int)(b->data.f32[i] + 0.5);
            assert(label >= 0 && label < count);
            c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
            double sumval = 0;
            for (j = 0; j < count; j++)
              sumval += (dp[j] = expf(ap[j] - maxval));
            sumval = 1.0 / sumval;
            for (j = 0; j < count; j++)
              dp[j] *= sumval;
          } parallel_endfor
        } else {
          parallel_for(i, batch_size) {
            int j;
            float* const ap = a->data.f32 + i * count;
            float* const dp = d->data.f32 + i * count;
            double maxval = ap[0];
            for (j = 1; j < count; j++)
              if (ap[j] > maxval)
                maxval = ap[j];
            const int label = (int)(b->data.f32[i] + 0.5);
            assert(label >= 0 && label < count);
            float p = 0;
            for (j = 0; j < label; j++)
              p += trim0 * (maxval - ap[j]);
            p += trim1 * (maxval - ap[label]);
            for (j = label + 1; j < count; j++)
              p += trim0 * (maxval - ap[j]);
            c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
            double sumval = 0;
            for (j = 0; j < count; j++)
              sumval += (dp[j] = expf(ap[j] - maxval));
            sumval = 1.0 / sumval;
            for (j = 0; j < count; j++)
              dp[j] *= sumval;
          } parallel_endfor
        }
      } else {
        assert(range == count);
        parallel_for(i, batch_size) {
          int j;
          float* const ap = a->data.f32 + i * count;
          float* const bp = b->data.f32 + i * count;
          float* const dp = d->data.f32 + i * count;
          double maxval = ap[0];
          for (j = 1; j < count; j++)
            if (ap[j] > maxval)
              maxval = ap[j];
          float p = 0;
          for (j = 0; j < count; j++)
            p += bp[j] * (maxval - ap[j]);
          c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
          double sumval = 0;
          for (j = 0; j < count; j++)
            sumval += (dp[j] = expf(ap[j] - maxval));
          sumval = 1.0 / sumval;
          for (j = 0; j < count; j++)
            dp[j] *= sumval;
        } parallel_endfor
      }
    } else if (b->info.datatype == CCV_32S) {
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
        { assert(a->info.dim[i] == d->info.dim[i]); }
      const float trim0 = cmd.info.label_smoothing.trim0;
      const float trim1 = cmd.info.label_smoothing.trim1;
      if (trim0 == 0 && trim1 == 1)
      {
        parallel_for(i, batch_size) {
          int j;
          float* const ap = a->data.f32 + i * count;
          float* const dp = d->data.f32 + i * count;
          double maxval = ap[0];
          for (j = 1; j < count; j++)
            if (ap[j] > maxval)
              maxval = ap[j];
          const int label = b->data.i32[i];
          assert(label >= 0 && label < count);
          c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
          double sumval = 0;
          for (j = 0; j < count; j++)
            sumval += (dp[j] = expf(ap[j] - maxval));
          sumval = 1.0 / sumval;
          for (j = 0; j < count; j++)
            dp[j] *= sumval;
        } parallel_endfor
      } else {
        parallel_for(i, batch_size) {
          int j;
          float* const ap = a->data.f32 + i * count;
          float* const dp = d->data.f32 + i * count;
          double maxval = ap[0];
          for (j = 1; j < count; j++)
            if (ap[j] > maxval)
              maxval = ap[j];
          const int label = b->data.i32[i];
          assert(label >= 0 && label < count);
          float p = 0;
          for (j = 0; j < label; j++)
            p += trim0 * (maxval - ap[j]);
          p += trim1 * (maxval - ap[label]);
          for (j = label + 1; j < count; j++)
            p += trim0 * (maxval - ap[j]);
          c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
          double sumval = 0;
          for (j = 0; j < count; j++)
            sumval += (dp[j] = expf(ap[j] - maxval));
          sumval = 1.0 / sumval;
          for (j = 0; j < count; j++)
            dp[j] *= sumval;
        } parallel_endfor
      }
    }
  } else {
    // No loss calculation, just vanilla softmax.
    parallel_for(i, batch_size) {
      int j;
      float* const ap = a->data.f32 + i * count;
      float* const dp = d->data.f32 + i * count;
      double maxval = ap[0];
      for (j = 1; j < count; j++)
        if (ap[j] > maxval)
          maxval = ap[j];
      double sumval = 0;
      for (j = 0; j < count; j++)
        sumval += (dp[j] = expf(ap[j] - maxval));
      sumval = 1.0 / sumval;
      for (j = 0; j < count; j++)
        dp[j] *= sumval;
    } parallel_endfor
  }
  return CCV_NNC_EXEC_SUCCESS;
}
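
The recurring comment about assigning the loss before expf is the heart of the forward kernel: subtracting the row maximum keeps expf from overflowing, and accumulating maxval - ap[label] (or its label-smoothed weighting) up front means the kernel never has to take logf of each probability. As a point of comparison only, here is a minimal standalone sketch (plain C, no ccv types or parallel_for) of the textbook numerically stable softmax cross-entropy for one row; unlike the kernel above it folds the log of the normalizer into the returned loss, so treat it as an illustration of the underlying math rather than a drop-in replica:

#include <assert.h>
#include <math.h>
#include <stdio.h>

/* Standalone sketch: numerically stable softmax + cross-entropy for one row.
 * Writes softmax(a) into d and returns -log(softmax(a)[label]), computed with
 * a single log on the normalizer after subtracting the row maximum. */
static float row_softmax_xent(const float* a, int count, int label, float* d)
{
  float maxval = a[0];
  for (int j = 1; j < count; j++)
    if (a[j] > maxval)
      maxval = a[j];
  double sumval = 0;
  for (int j = 0; j < count; j++)
    sumval += (d[j] = expf(a[j] - maxval));
  for (int j = 0; j < count; j++)
    d[j] /= sumval;
  return maxval - a[label] + (float)log(sumval);
}

int main(void)
{
  const float a[4] = { 1.f, 2.f, 3.f, 4.f };
  float d[4];
  const float loss = row_softmax_xent(a, 4, 2, d);
  /* Cross-check against the naive -logf(softmax[label]) formulation. */
  assert(fabsf(loss - (-logf(d[2]))) < 1e-5f);
  printf("loss = %f, softmax[2] = %f\n", loss, d[2]);
  return 0;
}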

static int _ccv_nnc_softmax_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size >= 6);
  assert(output_size >= 1);
  const ccv_nnc_tensor_t* g = inputs[0];
  assert(!g || !CCV_IS_TENSOR_VIEW(g));
  const ccv_nnc_tensor_t* b = inputs[3];
  assert(CCV_IS_TENSOR_CONTIGUOUS(b));
  const ccv_nnc_tensor_t* d = inputs[5];
  assert(CCV_IS_TENSOR_CONTIGUOUS(d));
  ccv_nnc_tensor_t* h = outputs[0];
  assert(CCV_IS_TENSOR_CONTIGUOUS(h));
  const int axis_count = ccv_nnc_tensor_nd(d->info.dim);
  const int batch_size = axis_count < 2 ? 1 : d->info.dim[0];
  const int count = ccv_nnc_tensor_count(d->info) / batch_size;
  int i;
  if (g)
  {
    if (b->info.datatype == CCV_32F)
    {
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
      const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
      if (range == 1)
      {
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
          { assert(d->info.dim[i] == h->info.dim[i]); }
        const float trim0 = cmd.info.label_smoothing.trim0;
        const float trim1 = cmd.info.label_smoothing.trim1;
        if (trim0 == 0 && trim1 == 1)
        {
          parallel_for(i, batch_size) {
            int j;
            const float gp = g->data.f32[i];
            const int label = (int)(b->data.f32[i] + 0.5);
            float* const dp = d->data.f32 + i * count;
            float* const hp = h->data.f32 + i * count;
            for (j = 0; j < count; j++)
              hp[j] = gp * dp[j];
            hp[label] -= gp;
          } parallel_endfor
        } else {
          parallel_for(i, batch_size) {
            int j;
            const float gp = g->data.f32[i];
            const int label = (int)(b->data.f32[i] + 0.5);
            float* const dp = d->data.f32 + i * count;
            float* const hp = h->data.f32 + i * count;
            for (j = 0; j < label; j++)
              hp[j] = gp * (dp[j] - trim0);
            hp[label] = gp * (dp[label] - trim1);
            for (j = label + 1; j < count; j++)
              hp[j] = gp * (dp[j] - trim0);
          } parallel_endfor
        }
      } else {
        assert(range == count);
        parallel_for(i, batch_size) {
          int j;
          const float gp = g->data.f32[i];
          float* const dp = d->data.f32 + i * count;
          float* const hp = h->data.f32 + i * count;
          float* const bp = b->data.f32 + i * count;
          for (j = 0; j < count; j++)
            hp[j] = gp * (dp[j] - bp[j]);
        } parallel_endfor
      }
    } else if (b->info.datatype == CCV_32S) {
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
        { assert(d->info.dim[i] == h->info.dim[i]); }
      const float trim0 = cmd.info.label_smoothing.trim0;
      const float trim1 = cmd.info.label_smoothing.trim1;
      if (trim0 == 0 && trim1 == 1)
      {
        parallel_for(i, batch_size) {
          int j;
          const float gp = g->data.f32[i];
          const int label = b->data.i32[i];
          float* const dp = d->data.f32 + i * count;
          float* const hp = h->data.f32 + i * count;
          for (j = 0; j < count; j++)
            hp[j] = gp * dp[j];
          hp[label] -= gp;
        } parallel_endfor
      } else {
        parallel_for(i, batch_size) {
          int j;
          const float gp = g->data.f32[i];
          const int label = b->data.i32[i];
          float* const dp = d->data.f32 + i * count;
          float* const hp = h->data.f32 + i * count;
          for (j = 0; j < label; j++)
            hp[j] = gp * (dp[j] - trim0);
          hp[label] = gp * (dp[label] - trim1);
          for (j = label + 1; j < count; j++)
            hp[j] = gp * (dp[j] - trim0);
        } parallel_endfor
      }
    }
  } else {
    if (h->data.f32 != d->data.f32) // If not inplace replacement.
      memcpy(h->data.f32, d->data.f32, sizeof(float) * count * batch_size);
    if (b->info.datatype == CCV_32F)
    {
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
      const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
      if (range == 1)
      {
        const float trim0 = cmd.info.label_smoothing.trim0;
        const float trim1 = cmd.info.label_smoothing.trim1;
        if (trim0 == 0 && trim1 == 1)
        {
          for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
            { assert(d->info.dim[i] == h->info.dim[i]); }
          parallel_for(i, batch_size) {
            const int label = (int)(b->data.f32[i] + 0.5);
            float* const hp = h->data.f32 + i * count;
            hp[label] -= 1.;
          } parallel_endfor
        } else {
          for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
            { assert(d->info.dim[i] == h->info.dim[i]); }
          parallel_for(i, batch_size) {
            int j;
            const int label = (int)(b->data.f32[i] + 0.5);
            float* const hp = h->data.f32 + i * count;
            for (j = 0; j < label; j++)
              hp[j] -= trim0;
            hp[label] -= trim1;
            for (j = label + 1; j < count; j++)
              hp[j] -= trim0;
          } parallel_endfor
        }
      } else {
        assert(range == count);
        parallel_for(i, batch_size) {
          int j;
          float* const hp = h->data.f32 + i * count;
          float* const bp = b->data.f32 + i * count;
          for (j = 0; j < count; j++)
            hp[j] -= bp[j];
        } parallel_endfor
      }
    } else if (b->info.datatype == CCV_32S) {
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
        { assert(d->info.dim[i] == h->info.dim[i]); }
      const float trim0 = cmd.info.label_smoothing.trim0;
      const float trim1 = cmd.info.label_smoothing.trim1;
      if (trim0 == 0 && trim1 == 1)
      {
        parallel_for(i, batch_size) {
          const int label = b->data.i32[i];
          float* const hp = h->data.f32 + i * count;
          hp[label] -= 1.;
        } parallel_endfor
      } else {
        parallel_for(i, batch_size) {
          int j;
          const int label = b->data.i32[i];
          float* const hp = h->data.f32 + i * count;
          for (j = 0; j < label; j++)
            hp[j] -= trim0;
          hp[label] -= trim1;
          for (j = label + 1; j < count; j++)
            hp[j] -= trim0;
        } parallel_endfor
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
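
The backward kernel relies on the standard identity that, with dp holding the softmax from the forward pass, the gradient of the cross-entropy with respect to the logits is the softmax minus the target, scaled by the incoming gradient gp; that is exactly hp[j] = gp * dp[j] followed by hp[label] -= gp, and label smoothing merely replaces the one-hot target with trim1 at the label and trim0 elsewhere. A small self-contained check of that identity against a central finite difference (plain C; the helper name xent and the step size are illustrative):

#include <math.h>
#include <stdio.h>

/* Numerically stable softmax cross-entropy for one row; the +log(sum) term is
 * included so the finite-difference check below is meaningful. */
static double xent(const float* a, int count, int label)
{
  double maxval = a[0];
  for (int j = 1; j < count; j++)
    if (a[j] > maxval)
      maxval = a[j];
  double sumval = 0;
  for (int j = 0; j < count; j++)
    sumval += exp(a[j] - maxval);
  return maxval - a[label] + log(sumval);
}

int main(void)
{
  float a[4] = { 0.5f, -1.f, 2.f, 0.f };
  const int label = 2, count = 4;
  /* Analytic gradient: softmax(a) - one_hot(label), i.e. hp[j] = gp * dp[j]; hp[label] -= gp with gp = 1. */
  double maxval = a[0], sumval = 0, softmax[4];
  for (int j = 1; j < count; j++)
    if (a[j] > maxval)
      maxval = a[j];
  for (int j = 0; j < count; j++)
    sumval += (softmax[j] = exp(a[j] - maxval));
  for (int j = 0; j < count; j++) {
    const double analytic = softmax[j] / sumval - (j == label ? 1. : 0.);
    /* Central finite difference on the loss. */
    const float saved = a[j], eps = 1e-3f;
    a[j] = saved + eps;
    const double up = xent(a, count, label);
    a[j] = saved - eps;
    const double down = xent(a, count, label);
    a[j] = saved;
    printf("j=%d analytic=%.6f numeric=%.6f\n", j, analytic, (up - down) / (2 * eps));
  }
  return 0;
}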

REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
  registry->tensor_datatypes = CCV_32F | CCV_32S;
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->algorithms = 1;
  registry->exec = _ccv_nnc_softmax_crossentropy_forw;
}

REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
  registry->tensor_datatypes = CCV_32F | CCV_32S;
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->algorithms = 1;
  registry->exec = _ccv_nnc_softmax_crossentropy_back;
}
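
These registrations are what let the generic command API dispatch to the reference kernels above. For orientation only, here is a sketch of how the forward command is typically driven from user code, modeled on the conventions in nnc/ccv_nnc_easy.h and ccv's test suite; treat CMD_SOFTMAX_CROSSENTROPY_FORWARD(), CPU_TENSOR_NHWC, TENSOR_LIST, and the ccv_nnc_init() call as assumptions to verify against the headers rather than guarantees:

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

int main(void)
{
  ccv_nnc_init(); /* Assumed entry point that registers command backends, including the CPU_REF ones above. */
  /* Two rows of three logits and one integer label per row (the CCV_32S path in the kernel). */
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  a->data.f32[0] = 1; a->data.f32[1] = 2; a->data.f32[2] = 3;
  a->data.f32[3] = 3; a->data.f32[4] = 2; a->data.f32[5] = 1;
  b->data.i32[0] = 2; b->data.i32[1] = 0;
  /* inputs = (activations, labels), outputs = (loss, softmax), matching input_size == 2 / output_size == 2 above. */
  ccv_nnc_cmd_exec(CMD_SOFTMAX_CROSSENTROPY_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c, d), 0);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(d);
  return 0;
}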