Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/softmax_loss/ccv_nnc_softmax_crossentropy_cpu_ref.c
(Per-line execution counts from the coverage viewer are omitted below; in this run the forward kernel body executed 316 times and the backward kernel body 308 times.)
#include "ccv.h"
#include "ccv_internal.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include "nnc/ccv_nnc_internal.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif

static int _ccv_nnc_softmax_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 2);
	const ccv_nnc_tensor_t* a = inputs[0];
	assert(!CCV_IS_TENSOR_VIEW(a));
	const ccv_nnc_tensor_t* b = inputs[1];
	assert(!CCV_IS_TENSOR_VIEW(b));
	assert(output_size == 2);
	ccv_nnc_tensor_t* c = outputs[0];
	assert(!c || !CCV_IS_TENSOR_VIEW(c));
	ccv_nnc_tensor_t* d = outputs[1];
	assert(!CCV_IS_TENSOR_VIEW(d));
	const int axis_count = ccv_nnc_tensor_nd(a->info.dim);
	const int batch_size = axis_count < 2 ? 1 : a->info.dim[0];
	const int count = ccv_nnc_tensor_count(a->info) / batch_size;
	int i;
	if (c)
	{
		assert(ccv_nnc_tensor_count(c->info) == batch_size);
		if (b->info.datatype == CCV_32F)
		{
			// If the tensor has more than 1 axis, the range is the channel count. Otherwise, if the batch size is 1,
			// the range is the channel count; otherwise the range is 1 (and the only axis is the batch size).
			const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
			if (range == 1)
			{
				for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
					{ assert(a->info.dim[i] == d->info.dim[i]); }
				const float trim0 = cmd.info.label_smoothing.trim0;
				const float trim1 = cmd.info.label_smoothing.trim1;
				if (trim0 == 0 && trim1 == 1)
				{
					parallel_for(i, batch_size) {
						int j;
						float* const ap = a->data.f32 + i * count;
						float* const dp = d->data.f32 + i * count;
						double maxval = ap[0];
						for (j = 1; j < count; j++)
							if (ap[j] > maxval)
								maxval = ap[j];
						const int label = (int)(b->data.f32[i] + 0.5);
						assert(label >= 0 && label < count);
						c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
						double sumval = 0;
						for (j = 0; j < count; j++)
							sumval += (dp[j] = expf(ap[j] - maxval));
						sumval = 1.0 / sumval;
						for (j = 0; j < count; j++)
							dp[j] *= sumval;
					} parallel_endfor
				} else {
					parallel_for(i, batch_size) {
						int j;
						float* const ap = a->data.f32 + i * count;
						float* const dp = d->data.f32 + i * count;
						double maxval = ap[0];
						for (j = 1; j < count; j++)
							if (ap[j] > maxval)
								maxval = ap[j];
						const int label = (int)(b->data.f32[i] + 0.5);
						assert(label >= 0 && label < count);
						float p = 0;
						for (j = 0; j < label; j++)
							p += trim0 * (maxval - ap[j]);
						p += trim1 * (maxval - ap[label]);
						for (j = label + 1; j < count; j++)
							p += trim0 * (maxval - ap[j]);
						c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
						double sumval = 0;
						for (j = 0; j < count; j++)
							sumval += (dp[j] = expf(ap[j] - maxval));
						sumval = 1.0 / sumval;
						for (j = 0; j < count; j++)
							dp[j] *= sumval;
					} parallel_endfor
				}
			} else {
				assert(range == count);
				parallel_for(i, batch_size) {
					int j;
					float* const ap = a->data.f32 + i * count;
					float* const bp = b->data.f32 + i * count;
					float* const dp = d->data.f32 + i * count;
					double maxval = ap[0];
					for (j = 1; j < count; j++)
						if (ap[j] > maxval)
							maxval = ap[j];
					float p = 0;
					for (j = 0; j < count; j++)
						p += bp[j] * (maxval - ap[j]);
					c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
					double sumval = 0;
					for (j = 0; j < count; j++)
						sumval += (dp[j] = expf(ap[j] - maxval));
					sumval = 1.0 / sumval;
					for (j = 0; j < count; j++)
						dp[j] *= sumval;
				} parallel_endfor
			}
		} else if (b->info.datatype == CCV_32S) {
			for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
				{ assert(a->info.dim[i] == d->info.dim[i]); }
			const float trim0 = cmd.info.label_smoothing.trim0;
			const float trim1 = cmd.info.label_smoothing.trim1;
			if (trim0 == 0 && trim1 == 1)
			{
				parallel_for(i, batch_size) {
					int j;
					float* const ap = a->data.f32 + i * count;
					float* const dp = d->data.f32 + i * count;
					double maxval = ap[0];
					for (j = 1; j < count; j++)
						if (ap[j] > maxval)
							maxval = ap[j];
					const int label = b->data.i32[i];
					assert(label >= 0 && label < count);
					c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
					double sumval = 0;
					for (j = 0; j < count; j++)
						sumval += (dp[j] = expf(ap[j] - maxval));
					sumval = 1.0 / sumval;
					for (j = 0; j < count; j++)
						dp[j] *= sumval;
				} parallel_endfor
			} else {
				parallel_for(i, batch_size) {
					int j;
					float* const ap = a->data.f32 + i * count;
					float* const dp = d->data.f32 + i * count;
					double maxval = ap[0];
					for (j = 1; j < count; j++)
						if (ap[j] > maxval)
							maxval = ap[j];
					const int label = b->data.i32[i];
					assert(label >= 0 && label < count);
					float p = 0;
					for (j = 0; j < label; j++)
						p += trim0 * (maxval - ap[j]);
					p += trim1 * (maxval - ap[label]);
					for (j = label + 1; j < count; j++)
						p += trim0 * (maxval - ap[j]);
					c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
					double sumval = 0;
					for (j = 0; j < count; j++)
						sumval += (dp[j] = expf(ap[j] - maxval));
					sumval = 1.0 / sumval;
					for (j = 0; j < count; j++)
						dp[j] *= sumval;
				} parallel_endfor
			}
		}
	} else {
		// No loss calculation, just vanilla softmax.
		parallel_for(i, batch_size) {
			int j;
			float* const ap = a->data.f32 + i * count;
			float* const dp = d->data.f32 + i * count;
			double maxval = ap[0];
			for (j = 1; j < count; j++)
				if (ap[j] > maxval)
					maxval = ap[j];
			double sumval = 0;
			for (j = 0; j < count; j++)
				sumval += (dp[j] = expf(ap[j] - maxval));
			sumval = 1.0 / sumval;
			for (j = 0; j < count; j++)
				dp[j] *= sumval;
		} parallel_endfor
	}
	return CCV_NNC_EXEC_SUCCESS;
}
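
The forward kernel above relies on the standard max-subtraction (log-sum-exp) trick: every argument passed to expf is first shifted by the row maximum, so the largest exponent is 0 and expf cannot overflow, and per the source comment the loss term maxval - ap[label] is assigned before exponentiation so a logf on the normalizer can be avoided. Below is a minimal, self-contained sketch of the same numeric-stability pattern, written for this report; the names and sizes (logits, probs, N) are illustrative and not taken from the file.

/* Minimal sketch of the max-subtraction trick used above (names illustrative). */
#include <math.h>
#include <stdio.h>

#define N 4

int main(void)
{
	const float logits[N] = { 1000.f, 1001.f, 1002.f, 1003.f }; /* naive expf(1000.f) overflows to inf */
	float probs[N];
	float maxval = logits[0];
	int j;
	for (j = 1; j < N; j++)
		if (logits[j] > maxval)
			maxval = logits[j];
	double sumval = 0;
	for (j = 0; j < N; j++)
		sumval += (probs[j] = expf(logits[j] - maxval)); /* shifted exponents are <= 0 */
	for (j = 0; j < N; j++)
		probs[j] /= (float)sumval;
	for (j = 0; j < N; j++)
		printf("p[%d] = %f\n", j, probs[j]);
	return 0;
}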

static int _ccv_nnc_softmax_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 6);
	assert(output_size >= 1);
	const ccv_nnc_tensor_t* g = inputs[0];
	assert(!g || !CCV_IS_TENSOR_VIEW(g));
	const ccv_nnc_tensor_t* b = inputs[3];
	assert(!CCV_IS_TENSOR_VIEW(b));
	const ccv_nnc_tensor_t* d = inputs[5];
	assert(!CCV_IS_TENSOR_VIEW(d));
	ccv_nnc_tensor_t* h = outputs[0];
	assert(!CCV_IS_TENSOR_VIEW(h));
	const int axis_count = ccv_nnc_tensor_nd(d->info.dim);
	const int batch_size = axis_count < 2 ? 1 : d->info.dim[0];
	const int count = ccv_nnc_tensor_count(d->info) / batch_size;
	int i;
	if (g)
	{
		if (b->info.datatype == CCV_32F)
		{
			// If the tensor has more than 1 axis, the range is the channel count. Otherwise, if the batch size is 1,
			// the range is the channel count; otherwise the range is 1 (and the only axis is the batch size).
			const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
			if (range == 1)
			{
				for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
					{ assert(d->info.dim[i] == h->info.dim[i]); }
				const float trim0 = cmd.info.label_smoothing.trim0;
				const float trim1 = cmd.info.label_smoothing.trim1;
				if (trim0 == 0 && trim1 == 1)
				{
					parallel_for(i, batch_size) {
						int j;
						const float gp = g->data.f32[i];
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const dp = d->data.f32 + i * count;
						float* const hp = h->data.f32 + i * count;
						for (j = 0; j < count; j++)
							hp[j] = gp * dp[j];
						hp[label] -= gp;
					} parallel_endfor
				} else {
					parallel_for(i, batch_size) {
						int j;
						const float gp = g->data.f32[i];
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const dp = d->data.f32 + i * count;
						float* const hp = h->data.f32 + i * count;
						for (j = 0; j < label; j++)
							hp[j] = gp * (dp[j] - trim0);
						hp[label] = gp * (dp[label] - trim1);
						for (j = label + 1; j < count; j++)
							hp[j] = gp * (dp[j] - trim0);
					} parallel_endfor
				}
			} else {
				assert(range == count);
				parallel_for(i, batch_size) {
					int j;
					const float gp = g->data.f32[i];
					float* const dp = d->data.f32 + i * count;
					float* const hp = h->data.f32 + i * count;
					float* const bp = b->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] = gp * (dp[j] - bp[j]);
				} parallel_endfor
			}
		} else if (b->info.datatype == CCV_32S) {
			for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
				{ assert(d->info.dim[i] == h->info.dim[i]); }
			const float trim0 = cmd.info.label_smoothing.trim0;
			const float trim1 = cmd.info.label_smoothing.trim1;
			if (trim0 == 0 && trim1 == 1)
			{
				parallel_for(i, batch_size) {
					int j;
					const float gp = g->data.f32[i];
					const int label = b->data.i32[i];
					float* const dp = d->data.f32 + i * count;
					float* const hp = h->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] = gp * dp[j];
					hp[label] -= gp;
				} parallel_endfor
			} else {
				parallel_for(i, batch_size) {
					int j;
					const float gp = g->data.f32[i];
					const int label = b->data.i32[i];
					float* const dp = d->data.f32 + i * count;
					float* const hp = h->data.f32 + i * count;
					for (j = 0; j < label; j++)
						hp[j] = gp * (dp[j] - trim0);
					hp[label] = gp * (dp[label] - trim1);
					for (j = label + 1; j < count; j++)
						hp[j] = gp * (dp[j] - trim0);
				} parallel_endfor
			}
		}
	} else {
		if (h->data.f32 != d->data.f32) // If not an in-place replacement.
			memcpy(h->data.f32, d->data.f32, sizeof(float) * count * batch_size);
		if (b->info.datatype == CCV_32F)
		{
			// If the tensor has more than 1 axis, the range is the channel count. Otherwise, if the batch size is 1,
			// the range is the channel count; otherwise the range is 1 (and the only axis is the batch size).
			const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
			if (range == 1)
			{
				const float trim0 = cmd.info.label_smoothing.trim0;
				const float trim1 = cmd.info.label_smoothing.trim1;
				if (trim0 == 0 && trim1 == 1)
				{
					for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
						{ assert(d->info.dim[i] == h->info.dim[i]); }
					parallel_for(i, batch_size) {
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const hp = h->data.f32 + i * count;
						hp[label] -= 1.;
					} parallel_endfor
				} else {
					for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
						{ assert(d->info.dim[i] == h->info.dim[i]); }
					parallel_for(i, batch_size) {
						int j;
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const hp = h->data.f32 + i * count;
						for (j = 0; j < label; j++)
							hp[j] -= trim0;
						hp[label] -= trim1;
						for (j = label + 1; j < count; j++)
							hp[j] -= trim0;
					} parallel_endfor
				}
			} else {
				assert(range == count);
				parallel_for(i, batch_size) {
					int j;
					float* const hp = h->data.f32 + i * count;
					float* const bp = b->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] -= bp[j];
				} parallel_endfor
			}
		} else if (b->info.datatype == CCV_32S) {
			for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; i++)
				{ assert(d->info.dim[i] == h->info.dim[i]); }
			const float trim0 = cmd.info.label_smoothing.trim0;
			const float trim1 = cmd.info.label_smoothing.trim1;
			if (trim0 == 0 && trim1 == 1)
			{
				parallel_for(i, batch_size) {
					const int label = b->data.i32[i];
					float* const hp = h->data.f32 + i * count;
					hp[label] -= 1.;
				} parallel_endfor
			} else {
				parallel_for(i, batch_size) {
					int j;
					const int label = b->data.i32[i];
					float* const hp = h->data.f32 + i * count;
					for (j = 0; j < label; j++)
						hp[j] -= trim0;
					hp[label] -= trim1;
					for (j = label + 1; j < count; j++)
						hp[j] -= trim0;
				} parallel_endfor
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
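
The backward kernel computes the usual closed form of the fused softmax cross-entropy gradient: with the softmax p saved from the forward pass (the dp buffer) and a target distribution y, the per-element gradient is gp * (p[j] - y[j]). With hard labels, y is 1 at label and 0 elsewhere, which the code expresses as hp[j] = gp * dp[j] followed by hp[label] -= gp; with label smoothing, y is trim1 at the label and trim0 elsewhere. Below is a one-row standalone sketch of that form; all names are illustrative, not taken from the file.

/* One-row sketch of the gradient form used above: grad[j] = g * (p[j] - y[j]). */
#include <stdio.h>

#define N 4

int main(void)
{
	const float p[N] = { 0.1f, 0.2f, 0.3f, 0.4f }; /* softmax saved from the forward pass */
	const int label = 2;
	const float g = 1.f;     /* incoming gradient for this row */
	const float trim0 = 0.f; /* target mass off the label; 0 when no smoothing */
	const float trim1 = 1.f; /* target mass at the label; 1 when no smoothing */
	float grad[N];
	int j;
	for (j = 0; j < N; j++)
		grad[j] = g * (p[j] - (j == label ? trim1 : trim0));
	for (j = 0; j < N; j++)
		printf("grad[%d] = %f\n", j, grad[j]);
	return 0;
}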

REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
	registry->tensor_datatypes = CCV_32F | CCV_32S;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_softmax_crossentropy_forw;
}

REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
	registry->tensor_datatypes = CCV_32F | CCV_32S;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_softmax_crossentropy_back;
}
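
For reference, one common convention for choosing the trim values (an assumption about how callers might set them, not something this file prescribes) is trim1 = 1 - eps and trim0 = eps / (count - 1) for a smoothing factor eps, which keeps the smoothed target summing to 1; the trim0 == 0 && trim1 == 1 fast path above is exactly the eps = 0 case. A tiny hypothetical check:

/* Hypothetical helper illustrating one label-smoothing convention;
   eps and this trim0/trim1 mapping are assumptions, not taken from this file. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	const int count = 10;   /* number of classes */
	const float eps = 0.1f; /* smoothing factor */
	const float trim1 = 1.f - eps;
	const float trim0 = eps / (count - 1);
	/* The smoothed target still sums to 1: trim1 + (count - 1) * trim0. */
	const float total = trim1 + (count - 1) * trim0;
	printf("trim0 = %f, trim1 = %f, total = %f\n", trim0, trim1, total);
	assert(total > 0.9999f && total < 1.0001f);
	return 0;
}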