Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/loss/ccv_nnc_categorical_crossentropy_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Forward pass of categorical cross-entropy loss on CPU.
// Inputs: inputs[0] = a, the predicted probabilities, shape (batch, count) (or 1-D for batch 1);
//         inputs[1] = b, the labels, either class indices (CCV_32F rounded, or CCV_32S) with one
//         entry per batch item, or a full per-class distribution of the same shape as a.
// Output: outputs[0] = c, one loss value per batch item.
// Label smoothing: cmd.info.label_smoothing.trim0 is the weight for non-target classes and
// trim1 for the target class; trim0 == 0 && trim1 == 1 is the plain (unsmoothed) loss.
static int _ccv_nnc_categorical_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 2);
	const ccv_nnc_tensor_t* a = inputs[0];
	assert(CCV_IS_TENSOR_CONTIGUOUS(a));
	const ccv_nnc_tensor_t* b = inputs[1];
	assert(CCV_IS_TENSOR_CONTIGUOUS(b));
	assert(output_size == 1);
	ccv_nnc_tensor_t* c = outputs[0];
	assert(CCV_IS_TENSOR_CONTIGUOUS(c));
	const int axis_count = ccv_nnc_tensor_nd(a->info.dim);
	// A 1-D input is a single sample; otherwise the leading axis is the batch.
	const int batch_size = axis_count < 2 ? 1 : a->info.dim[0];
	const int count = ccv_nnc_tensor_count(a->info) / batch_size;
	int i;
	if (b->info.datatype == CCV_32F)
	{
		// If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
		// the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
		const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
		if (range == 1)
		{
			// Index labels: b holds one class index (as float) per batch item.
			for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && b->info.dim[i] > 0; i++)
				{ assert(b->info.dim[i] == c->info.dim[i]); }
			const float trim0 = cmd.info.label_smoothing.trim0;
			const float trim1 = cmd.info.label_smoothing.trim1;
			if (trim0 == 0 && trim1 == 1)
			{
				// No label smoothing: loss is -log p(target) only.
				parallel_for(i, batch_size) {
					const int label = (int)(b->data.f32[i] + 0.5); // round float label to nearest int
					assert(label >= 0 && label < count);
					const float p = a->data.f32[i * count + label];
					c->data.f32[i] = -logf(p);
				} parallel_endfor
			} else {
				// Label smoothing: weight -log p by trim1 at the target class, trim0 elsewhere.
				parallel_for(i, batch_size) {
					const int label = (int)(b->data.f32[i] + 0.5);
					assert(label >= 0 && label < count);
					int j;
					float p = 0;
					float* const ap = a->data.f32 + i * count;
					for (j = 0; j < label; j++)
						p += -trim0 * logf(ap[j]);
					p += -trim1 * logf(ap[label]);
					for (j = label + 1; j < count; j++)
						p += -trim0 * logf(ap[j]);
					c->data.f32[i] = p;
				} parallel_endfor
			}
		} else {
			// Distribution labels: b has one probability per class, same shape as a.
			assert(range == count);
			parallel_for(i, batch_size) {
				int j;
				float p = 0;
				float* const bp = b->data.f32 + i * count;
				float* const ap = a->data.f32 + i * count;
				for (j = 0; j < count; j++)
					p += -bp[j] * logf(ap[j]);
				c->data.f32[i] = p;
			} parallel_endfor
		}
	} else if (b->info.datatype == CCV_32S) {
		// Integer index labels; same logic as the float index-label path without rounding.
		for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && b->info.dim[i] > 0; i++)
			{ assert(b->info.dim[i] == c->info.dim[i]); }
		const float trim0 = cmd.info.label_smoothing.trim0;
		const float trim1 = cmd.info.label_smoothing.trim1;
		if (trim0 == 0 && trim1 == 1)
		{
			parallel_for(i, batch_size) {
				const int label = b->data.i32[i];
				assert(label >= 0 && label < count);
				const float p = a->data.f32[i * count + label];
				c->data.f32[i] = -logf(p);
			} parallel_endfor
		} else {
			parallel_for(i, batch_size) {
				const int label = b->data.i32[i];
				assert(label >= 0 && label < count);
				int j;
				float p = 0;
				float* const ap = a->data.f32 + i * count;
				for (j = 0; j < label; j++)
					p += -trim0 * logf(ap[j]);
				p += -trim1 * logf(ap[label]);
				for (j = label + 1; j < count; j++)
					p += -trim0 * logf(ap[j]);
				c->data.f32[i] = p;
			} parallel_endfor
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
104
105
// Backward pass of categorical cross-entropy loss on CPU.
// Inputs: inputs[0] = g, incoming gradient w.r.t. the loss (may be NULL, meaning gradient 1);
//         inputs[1] = a, the forward predicted probabilities; inputs[2] = b, the labels
//         (index labels as CCV_32F/CCV_32S, or a full distribution as CCV_32F).
// Output: outputs[0] = h, gradient w.r.t. a: d(-sum b_j log a_j)/da_j = -g * b_j / a_j,
// with b_j taken as trim1 at the target class and trim0 elsewhere for index labels.
static int _ccv_nnc_categorical_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 3);
	assert(output_size >= 1);
	const ccv_nnc_tensor_t* g = inputs[0];
	assert(!g || !CCV_IS_TENSOR_VIEW(g));
	const ccv_nnc_tensor_t* a = inputs[1];
	assert(CCV_IS_TENSOR_CONTIGUOUS(a));
	const ccv_nnc_tensor_t* b = inputs[2];
	assert(CCV_IS_TENSOR_CONTIGUOUS(b));
	ccv_nnc_tensor_t* h = outputs[0];
	assert(CCV_IS_TENSOR_CONTIGUOUS(h));
	const int axis_count = ccv_nnc_tensor_nd(a->info.dim);
	// A 1-D input is a single sample; otherwise the leading axis is the batch.
	const int batch_size = axis_count < 2 ? 1 : a->info.dim[0];
	const int count = ccv_nnc_tensor_count(a->info) / batch_size;
	int i;
	if (g)
	{
		// Incoming gradient supplied: scale every output by g->data.f32[i].
		if (b->info.datatype == CCV_32F)
		{
			// If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
			// the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
			const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
			if (range == 1)
			{
				// Index labels (float): gradient is zero except at the target class.
				for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
					{ assert(a->info.dim[i] == h->info.dim[i]); }
				const float trim0 = cmd.info.label_smoothing.trim0;
				const float trim1 = cmd.info.label_smoothing.trim1;
				if (trim0 == 0 && trim1 == 1)
				{
					parallel_for(i, batch_size) {
						int j;
						const float gp = g->data.f32[i];
						const int label = (int)(b->data.f32[i] + 0.5); // round float label to nearest int
						float* const hp = h->data.f32 + i * count;
						for (j = 0; j < count; j++)
							hp[j] = 0;
						const float p = a->data.f32[i * count + label];
						hp[label] = -gp / p;
					} parallel_endfor
				} else {
					// Label smoothing: every class contributes -g * trim / a_j.
					parallel_for(i, batch_size) {
						int j;
						const float gp = g->data.f32[i];
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const hp = h->data.f32 + i * count;
						float* const ap = a->data.f32 + i * count;
						for (j = 0; j < label; j++)
							hp[j] = -gp * trim0 / ap[j];
						hp[label] = -gp * trim1 / ap[label];
						for (j = label + 1; j < count; j++)
							hp[j] = -gp * trim0 / ap[j];
					} parallel_endfor
				}
			} else {
				// Distribution labels: h_j = -g * b_j / a_j for every class.
				assert(range == count);
				parallel_for(i, batch_size) {
					int j;
					const float gp = g->data.f32[i];
					float* const hp = h->data.f32 + i * count;
					float* const ap = a->data.f32 + i * count;
					float* const bp = b->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] = -gp * bp[j] / ap[j];
				} parallel_endfor
			}
		} else if (b->info.datatype == CCV_32S) {
			// Integer index labels; same as the float index-label path without rounding.
			for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
				{ assert(a->info.dim[i] == h->info.dim[i]); }
			const float trim0 = cmd.info.label_smoothing.trim0;
			const float trim1 = cmd.info.label_smoothing.trim1;
			if (trim0 == 0 && trim1 == 1)
			{
				parallel_for(i, batch_size) {
					int j;
					const float gp = g->data.f32[i];
					const int label = b->data.i32[i];
					float* const hp = h->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] = 0;
					const float p = a->data.f32[i * count + label];
					hp[label] = -gp / p;
				} parallel_endfor
			} else {
				parallel_for(i, batch_size) {
					int j;
					const float gp = g->data.f32[i];
					const int label = b->data.i32[i];
					float* const hp = h->data.f32 + i * count;
					float* const ap = a->data.f32 + i * count;
					for (j = 0; j < label; j++)
						hp[j] = -gp * trim0 / ap[j];
					hp[label] = -gp * trim1 / ap[label];
					for (j = label + 1; j < count; j++)
						hp[j] = -gp * trim0 / ap[j];
				} parallel_endfor
			}
		}
	} else {
		// No incoming gradient: behave as if g == 1 everywhere (mirrors the branch above).
		if (b->info.datatype == CCV_32F)
		{
			// If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
			// the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
			const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? ccv_nnc_tensor_get_c(b->info) : (batch_size == 1 ? b->info.dim[0] : 1);
			if (range == 1)
			{
				for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
					{ assert(a->info.dim[i] == h->info.dim[i]); }
				const float trim0 = cmd.info.label_smoothing.trim0;
				const float trim1 = cmd.info.label_smoothing.trim1;
				if (trim0 == 0 && trim1 == 1)
				{
					parallel_for(i, batch_size) {
						int j;
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const hp = h->data.f32 + i * count;
						for (j = 0; j < count; j++)
							hp[j] = 0;
						const float p = a->data.f32[i * count + label];
						hp[label] = -1. / p;
					} parallel_endfor
				} else {
					parallel_for(i, batch_size) {
						int j;
						const int label = (int)(b->data.f32[i] + 0.5);
						float* const hp = h->data.f32 + i * count;
						float* const ap = a->data.f32 + i * count;
						for (j = 0; j < label; j++)
							hp[j] = -trim0 / ap[j];
						hp[label] = -trim1 / ap[label];
						for (j = label + 1; j < count; j++)
							hp[j] = -trim0 / ap[j];
					} parallel_endfor
				}
			} else {
				assert(range == count);
				parallel_for(i, batch_size) {
					int j;
					float* const hp = h->data.f32 + i * count;
					float* const ap = a->data.f32 + i * count;
					float* const bp = b->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] = -bp[j] / ap[j];
				} parallel_endfor
			}
		} else if (b->info.datatype == CCV_32S) {
			const float trim0 = cmd.info.label_smoothing.trim0;
			const float trim1 = cmd.info.label_smoothing.trim1;
			if (trim0 == 0 && trim1 == 1)
			{
				for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
					{ assert(a->info.dim[i] == h->info.dim[i]); }
				parallel_for(i, batch_size) {
					int j;
					const int label = b->data.i32[i];
					float* const hp = h->data.f32 + i * count;
					for (j = 0; j < count; j++)
						hp[j] = 0;
					const float p = a->data.f32[i * count + label];
					hp[label] = -1. / p;
				} parallel_endfor
			} else {
				for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++)
					{ assert(a->info.dim[i] == h->info.dim[i]); }
				parallel_for(i, batch_size) {
					int j;
					const int label = b->data.i32[i];
					float* const hp = h->data.f32 + i * count;
					float* const ap = a->data.f32 + i * count;
					for (j = 0; j < label; j++)
						hp[j] = -trim0 / ap[j];
					hp[label] = -trim1 / ap[label];
					for (j = label + 1; j < count; j++)
						hp[j] = -trim0 / ap[j];
				} parallel_endfor
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
286
287
REGISTER_COMMAND_BACKEND(CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// CPU reference implementation of the categorical cross-entropy forward pass.
	registry->exec = _ccv_nnc_categorical_crossentropy_forw;
	registry->algorithms = 1;
	// Accepts float probabilities plus either float or int32 labels, in NHWC or NCHW layout.
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F | CCV_32S;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
}
295
296
REGISTER_COMMAND_BACKEND(CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// CPU reference implementation of the categorical cross-entropy backward pass.
	registry->exec = _ccv_nnc_categorical_crossentropy_back;
	registry->algorithms = 1;
	// Accepts float probabilities plus either float or int32 labels, in NHWC or NCHW layout.
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F | CCV_32S;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
}