Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/softmax_loss/ccv_nnc_softmax_crossentropy_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include <ccv.h>
2
#include <ccv_internal.h>
3
#include <nnc/ccv_nnc.h>
4
#include <nnc/ccv_nnc_easy.h>
5
#include <nnc/ccv_nnc_internal.h>
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
static int _ccv_nnc_softmax_crossentropy_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
14
308
{
15
308
  assert(input_size == 2);
16
308
  const ccv_nnc_tensor_t* a = inputs[0];
17
308
  assert(!CCV_IS_TENSOR_VIEW(a));
18
308
  const ccv_nnc_tensor_t* b = inputs[1];
19
308
  assert(!CCV_IS_TENSOR_VIEW(b));
20
308
  assert(output_size == 2);
21
308
  ccv_nnc_tensor_t* c = outputs[0];
22
308
  assert(!c || !CCV_IS_TENSOR_VIEW(c));
23
308
  ccv_nnc_tensor_t* d = outputs[1];
24
308
  assert(!CCV_IS_TENSOR_VIEW(d));
25
308
  const int axis_count = ccv_nnc_tensor_nd(a->info.dim);
26
308
const int batch_size = axis_count < 2 ? 
1 /* count: 0 */
: a->info.dim[0];
27
308
  const int count = ccv_nnc_tensor_count(a->info) / batch_size;
28
308
  int i;
29
308
  if (c)
30
307
  {
31
614
    for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && b->info.dim[i] > 0; 
i++ /* count: 307 */
)
32
307
      { assert(b->info.dim[i] == c->info.dim[i]); }
33
307
    if (b->info.datatype == CCV_32F)
34
305
    {
35
305
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
36
305
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
37
305
const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? 
ccv_nnc_tensor_get_c(b->info) /* count: 0 */
: (batch_size == 1 ?
b->info.dim[0] /* count: 301 */
:
1 /* count: 4 */
);
38
305
      if (range == 1)
39
305
      {
40
915
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; 
i++ /* count: 610 */
)
41
610
          { assert(a->info.dim[i] == d->info.dim[i]); }
42
305
        parallel_for(i, batch_size) {
43
0
          int j;
44
0
          float* const ap = a->data.f32 + i * count;
45
0
          float* const dp = d->data.f32 + i * count;
46
0
          double maxval = ap[0];
47
3.37k
          for (j = 1; j < count; j++)
48
3.37k
            if (ap[j] > maxval)
49
1.08k
              maxval = ap[j];
50
0
          const int label = (int)(b->data.f32[i] + 0.5);
51
0
          assert(label >= 0 && label < count);
52
334
          c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
53
334
          double sumval = 0;
54
4.29k
          for (j = 0; j < count; 
j++ /* count: 3.96k */
)
55
3.96k
            sumval += (dp[j] = expf(ap[j] - maxval));
56
334
          sumval = 1.0 / sumval;
57
4.93k
          for (j = 0; j < count; 
j++ /* count: 4.60k */
)
58
4.60k
            dp[j] *= sumval;
59
639
        } parallel_endfor
60
305
      } else {
61
0
        assert(range == count);
62
0
        parallel_for(i, batch_size) {
63
0
          int j;
64
0
          float* const ap = a->data.f32 + i * count;
65
0
          float* const bp = b->data.f32 + i * count;
66
0
          float* const dp = d->data.f32 + i * count;
67
0
          double maxval = ap[0];
68
0
          for (j = 1; j < count; j++)
69
0
            if (ap[j] > maxval)
70
0
              maxval = ap[j];
71
0
          float p = 0;
72
0
          for (j = 0; j < count; j++)
73
0
            p += bp[j] * (maxval - ap[j]);
74
0
          c->data.f32[i] = p; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
75
0
          double sumval = 0;
76
0
          for (j = 0; j < count; j++)
77
0
            sumval += (dp[j] = expf(ap[j] - maxval));
78
0
          sumval = 1.0 / sumval;
79
0
          for (j = 0; j < count; j++)
80
0
            dp[j] *= sumval;
81
0
        } parallel_endfor
82
0
      }
83
305
    } else 
if (
b->info.datatype == CCV_32S /* count: 2 */
) {
84
6
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; 
i++ /* count: 4 */
)
85
4
        { assert(a->info.dim[i] == d->info.dim[i]); }
86
2
      parallel_for(i, batch_size) {
87
0
        int j;
88
0
        float* const ap = a->data.f32 + i * count;
89
0
        float* const dp = d->data.f32 + i * count;
90
0
        double maxval = ap[0];
91
6
        for (j = 1; j < count; j++)
92
6
          if (ap[j] > maxval)
93
2
            maxval = ap[j];
94
0
        const int label = b->data.i32[i];
95
0
        assert(label >= 0 && label < count);
96
3
        c->data.f32[i] = maxval - ap[label]; // Assign the loss before we do expf so that we can avoid the logf later to preserve numeric accuracy.
97
3
        double sumval = 0;
98
11
        for (j = 0; j < count; 
j++ /* count: 8 */
)
99
8
          sumval += (dp[j] = expf(ap[j] - maxval));
100
3
        sumval = 1.0 / sumval;
101
12
        for (j = 0; j < count; 
j++ /* count: 9 */
)
102
9
          dp[j] *= sumval;
103
5
      } parallel_endfor
104
2
    }
105
307
  } else {
106
1
    // No loss calculation, just vanilla softmax.
107
1
    parallel_for(i, batch_size) {
108
0
      int j;
109
0
      float* const ap = a->data.f32 + i * count;
110
0
      float* const dp = d->data.f32 + i * count;
111
0
      double maxval = ap[0];
112
4
      for (j = 1; j < count; j++)
113
4
        if (ap[j] > maxval)
114
1
          maxval = ap[j];
115
0
      double sumval = 0;
116
6
      for (j = 0; j < count; j++)
117
6
        sumval += (dp[j] = expf(ap[j] - maxval));
118
0
      sumval = 1.0 / sumval;
119
6
      for (j = 0; j < count; j++)
120
6
        dp[j] *= sumval;
121
1
    } parallel_endfor
122
1
  }
123
308
  return CCV_NNC_EXEC_SUCCESS;
124
308
}
125
126
static int _ccv_nnc_softmax_crossentropy_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
127
304
{
128
304
  assert(input_size >= 6);
129
304
  assert(output_size >= 1);
130
304
  const ccv_nnc_tensor_t* g = inputs[0];
131
304
  assert(!g || !CCV_IS_TENSOR_VIEW(g));
132
304
  const ccv_nnc_tensor_t* b = inputs[3];
133
304
  assert(!CCV_IS_TENSOR_VIEW(b));
134
304
  const ccv_nnc_tensor_t* d = inputs[5];
135
304
  assert(!CCV_IS_TENSOR_VIEW(d));
136
304
  ccv_nnc_tensor_t* h = outputs[0];
137
304
  assert(!CCV_IS_TENSOR_VIEW(h));
138
304
  const int axis_count = ccv_nnc_tensor_nd(d->info.dim);
139
304
const int batch_size = axis_count < 2 ? 
1 /* count: 0 */
: d->info.dim[0];
140
304
  const int count = ccv_nnc_tensor_count(d->info) / batch_size;
141
304
  int i;
142
304
  if (g)
143
103
  {
144
206
    for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && b->info.dim[i] > 0; 
i++ /* count: 103 */
)
145
103
      { assert(b->info.dim[i] == g->info.dim[i]); }
146
103
    if (b->info.datatype == CCV_32F)
147
102
    {
148
102
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
149
102
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
150
102
const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? 
ccv_nnc_tensor_get_c(b->info) /* count: 0 */
: (batch_size == 1 ?
b->info.dim[0] /* count: 100 */
:
1 /* count: 2 */
);
151
102
      if (range == 1)
152
102
      {
153
306
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++ /* count: 204 */
)
154
204
          { assert(d->info.dim[i] == h->info.dim[i]); }
155
102
        parallel_for(i, batch_size) {
156
0
          int j;
157
0
          const float gp = g->data.f32[i];
158
0
          const int label = (int)(b->data.f32[i] + 0.5);
159
0
          float* const dp = d->data.f32 + i * count;
160
0
          float* const hp = h->data.f32 + i * count;
161
1.29k
          for (j = 0; j < count; j++)
162
1.29k
            hp[j] = gp * dp[j];
163
0
          hp[label] -= gp;
164
102
        } parallel_endfor
165
102
      } else {
166
0
        assert(range == count);
167
0
        parallel_for(i, batch_size) {
168
0
          int j;
169
0
          const float gp = g->data.f32[i];
170
0
          float* const dp = d->data.f32 + i * count;
171
0
          float* const hp = h->data.f32 + i * count;
172
0
          float* const bp = b->data.f32 + i * count;
173
0
          for (j = 0; j < count; j++)
174
0
            hp[j] = gp * (dp[j] - bp[j]);
175
0
        } parallel_endfor
176
0
      }
177
102
    } else 
if (
b->info.datatype == CCV_32S /* count: 1 */
) {
178
3
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++ /* count: 2 */
)
179
2
        { assert(d->info.dim[i] == h->info.dim[i]); }
180
1
      parallel_for(i, batch_size) {
181
0
        int j;
182
0
        const float gp = g->data.f32[i];
183
0
        const int label = b->data.i32[i];
184
0
        float* const dp = d->data.f32 + i * count;
185
0
        float* const hp = h->data.f32 + i * count;
186
3
        for (j = 0; j < count; j++)
187
3
          hp[j] = gp * dp[j];
188
0
        hp[label] -= gp;
189
1
      } parallel_endfor
190
1
    }
191
201
  } else {
192
201
    if (h->data.f32 != d->data.f32) // If not inplace replacement.
193
201
      memcpy(h->data.f32, d->data.f32, sizeof(float) * count * batch_size);
194
201
    if (b->info.datatype == CCV_32F)
195
200
    {
196
200
      // If has more than 1 axis, then the range is the channel count. Otherwise, if our batch size is 1, then the range is
197
200
      // the channel count. Otherwise, the range is 1 (and the only axis is the batch size).
198
200
const int range = ccv_nnc_tensor_nd(b->info.dim) > 1 ? 
ccv_nnc_tensor_get_c(b->info) /* count: 0 */
: (batch_size == 1 ? b->info.dim[0] :
1 /* count: 0 */
);
199
200
      if (range == 1)
200
200
      {
201
600
        for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++ /* count: 400 */
)
202
400
          { assert(d->info.dim[i] == h->info.dim[i]); }
203
200
        parallel_for(i, batch_size) {
204
0
          const int label = (int)(b->data.f32[i] + 0.5);
205
0
          float* const hp = h->data.f32 + i * count;
206
0
          hp[label] -= 1.;
207
200
        } parallel_endfor
208
200
      } else {
209
0
        assert(range == count);
210
0
        parallel_for(i, batch_size) {
211
0
          int j;
212
0
          float* const hp = h->data.f32 + i * count;
213
0
          float* const bp = b->data.f32 + i * count;
214
0
          for (j = 0; j < count; j++)
215
0
            hp[j] -= bp[j];
216
0
        } parallel_endfor
217
0
      }
218
200
    } else 
if (
b->info.datatype == CCV_32S /* count: 1 */
) {
219
3
      for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && d->info.dim[i] > 0; 
i++ /* count: 2 */
)
220
2
        { assert(d->info.dim[i] == h->info.dim[i]); }
221
1
      parallel_for(i, batch_size) {
222
0
        const int label = b->data.i32[i];
223
0
        float* const hp = h->data.f32 + i * count;
224
0
        hp[label] -= 1.;
225
1
      } parallel_endfor
226
1
    }
227
201
  }
228
304
  return CCV_NNC_EXEC_SUCCESS;
229
304
}
230
231
REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
232
1
{
233
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
234
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
235
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
236
1
  registry->algorithms = 1;
237
1
  registry->exec = _ccv_nnc_softmax_crossentropy_forw;
238
1
}
239
240
REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
241
1
{
242
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
243
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
244
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
245
1
  registry->algorithms = 1;
246
1
  registry->exec = _ccv_nnc_softmax_crossentropy_back;
247
1
}