Coverage Report

Created: 2025-02-24 17:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/upsample/ccv_nnc_upsample_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Shared methods.
14
#include "../_ccv_nnc_cpu_ref.h"
15
16
/**
 * Nearest-neighbor upsampling, forward pass (CPU reference implementation).
 *
 * Reads inputs[0] (a) and writes outputs[0] (b). Both tensors must share the
 * same format; the batch and channel extents must match and only the two
 * spatial extents may grow (the source/destination ratio is asserted <= 1).
 * Element data is accessed through data.f32.
 *
 * @param cmd            Command; cmd.info.upsample.align_corners selects the index mapping.
 * @param hint           Unused.
 * @param flags          Unused.
 * @param inputs         inputs[0] is the source tensor (view).
 * @param input_size     Must be >= 1.
 * @param outputs        outputs[0] is the destination tensor (view).
 * @param output_size    Must be >= 1.
 * @param stream_context Used to obtain scratch workspace for the per-column index table.
 * @return CCV_NNC_EXEC_SUCCESS.
 */
static int _ccv_nnc_upsample_nearest_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size >= 1);
  assert(output_size >= 1);
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // Assuming this is float 32.
  int adim[CCV_NNC_MAX_DIM_ALLOC];
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, adim);
  ccv_nnc_tensor_view_get_dim(b, bdim);
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  int i[CCV_NNC_MAX_DIM + 2];
  int xd, yd, cd;
  const float* ap = a->data.f32;
  float* const bp = b->data.f32;
  assert(a->info.format == b->info.format);
  const int align_corners = cmd.info.upsample.align_corners;
  if (a->info.format == CCV_TENSOR_FORMAT_NCHW)
  {
    // Source-per-destination step along height (dim 2) and width (dim 3).
    const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    // Precompute, for every destination column, the source column to read from.
    int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[3]), CCV_TENSOR_CPU_MEMORY);
    for (xd = 0; xd < bdim[3]; xd++)
      xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[3] - 1);
    assert(adim[0] == bdim[0]);
    assert(adim[1] == bdim[1]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      const float* ap0 = ap + i[0] * astride[0];
      float* const bp0 = bp + i[0] * bstride[0];
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
      {
        // pysi0 is the source row ap1 currently points at.
        int pysi0 = 0;
        const float* ap1 = ap0;
        float* bp1 = bp0 + i[1] * bstride[1];
        for (yd = 0; yd < bdim[2]; yd++)
        {
          const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[2] - 1);
          if (pysi0 < ysi0) // Move to ay1 line.
          {
            ap1 += (ysi0 - pysi0) * astride[2];
            pysi0 = ysi0;
          }
          for (xd = 0; xd < bdim[3]; xd++)
            bp1[xd] = ap1[xcoeff[xd]];
          bp1 += bstride[2];
        }
        ap0 += astride[1];
      }
    }
  } else {
    // Any case, this is either NHWC or CHWN
    assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN);
    // Here height is dim 1 and width is dim 2; dim 3 is copied element by element.
    const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1];
    const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[2]), CCV_TENSOR_CPU_MEMORY);
    for (xd = 0; xd < bdim[2]; xd++)
      xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[2] - 1);
    assert(adim[0] == bdim[0]);
    assert(adim[3] == bdim[3]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      int pysi0 = 0;
      const float* ap0 = ap;
      float* const bp0 = bp + i[0] * bstride[0];
      for (yd = 0; yd < bdim[1]; yd++)
      {
        const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[1] - 1);
        if (pysi0 < ysi0) // Move to ay1 line.
        {
          ap0 += (ysi0 - pysi0) * astride[1];
          pysi0 = ysi0;
        }
        float* bp1 = bp0 + yd * bstride[1];
        for (xd = 0; xd < bdim[2]; xd++)
        {
          const float* const ap00 = ap0 + xcoeff[xd] * astride[2];
          for (cd = 0; cd < bdim[3]; cd++)
            bp1[cd] = ap00[cd];
          bp1 += bstride[2];
        }
      }
      // Advance the source base pointer to the next index along dim 0.
      ap += astride[0];
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
114
115
/**
 * Nearest-neighbor upsampling, backward pass (CPU reference implementation).
 *
 * Reads the incoming gradient inputs[0] (b, at the upsampled size) and
 * accumulates it into outputs[0] (a, at the original size): every element of b
 * is added to the element of a that the forward index mapping selects.
 * a is first set to 0 through _ccv_nnc_tensor_set_cpu_ref_f32(a, 0).
 *
 * @param cmd            Command; cmd.info.upsample.align_corners selects the index mapping.
 * @param hint           Unused.
 * @param flags          Unused.
 * @param inputs         inputs[0] is the gradient with respect to the upsampled tensor.
 * @param input_size     Must be >= 1.
 * @param outputs        outputs[0] receives the gradient with respect to the source tensor.
 * @param output_size    Must be >= 1.
 * @param stream_context Used to obtain scratch workspace for the per-column index table.
 * @return CCV_NNC_EXEC_SUCCESS.
 */
static int _ccv_nnc_upsample_nearest_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size >= 1);
  assert(output_size >= 1);
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // Assuming this is float 32.
  int adim[CCV_NNC_MAX_DIM_ALLOC];
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, adim);
  ccv_nnc_tensor_view_get_dim(b, bdim);
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  int i[CCV_NNC_MAX_DIM + 2];
  int xd, yd, cd;
  // Start from zero because the loops below only accumulate with +=.
  _ccv_nnc_tensor_set_cpu_ref_f32(a, 0);
  float* ap = a->data.f32;
  const float* bp = b->data.f32;
  const int align_corners = cmd.info.upsample.align_corners;
  assert(a->info.format == b->info.format);
  if (a->info.format == CCV_TENSOR_FORMAT_NCHW)
  {
    const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    // Same destination-column -> source-column table as the forward pass.
    int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[3]), CCV_TENSOR_CPU_MEMORY);
    for (xd = 0; xd < bdim[3]; xd++)
      xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[3] - 1);
    assert(adim[0] == bdim[0]);
    assert(adim[1] == bdim[1]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      float* ap0 = ap + i[0] * astride[0];
      const float* bp0 = bp + i[0] * bstride[0];
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
      {
        int pysi0 = 0;
        float* ap1 = ap0;
        const float* bp1 = bp0 + i[1] * bstride[1];
        for (yd = 0; yd < bdim[2]; yd++)
        {
          const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[2] - 1);
          if (pysi0 < ysi0) // Move to ay1 line.
          {
            ap1 += (ysi0 - pysi0) * astride[2];
            pysi0 = ysi0;
          }
          for (xd = 0; xd < bdim[3]; xd++)
            ap1[xcoeff[xd]] += bp1[xd];
          bp1 += bstride[2];
        }
        ap0 += astride[1];
      }
    }
  } else {
    // Any case, this is either NHWC or CHWN
    assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN);
    const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1];
    const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[2]), CCV_TENSOR_CPU_MEMORY);
    for (xd = 0; xd < bdim[2]; xd++)
      xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[2] - 1);
    assert(adim[0] == bdim[0]);
    assert(adim[3] == bdim[3]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      int pysi0 = 0;
      float* ap0 = ap;
      const float* const bp0 = bp + i[0] * bstride[0];
      for (yd = 0; yd < bdim[1]; yd++)
      {
        const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[1] - 1);
        if (pysi0 < ysi0) // Move to ay1 line.
        {
          ap0 += (ysi0 - pysi0) * astride[1];
          pysi0 = ysi0;
        }
        const float* bp1 = bp0 + yd * bstride[1];
        for (xd = 0; xd < bdim[2]; xd++)
        {
          float* const ap00 = ap0 + xcoeff[xd] * astride[2];
          for (cd = 0; cd < bdim[3]; cd++)
            ap00[cd] += bp1[cd];
          bp1 += bstride[2];
        }
      }
      // Advance the gradient-output base pointer to the next index along dim 0.
      ap += astride[0];
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
214
215
/**
 * Linear-interpolation taps for one destination coordinate along one axis.
 * si[0] and si[1] are the two source indices; sc[0] and sc[1] are the weights
 * applied to them (sc[0] + sc[1] == 1 up to rounding).
 */
typedef struct {
  int si[2];
  float sc[2];
} ccv_nnc_bi_coeffs_t;

/**
 * Fill coeff[0..sz-1] with the interpolation taps for an axis of source size
 * ss being resampled to destination size sz.
 *
 * @param ss            Source extent along the axis; si[1] is clamped to ss - 1.
 * @param sz            Destination extent; number of entries written to coeff.
 * @param s             Source step per destination step (source / destination ratio).
 * @param coeff         Output array with at least sz entries.
 * @param align_corners Non-zero: source position is i * s.
 *                      Zero: source position is (i + 0.5) * s - 0.5.
 *
 * When the source position is slightly negative (possible with
 * align_corners == 0), both indices truncate to 0 and the weights still sum
 * to 1, so the interpolated value is that of source element 0.
 */
static void _ccv_nnc_init_bi_coeffs(const int ss, const int sz, const float s, ccv_nnc_bi_coeffs_t* const coeff, const int align_corners)
{
  int i;
  for (i = 0; i < sz; i++)
  {
    // Fractional source position of destination index i.
    float xs;
    if (align_corners)
      xs = i * s;
    else
      xs = (i + 0.5) * s - 0.5;
    const int upper = (int)(xs + 1);
    coeff[i].si[0] = (int)xs;
    // Keep the second tap inside the source range.
    coeff[i].si[1] = upper < ss - 1 ? upper : ss - 1;
    coeff[i].sc[1] = xs - coeff[i].si[0];
    coeff[i].sc[0] = 1.0 - coeff[i].sc[1];
  }
}
244
245
/**
 * Bilinear upsampling, forward pass (CPU reference implementation).
 *
 * Reads inputs[0] (a) and writes outputs[0] (b). Each destination element is
 * the weighted sum of four source elements, selected by the per-row (ycoeff)
 * and per-column (xcoeff) tap tables built by _ccv_nnc_init_bi_coeffs.
 * Both tensors must share the same format; the two non-spatial extents must
 * match and the source/destination ratio is asserted <= 1.
 *
 * @param cmd            Command; cmd.info.upsample.align_corners selects the coordinate mapping.
 * @param hint           Unused.
 * @param flags          Unused.
 * @param inputs         inputs[0] is the source tensor (view).
 * @param input_size     Must be >= 1.
 * @param outputs        outputs[0] is the destination tensor (view).
 * @param output_size    Must be >= 1.
 * @param stream_context Used to obtain scratch workspace for the two tap tables.
 * @return CCV_NNC_EXEC_SUCCESS.
 */
static int _ccv_nnc_upsample_bilinear_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size >= 1);
  assert(output_size >= 1);
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // Assuming this is float 32.
  int adim[CCV_NNC_MAX_DIM_ALLOC];
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, adim);
  ccv_nnc_tensor_view_get_dim(b, bdim);
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  int i[CCV_NNC_MAX_DIM + 2];
  int xd, yd, cd;
  const float* ap = a->data.f32;
  float* bp = b->data.f32;
  assert(a->info.format == b->info.format);
  const int align_corners = cmd.info.upsample.align_corners;
  if (a->info.format == CCV_TENSOR_FORMAT_NCHW)
  {
    const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    // One workspace allocation holds the row taps followed by the column taps.
    ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[2] + bdim[3]), CCV_TENSOR_CPU_MEMORY);
    ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[2];
    _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rheight, ycoeff, align_corners);
    _ccv_nnc_init_bi_coeffs(adim[3], bdim[3], rwidth, xcoeff, align_corners);
    assert(adim[0] == bdim[0]);
    assert(adim[1] == bdim[1]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      const float* ap0 = ap + i[0] * astride[0];
      float* const bp0 = bp + i[0] * bstride[0];
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
      {
        // pysi0 is the source row ap1 currently points at.
        int pysi0 = 0;
        const float* ap1 = ap0;
        float* bp1 = bp0 + i[1] * bstride[1];
        for (yd = 0; yd < bdim[2]; yd++)
        {
          const int ysi0 = ycoeff[yd].si[0];
          // Row offset (0 or more rows) from the first tap to the second tap.
          const int ysi1 = ycoeff[yd].si[1] - ysi0;
          const float ysc0 = ycoeff[yd].sc[0];
          const float ysc1 = ycoeff[yd].sc[1];
          if (pysi0 < ysi0) // Move to ay1 line.
          {
            ap1 += (ysi0 - pysi0) * astride[2];
            pysi0 = ysi0;
          }
          for (xd = 0; xd < bdim[3]; xd++)
          {
            const ccv_nnc_bi_coeffs_t cof = xcoeff[xd];
            bp1[xd] = ap1[cof.si[0]] * cof.sc[0] * ysc0 + ap1[cof.si[1]] * cof.sc[1] * ysc0 +
              ap1[cof.si[0] + astride[2] * ysi1] * cof.sc[0] * ysc1 + ap1[cof.si[1] + astride[2] * ysi1] * cof.sc[1] * ysc1;
          }
          bp1 += bstride[2];
        }
        ap0 += astride[1];
      }
    }
  } else {
    // Any case, this is either NHWC or CHWN
    assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN);
    const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1];
    const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[1] + bdim[2]), CCV_TENSOR_CPU_MEMORY);
    ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[1];
    _ccv_nnc_init_bi_coeffs(adim[1], bdim[1], rheight, ycoeff, align_corners);
    _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rwidth, xcoeff, align_corners);
    assert(adim[0] == bdim[0]);
    assert(adim[3] == bdim[3]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      int pysi0 = 0;
      const float* ap0 = ap;
      float* const bp0 = bp + i[0] * bstride[0];
      for (yd = 0; yd < bdim[1]; yd++)
      {
        const int ysi0 = ycoeff[yd].si[0];
        const int ysi1 = ycoeff[yd].si[1] - ysi0;
        const float ysc0 = ycoeff[yd].sc[0];
        const float ysc1 = ycoeff[yd].sc[1];
        if (pysi0 < ysi0) // Move to ay1 line.
        {
          ap0 += (ysi0 - pysi0) * astride[1];
          pysi0 = ysi0;
        }
        float* bp1 = bp0 + yd * bstride[1];
        for (xd = 0; xd < bdim[2]; xd++)
        {
          const ccv_nnc_bi_coeffs_t cof = xcoeff[xd];
          // Combined weights for the four neighbouring source positions.
          const float c00 = cof.sc[0] * ysc0;
          const float c01 = cof.sc[1] * ysc0;
          const float c10 = cof.sc[0] * ysc1;
          const float c11 = cof.sc[1] * ysc1;
          const float* const ap00 = ap0 + cof.si[0] * astride[2];
          const float* const ap01 = ap0 + cof.si[1] * astride[2];
          const float* const ap10 = ap00 + ysi1 * astride[1];
          const float* const ap11 = ap01 + ysi1 * astride[1];
          for (cd = 0; cd < bdim[3]; cd++)
            bp1[cd] = ap00[cd] * c00 + ap01[cd] * c01 +
              ap10[cd] * c10 + ap11[cd] * c11;
          bp1 += bstride[2];
        }
      }
      // Advance the source base pointer to the next index along dim 0.
      ap += astride[0];
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
364
365
/**
 * Bilinear upsampling, backward pass (CPU reference implementation).
 *
 * Reads the incoming gradient inputs[0] (b, at the upsampled size) and
 * accumulates it into outputs[0] (a, at the original size): each element of b
 * is distributed to the four source positions used by the forward pass, scaled
 * by the same four weights. a is first set to 0 through
 * _ccv_nnc_tensor_set_cpu_ref_f32(a, 0).
 *
 * @param cmd            Command; cmd.info.upsample.align_corners selects the coordinate mapping.
 * @param hint           Unused.
 * @param flags          Unused.
 * @param inputs         inputs[0] is the gradient with respect to the upsampled tensor.
 * @param input_size     Must be >= 1.
 * @param outputs        outputs[0] receives the gradient with respect to the source tensor.
 * @param output_size    Must be >= 1.
 * @param stream_context Used to obtain scratch workspace for the two tap tables.
 * @return CCV_NNC_EXEC_SUCCESS.
 */
static int _ccv_nnc_upsample_bilinear_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size >= 1);
  assert(output_size >= 1);
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // Assuming this is float 32.
  int adim[CCV_NNC_MAX_DIM_ALLOC];
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, adim);
  ccv_nnc_tensor_view_get_dim(b, bdim);
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  int i[CCV_NNC_MAX_DIM + 2];
  int xd, yd, cd;
  // Start from zero because the loops below only accumulate with +=.
  _ccv_nnc_tensor_set_cpu_ref_f32(a, 0);
  float* ap = a->data.f32;
  const float* bp = b->data.f32;
  assert(a->info.format == b->info.format);
  const int align_corners = cmd.info.upsample.align_corners;
  if (a->info.format == CCV_TENSOR_FORMAT_NCHW)
  {
    const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    // One workspace allocation holds the row taps followed by the column taps.
    ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[2] + bdim[3]), CCV_TENSOR_CPU_MEMORY);
    ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[2];
    _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rheight, ycoeff, align_corners);
    _ccv_nnc_init_bi_coeffs(adim[3], bdim[3], rwidth, xcoeff, align_corners);
    assert(adim[0] == bdim[0]);
    assert(adim[1] == bdim[1]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      float* ap0 = ap + i[0] * astride[0];
      const float* const bp0 = bp + i[0] * bstride[0];
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
      {
        int pysi0 = 0;
        float* ap1 = ap0;
        const float* bp1 = bp0 + i[1] * bstride[1];
        for (yd = 0; yd < bdim[2]; yd++)
        {
          const int ysi0 = ycoeff[yd].si[0];
          // Row offset from the first tap to the second tap.
          const int ysi1 = ycoeff[yd].si[1] - ysi0;
          const float ysc0 = ycoeff[yd].sc[0];
          const float ysc1 = ycoeff[yd].sc[1];
          if (pysi0 < ysi0) // Move to ay1 line.
          {
            ap1 += (ysi0 - pysi0) * astride[2];
            pysi0 = ysi0;
          }
          for (xd = 0; xd < bdim[3]; xd++)
          {
            const ccv_nnc_bi_coeffs_t cof = xcoeff[xd];
            ap1[cof.si[0]] += bp1[xd] * ysc0 * cof.sc[0];
            ap1[cof.si[1]] += bp1[xd] * ysc0 * cof.sc[1];
            ap1[cof.si[0] + astride[2] * ysi1] += bp1[xd] * ysc1 * cof.sc[0];
            ap1[cof.si[1] + astride[2] * ysi1] += bp1[xd] * ysc1 * cof.sc[1];
          }
          bp1 += bstride[2];
        }
        ap0 += astride[1];
      }
    }
  } else {
    // Any case, this is either NHWC or CHWN
    assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN);
    const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1];
    const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2];
    assert(rheight <= 1);
    assert(rwidth <= 1);
    ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[1] + bdim[2]), CCV_TENSOR_CPU_MEMORY);
    ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[1];
    _ccv_nnc_init_bi_coeffs(adim[1], bdim[1], rheight, ycoeff, align_corners);
    _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rwidth, xcoeff, align_corners);
    assert(adim[0] == bdim[0]);
    assert(adim[3] == bdim[3]);
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      int pysi0 = 0;
      float* ap0 = ap;
      const float* const bp0 = bp + i[0] * bstride[0];
      for (yd = 0; yd < bdim[1]; yd++)
      {
        const int ysi0 = ycoeff[yd].si[0];
        const int ysi1 = ycoeff[yd].si[1] - ysi0;
        const float ysc0 = ycoeff[yd].sc[0];
        const float ysc1 = ycoeff[yd].sc[1];
        if (pysi0 < ysi0) // Move to ay1 line.
        {
          ap0 += (ysi0 - pysi0) * astride[1];
          pysi0 = ysi0;
        }
        const float* bp1 = bp0 + yd * bstride[1];
        for (xd = 0; xd < bdim[2]; xd++)
        {
          const ccv_nnc_bi_coeffs_t cof = xcoeff[xd];
          // Combined weights for the four neighbouring source positions.
          const float c00 = cof.sc[0] * ysc0;
          const float c01 = cof.sc[1] * ysc0;
          const float c10 = cof.sc[0] * ysc1;
          const float c11 = cof.sc[1] * ysc1;
          float* const ap00 = ap0 + cof.si[0] * astride[2];
          float* const ap01 = ap0 + cof.si[1] * astride[2];
          float* const ap10 = ap00 + ysi1 * astride[1];
          float* const ap11 = ap01 + ysi1 * astride[1];
          for (cd = 0; cd < bdim[3]; cd++)
          {
            ap00[cd] += bp1[cd] * c00;
            ap01[cd] += bp1[cd] * c01;
            ap10[cd] += bp1[cd] * c10;
            ap11[cd] += bp1[cd] * c11;
          }
          bp1 += bstride[2];
        }
      }
      // Advance the gradient-output base pointer to the next index along dim 0.
      ap += astride[0];
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
491
492
/**
 * Forward entry point: dispatch on cmd.info.upsample.type to the nearest or
 * bilinear forward implementation, passing all arguments through unchanged.
 *
 * @return The selected implementation's result, or CCV_NNC_EXEC_INVALID when
 *         the type is neither CCV_NNC_UPSAMPLE_NEAREST nor CCV_NNC_UPSAMPLE_BILINEAR.
 */
static int _ccv_nnc_upsample_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_NEAREST)
    return _ccv_nnc_upsample_nearest_forw(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
  else if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_BILINEAR)
    return _ccv_nnc_upsample_bilinear_forw(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
  // Unknown upsample type.
  return CCV_NNC_EXEC_INVALID;
}
500
501
/**
 * Backward entry point: dispatch on cmd.info.upsample.type to the nearest or
 * bilinear backward implementation, passing all arguments through unchanged.
 *
 * @return The selected implementation's result, or CCV_NNC_EXEC_INVALID when
 *         the type is neither CCV_NNC_UPSAMPLE_NEAREST nor CCV_NNC_UPSAMPLE_BILINEAR.
 */
static int _ccv_nnc_upsample_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_NEAREST)
    return _ccv_nnc_upsample_nearest_back(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
  else if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_BILINEAR)
    return _ccv_nnc_upsample_bilinear_back(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
  // Unknown upsample type.
  return CCV_NNC_EXEC_INVALID;
}
509
510
// Registry entry for the upsample forward command on the CPU reference backend:
// NHWC / NCHW / CHWN formats, CCV_32F data, CPU memory, one algorithm.
REGISTER_COMMAND_BACKEND(CCV_NNC_UPSAMPLE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->algorithms = 1;
  registry->exec = _ccv_nnc_upsample_forw;
}
518
519
// Registry entry for the upsample backward command on the CPU reference backend:
// NHWC / NCHW / CHWN formats, CCV_32F data, CPU memory, one algorithm.
REGISTER_COMMAND_BACKEND(CCV_NNC_UPSAMPLE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->algorithms = 1;
  registry->exec = _ccv_nnc_upsample_back;
}