/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/upsample/ccv_nnc_upsample_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | static int _ccv_nnc_upsample_nearest_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
17 | 5 | { |
18 | 5 | assert(input_size >= 1); |
19 | 5 | assert(output_size >= 1); |
20 | 5 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0]; |
21 | 5 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0]; |
22 | 5 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
23 | 5 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
24 | | // Assuming this is float 32. |
25 | 5 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
26 | 5 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
27 | 5 | ccv_nnc_tensor_view_get_dim(a, adim); |
28 | 5 | ccv_nnc_tensor_view_get_dim(b, bdim); |
29 | 5 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
30 | 5 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
31 | 5 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM != 2. |
32 | 5 | ccv_nnc_tensor_view_get_stride(a, astride); |
33 | 5 | ccv_nnc_tensor_view_get_stride(b, bstride); |
34 | 5 | int i[CCV_NNC_MAX_DIM + 2]; |
35 | 5 | int xd, yd, cd; |
36 | 5 | const float* ap = a->data.f32; |
37 | 5 | float* const bp = b->data.f32; |
38 | 5 | assert(a->info.format == b->info.format); |
39 | 5 | const int align_corners = cmd.info.upsample.align_corners; |
40 | 5 | if (a->info.format == CCV_TENSOR_FORMAT_NCHW) |
41 | 2 | { |
42 | 2 | const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
43 | 2 | const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3]; |
44 | 2 | assert(rheight <= 1); |
45 | 2 | assert(rwidth <= 1); |
46 | 2 | int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[3]), CCV_TENSOR_CPU_MEMORY); |
47 | 16 | for (xd = 0; xd < bdim[3]; xd++) |
48 | 14 | xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[3] - 1); |
49 | 2 | assert(adim[0] == bdim[0]); |
50 | 2 | assert(adim[1] == bdim[1]); |
51 | 4 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
52 | 2 | { |
53 | 2 | const float* ap0 = ap + i[0] * astride[0]; |
54 | 2 | float* const bp0 = bp + i[0] * bstride[0]; |
55 | 19 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
56 | 17 | { |
57 | 17 | int pysi0 = 0; |
58 | 17 | const float* ap1 = ap0; |
59 | 17 | float* bp1 = bp0 + i[1] * bstride[1]; |
60 | 475 | for (yd = 0; yd < bdim[2]; yd++) |
61 | 458 | { |
62 | 458 | const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[2] - 1); |
63 | 458 | if (pysi0 < ysi0) // Advance ap1 to source row ysi0. |
64 | 212 | { |
65 | 212 | ap1 += (ysi0 - pysi0) * astride[2]; |
66 | 212 | pysi0 = ysi0; |
67 | 212 | } |
68 | 4.99k | for (xd = 0; xd < bdim[3]; xd++) |
69 | 4.53k | bp1[xd] = ap1[xcoeff[xd]]; |
70 | 458 | bp1 += bstride[2]; |
71 | 458 | } |
72 | 17 | ap0 += astride[1]; |
73 | 17 | } |
74 | 2 | } |
75 | 3 | } else { |
76 | | // In any case, this is either NHWC or CHWN. |
77 | 3 | assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN); |
78 | 3 | const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1]; |
79 | 3 | const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
80 | 3 | assert(rheight <= 1); |
81 | 3 | assert(rwidth <= 1); |
82 | 3 | int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[2]), CCV_TENSOR_CPU_MEMORY); |
83 | 43 | for (xd = 0; xd < bdim[2]; xd++) |
84 | 40 | xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[2] - 1); |
85 | 3 | assert(adim[0] == bdim[0]); |
86 | 3 | assert(adim[3] == bdim[3]); |
87 | 6 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
88 | 3 | { |
89 | 3 | int pysi0 = 0; |
90 | 3 | const float* ap0 = ap; |
91 | 3 | float* const bp0 = bp + i[0] * bstride[0]; |
92 | 43 | for (yd = 0; yd < bdim[1]; yd++) |
93 | 40 | { |
94 | 40 | const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[1] - 1); |
95 | 40 | if (pysi0 < ysi0) // Advance ap0 to source row ysi0. |
96 | 17 | { |
97 | 17 | ap0 += (ysi0 - pysi0) * astride[1]; |
98 | 17 | pysi0 = ysi0; |
99 | 17 | } |
100 | 40 | float* bp1 = bp0 + yd * bstride[1]; |
101 | 992 | for (xd = 0; xd < bdim[2]; xd++) |
102 | 952 | { |
103 | 952 | const float* const ap00 = ap0 + xcoeff[xd] * astride[2]; |
104 | 5.52k | for (cd = 0; cd < bdim[3]; cd++) |
105 | 4.56k | bp1[cd] = ap00[cd]; |
106 | 952 | bp1 += bstride[2]; |
107 | 952 | } |
108 | 40 | } |
109 | 3 | ap += astride[0]; |
110 | 3 | } |
111 | 3 | } |
112 | 5 | return CCV_NNC_EXEC_SUCCESS; |
113 | 5 | } |
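
The nearest kernel above maps each output coordinate to a clamped source coordinate: xd * rwidth + 0.5 with align_corners, the half-pixel form (xd + 0.5) * rwidth without. A minimal standalone sketch of that mapping, assuming the same convention (nearest_src_index is a hypothetical helper written for illustration, not part of the library):

#include <stdio.h>

/* Sketch of the index mapping used by _ccv_nnc_upsample_nearest_forw. */
static int nearest_src_index(const int xd, const float ratio, const int src_size, const int align_corners)
{
	/* align_corners: scale then round; otherwise sample at half-pixel centers. */
	const int x = align_corners ? (int)(xd * ratio + 0.5) : (int)((xd + 0.5) * ratio);
	return x < src_size - 1 ? x : src_size - 1; /* clamp, mirroring ccv_min(..., adim - 1) */
}

int main(void)
{
	int xd;
	/* 4 -> 8 upsample, align_corners off: ratio = adim / bdim = 0.5. */
	for (xd = 0; xd < 8; xd++)
		printf("b[%d] <- a[%d]\n", xd, nearest_src_index(xd, 4.0f / 8.0f, 4, 0));
	/* Each source column is replicated twice: 0,0,1,1,2,2,3,3. */
	return 0;
}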
114 | | |
115 | | static int _ccv_nnc_upsample_nearest_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
116 | 7 | { |
117 | 7 | assert(input_size >= 1); |
118 | 7 | assert(output_size >= 1); |
119 | 7 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[0]; |
120 | 7 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
121 | 7 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
122 | 7 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
123 | | // Assuming this is float 32. |
124 | 7 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
125 | 7 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
126 | 7 | ccv_nnc_tensor_view_get_dim(a, adim); |
127 | 7 | ccv_nnc_tensor_view_get_dim(b, bdim); |
128 | 7 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
129 | 7 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
130 | 7 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM != 2. |
131 | 7 | ccv_nnc_tensor_view_get_stride(a, astride); |
132 | 7 | ccv_nnc_tensor_view_get_stride(b, bstride); |
133 | 7 | int i[CCV_NNC_MAX_DIM + 2]; |
134 | 7 | int xd, yd, cd; |
135 | 7 | _ccv_nnc_tensor_set_cpu_ref_f32(a, 0); |
136 | 7 | float* ap = a->data.f32; |
137 | 7 | const float* bp = b->data.f32; |
138 | 7 | const int align_corners = cmd.info.upsample.align_corners; |
139 | 7 | assert(a->info.format == b->info.format); |
140 | 7 | if (a->info.format == CCV_TENSOR_FORMAT_NCHW) |
141 | 4 | { |
142 | 4 | const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
143 | 4 | const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3]; |
144 | 4 | assert(rheight <= 1); |
145 | 4 | assert(rwidth <= 1); |
146 | 4 | int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[3]), CCV_TENSOR_CPU_MEMORY); |
147 | 26 | for (xd = 0; xd < bdim[3]; xd++) |
148 | 22 | xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[3] - 1); |
149 | 4 | assert(adim[0] == bdim[0]); |
150 | 4 | assert(adim[1] == bdim[1]); |
151 | 8 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
152 | 4 | { |
153 | 4 | float* ap0 = ap + i[0] * astride[0]; |
154 | 4 | const float* bp0 = bp + i[0] * bstride[0]; |
155 | 51 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
156 | 47 | { |
157 | 47 | int pysi0 = 0; |
158 | 47 | float* ap1 = ap0; |
159 | 47 | const float* bp1 = bp0 + i[1] * bstride[1]; |
160 | 685 | for (yd = 0; yd < bdim[2]; yd++) |
161 | 638 | { |
162 | 638 | const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[2] - 1); |
163 | 638 | if (pysi0 < ysi0) // Advance ap1 to source row ysi0. |
164 | 272 | { |
165 | 272 | ap1 += (ysi0 - pysi0) * astride[2]; |
166 | 272 | pysi0 = ysi0; |
167 | 272 | } |
168 | 4.45k | for (xd = 0; xd < bdim[3]; xd++) |
169 | 3.81k | ap1[xcoeff[xd]] += bp1[xd]; |
170 | 638 | bp1 += bstride[2]; |
171 | 638 | } |
172 | 47 | ap0 += astride[1]; |
173 | 47 | } |
174 | 4 | } |
175 | 4 | } else { |
176 | | // In any case, this is either NHWC or CHWN. |
177 | 3 | assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN); |
178 | 3 | const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1]; |
179 | 3 | const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
180 | 3 | assert(rheight <= 1); |
181 | 3 | assert(rwidth <= 1); |
182 | 3 | int* const xcoeff = (int*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(int) * (bdim[2]), CCV_TENSOR_CPU_MEMORY); |
183 | 35 | for (xd = 0; xd < bdim[2]; xd++) |
184 | 32 | xcoeff[xd] = ccv_min(align_corners ? (int)(xd * rwidth + 0.5) : (int)((xd + 0.5) * rwidth), adim[2] - 1); |
185 | 3 | assert(adim[0] == bdim[0]); |
186 | 3 | assert(adim[3] == bdim[3]); |
187 | 6 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
188 | 3 | { |
189 | 3 | int pysi0 = 0; |
190 | 3 | float* ap0 = ap; |
191 | 3 | const float* const bp0 = bp + i[0] * bstride[0]; |
192 | 35 | for (yd = 0; yd < bdim[1]; yd++) |
193 | 32 | { |
194 | 32 | const int ysi0 = ccv_min(align_corners ? (int)(yd * rheight + 0.5) : (int)((yd + 0.5) * rheight), adim[1] - 1); |
195 | 32 | if (pysi0 < ysi0) // Advance ap0 to source row ysi0. |
196 | 13 | { |
197 | 13 | ap0 += (ysi0 - pysi0) * astride[1]; |
198 | 13 | pysi0 = ysi0; |
199 | 13 | } |
200 | 32 | const float* bp1 = bp0 + yd * bstride[1]; |
201 | 440 | for (xd = 0; xd < bdim[2]; xd++) |
202 | 408 | { |
203 | 408 | float* const ap00 = ap0 + xcoeff[xd] * astride[2]; |
204 | 2.40k | for (cd = 0; cd < bdim[3]; cd++) |
205 | 1.99k | ap00[cd] += bp1[cd]; |
206 | 408 | bp1 += bstride[2]; |
207 | 408 | } |
208 | 32 | } |
209 | 3 | ap += astride[0]; |
210 | 3 | } |
211 | 3 | } |
212 | 7 | return CCV_NNC_EXEC_SUCCESS; |
213 | 7 | } |
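
The backward pass is the transpose of the forward gather: the output gradient tensor is zeroed first (_ccv_nnc_tensor_set_cpu_ref_f32(a, 0)), then each incoming gradient is scatter-added into the source cell that produced it, reusing the same xcoeff table. A 1-D sketch of that pattern under the half-pixel mapping (toy data, illustrative names only):

#include <stdio.h>

int main(void)
{
	const float grad_b[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; /* incoming gradient */
	float grad_a[4] = { 0, 0, 0, 0 }; /* zeroed first, as in the kernel */
	const float ratio = 4.0f / 8.0f;
	int xd;
	for (xd = 0; xd < 8; xd++)
	{
		int x = (int)((xd + 0.5f) * ratio); /* half-pixel source index */
		if (x > 3)
			x = 3; /* clamp */
		grad_a[x] += grad_b[xd]; /* each source cell accumulates its replicas */
	}
	for (xd = 0; xd < 4; xd++)
		printf("grad_a[%d] = %g\n", xd, grad_a[xd]); /* 2 for every cell */
	return 0;
}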
214 | | |
215 | | typedef struct { |
216 | | int si[2]; |
217 | | float sc[2]; |
218 | | } ccv_nnc_bi_coeffs_t; |
219 | | |
220 | | static void _ccv_nnc_init_bi_coeffs(const int ss, const int sz, const float s, ccv_nnc_bi_coeffs_t* const coeff, const int align_corners) |
221 | 14 | { |
222 | 14 | int i; |
223 | 14 | if (align_corners) |
224 | 2 | { |
225 | 14 | for (i = 0; i < sz; i++) |
226 | 12 | { |
227 | 12 | const float xs = i * s; |
228 | 12 | coeff[i].si[0] = (int)xs; |
229 | 12 | coeff[i].si[1] = ccv_min((int)(xs + 1), ss - 1); |
230 | 12 | coeff[i].sc[1] = xs - coeff[i].si[0]; |
231 | 12 | coeff[i].sc[0] = 1.0 - coeff[i].sc[1]; |
232 | 12 | } |
233 | 12 | } else { |
234 | 6.06k | for (i = 0; i < sz; i++) |
235 | 6.04k | { |
236 | 6.04k | const float xs = (i + 0.5) * s - 0.5; |
237 | 6.04k | coeff[i].si[0] = (int)xs; |
238 | 6.04k | coeff[i].si[1] = ccv_min((int)(xs + 1), ss - 1); |
239 | 6.04k | coeff[i].sc[1] = xs - coeff[i].si[0]; |
240 | 6.04k | coeff[i].sc[0] = 1.0 - coeff[i].sc[1]; |
241 | 6.04k | } |
242 | 12 | } |
243 | 14 | } |
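
_ccv_nnc_init_bi_coeffs precomputes, for each output coordinate, two source taps si[0]/si[1] and blend weights with sc[0] + sc[1] == 1: align_corners samples at i * s, otherwise at the half-pixel position (i + 0.5) * s - 0.5. A standalone sketch of the half-pixel case for a 4 -> 8 upsample (struct and logic transcribed for illustration); note that at the borders both taps coincide, so the blend still reproduces the edge value:

#include <stdio.h>

typedef struct {
	int si[2];
	float sc[2];
} bi_coeffs_t;

int main(void)
{
	const int ss = 4, sz = 8; /* source and destination sizes */
	const float s = (float)ss / sz; /* rwidth = 0.5 */
	bi_coeffs_t coeff[8];
	int i;
	for (i = 0; i < sz; i++)
	{
		const float xs = (i + 0.5f) * s - 0.5f; /* half-pixel source position */
		coeff[i].si[0] = (int)xs;
		coeff[i].si[1] = (int)(xs + 1) < ss - 1 ? (int)(xs + 1) : ss - 1; /* ccv_min */
		coeff[i].sc[1] = xs - coeff[i].si[0];
		coeff[i].sc[0] = 1.0f - coeff[i].sc[1];
		printf("i=%d taps=(%d,%d) weights=(%.2f,%.2f)\n", i, coeff[i].si[0], coeff[i].si[1], coeff[i].sc[0], coeff[i].sc[1]);
	}
	return 0;
}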
244 | | |
245 | | static int _ccv_nnc_upsample_bilinear_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
246 | 3 | { |
247 | 3 | assert(input_size >= 1); |
248 | 3 | assert(output_size >= 1); |
249 | 3 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0]; |
250 | 3 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0]; |
251 | 3 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
252 | 3 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
253 | | // Assuming this is float 32. |
254 | 3 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
255 | 3 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
256 | 3 | ccv_nnc_tensor_view_get_dim(a, adim); |
257 | 3 | ccv_nnc_tensor_view_get_dim(b, bdim); |
258 | 3 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
259 | 3 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
260 | 3 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM != 2. |
261 | 3 | ccv_nnc_tensor_view_get_stride(a, astride); |
262 | 3 | ccv_nnc_tensor_view_get_stride(b, bstride); |
263 | 3 | int i[CCV_NNC_MAX_DIM + 2]; |
264 | 3 | int xd, yd, cd; |
265 | 3 | const float* ap = a->data.f32; |
266 | 3 | float* bp = b->data.f32; |
267 | 3 | assert(a->info.format == b->info.format); |
268 | 3 | const int align_corners = cmd.info.upsample.align_corners; |
269 | 3 | if (a->info.format == CCV_TENSOR_FORMAT_NCHW) |
270 | 1 | { |
271 | 1 | const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
272 | 1 | const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3]; |
273 | 1 | assert(rheight <= 1); |
274 | 1 | assert(rwidth <= 1); |
275 | 1 | ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[2] + bdim[3]), CCV_TENSOR_CPU_MEMORY); |
276 | 1 | ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[2]; |
277 | 1 | _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rheight, ycoeff, align_corners); |
278 | 1 | _ccv_nnc_init_bi_coeffs(adim[3], bdim[3], rwidth, xcoeff, align_corners); |
279 | 1 | assert(adim[0] == bdim[0]); |
280 | 1 | assert(adim[1] == bdim[1]); |
281 | 2 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
282 | 1 | { |
283 | 1 | const float* ap0 = ap + i[0] * astride[0]; |
284 | 1 | float* const bp0 = bp + i[0] * bstride[0]; |
285 | 4 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
286 | 3 | { |
287 | 3 | int pysi0 = 0; |
288 | 3 | const float* ap1 = ap0; |
289 | 3 | float* bp1 = bp0 + i[1] * bstride[1]; |
290 | 3.00k | for (yd = 0; yd < bdim[2]; yd++) |
291 | 3.00k | { |
292 | 3.00k | const int ysi0 = ycoeff[yd].si[0]; |
293 | 3.00k | const int ysi1 = ycoeff[yd].si[1] - ysi0; |
294 | 3.00k | const float ysc0 = ycoeff[yd].sc[0]; |
295 | 3.00k | const float ysc1 = ycoeff[yd].sc[1]; |
296 | 3.00k | if (pysi0 < ysi0) // Advance ap1 to source row ysi0. |
297 | 1.49k | { |
298 | 1.49k | ap1 += (ysi0 - pysi0) * astride[2]; |
299 | 1.49k | pysi0 = ysi0; |
300 | 1.49k | } |
301 | 3.00M | for (xd = 0; xd < bdim[3]; xd++) |
302 | 3.00M | { |
303 | 3.00M | const ccv_nnc_bi_coeffs_t cof = xcoeff[xd]; |
304 | 3.00M | bp1[xd] = ap1[cof.si[0]] * cof.sc[0] * ysc0 + ap1[cof.si[1]] * cof.sc[1] * ysc0 + |
305 | 3.00M | ap1[cof.si[0] + astride[2] * ysi1] * cof.sc[0] * ysc1 + ap1[cof.si[1] + astride[2] * ysi1] * cof.sc[1] * ysc1; |
306 | 3.00M | } |
307 | 3.00k | bp1 += bstride[2]; |
308 | 3.00k | } |
309 | 3 | ap0 += astride[1]; |
310 | 3 | } |
311 | 1 | } |
312 | 2 | } else { |
313 | | // In any case, this is either NHWC or CHWN. |
314 | 2 | assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN); |
315 | 2 | const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1]; |
316 | 2 | const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
317 | 2 | assert(rheight <= 1); |
318 | 2 | assert(rwidth <= 1); |
319 | 2 | ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[1] + bdim[2]), CCV_TENSOR_CPU_MEMORY); |
320 | 2 | ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[1]; |
321 | 2 | _ccv_nnc_init_bi_coeffs(adim[1], bdim[1], rheight, ycoeff, align_corners); |
322 | 2 | _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rwidth, xcoeff, align_corners); |
323 | 2 | assert(adim[0] == bdim[0]); |
324 | 2 | assert(adim[3] == bdim[3]); |
325 | 4 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
326 | 2 | { |
327 | 2 | int pysi0 = 0; |
328 | 2 | const float* ap0 = ap; |
329 | 2 | float* const bp0 = bp + i[0] * bstride[0]; |
330 | 1.00k | for (yd = 0; yd < bdim[1]; yd++) |
331 | 1.00k | { |
332 | 1.00k | const int ysi0 = ycoeff[yd].si[0]; |
333 | 1.00k | const int ysi1 = ycoeff[yd].si[1] - ysi0; |
334 | 1.00k | const float ysc0 = ycoeff[yd].sc[0]; |
335 | 1.00k | const float ysc1 = ycoeff[yd].sc[1]; |
336 | 1.00k | if (pysi0 < ysi0) // Move to ay1 line. |
337 | 501 | { |
338 | 501 | ap0 += (ysi0 - pysi0) * astride[1]; |
339 | 501 | pysi0 = ysi0; |
340 | 501 | } |
341 | 1.00k | float* bp1 = bp0 + yd * bstride[1]; |
342 | 1.00M | for (xd = 0; xd < bdim[2]; xd++) |
343 | 1.00M | { |
344 | 1.00M | const ccv_nnc_bi_coeffs_t cof = xcoeff[xd]; |
345 | 1.00M | const float c00 = cof.sc[0] * ysc0; |
346 | 1.00M | const float c01 = cof.sc[1] * ysc0; |
347 | 1.00M | const float c10 = cof.sc[0] * ysc1; |
348 | 1.00M | const float c11 = cof.sc[1] * ysc1; |
349 | 1.00M | const float* const ap00 = ap0 + cof.si[0] * astride[2]; |
350 | 1.00M | const float* const ap01 = ap0 + cof.si[1] * astride[2]; |
351 | 1.00M | const float* const ap10 = ap00 + ysi1 * astride[1]; |
352 | 1.00M | const float* const ap11 = ap01 + ysi1 * astride[1]; |
353 | 4.00M | for (cd = 0; cd < bdim[3]; cd++) |
354 | 3.00M | bp1[cd] = ap00[cd] * c00 + ap01[cd] * c01 + |
355 | 3.00M | ap10[cd] * c10 + ap11[cd] * c11; |
356 | 1.00M | bp1 += bstride[2]; |
357 | 1.00M | } |
358 | 1.00k | } |
359 | 2 | ap += astride[0]; |
360 | 2 | } |
361 | 2 | } |
362 | 3 | return CCV_NNC_EXEC_SUCCESS; |
363 | 3 | } |
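
Each output pixel above blends four source taps: b = a[y0][x0]*xsc0*ysc0 + a[y0][x1]*xsc1*ysc0 + a[y1][x0]*xsc0*ysc1 + a[y1][x1]*xsc1*ysc1, with the x weights taken from xcoeff and the y weights from ycoeff. A worked one-pixel sketch with illustrative weights:

#include <stdio.h>

int main(void)
{
	const float a[2][2] = { { 0.0f, 1.0f }, { 2.0f, 3.0f } }; /* toy 2x2 source */
	const float ysc0 = 0.75f, ysc1 = 0.25f; /* row weights, sum to 1 */
	const float xsc0 = 0.25f, xsc1 = 0.75f; /* column weights, sum to 1 */
	const float b = a[0][0] * xsc0 * ysc0 + a[0][1] * xsc1 * ysc0
		+ a[1][0] * xsc0 * ysc1 + a[1][1] * xsc1 * ysc1;
	printf("b = %g\n", b); /* 0*0.1875 + 1*0.5625 + 2*0.0625 + 3*0.1875 = 1.25 */
	return 0;
}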
364 | | |
365 | | static int _ccv_nnc_upsample_bilinear_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
366 | 4 | { |
367 | 4 | assert(input_size >= 1); |
368 | 4 | assert(output_size >= 1); |
369 | 4 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[0]; |
370 | 4 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
371 | 4 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
372 | 4 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
373 | | // Assuming this is float 32. |
374 | 4 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
375 | 4 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
376 | 4 | ccv_nnc_tensor_view_get_dim(a, adim); |
377 | 4 | ccv_nnc_tensor_view_get_dim(b, bdim); |
378 | 4 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
379 | 4 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
380 | 4 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM != 2. |
381 | 4 | ccv_nnc_tensor_view_get_stride(a, astride); |
382 | 4 | ccv_nnc_tensor_view_get_stride(b, bstride); |
383 | 4 | int i[CCV_NNC_MAX_DIM + 2]; |
384 | 4 | int xd, yd, cd; |
385 | 4 | _ccv_nnc_tensor_set_cpu_ref_f32(a, 0); |
386 | 4 | float* ap = a->data.f32; |
387 | 4 | const float* bp = b->data.f32; |
388 | 4 | assert(a->info.format == b->info.format); |
389 | 4 | const int align_corners = cmd.info.upsample.align_corners; |
390 | 4 | if (a->info.format == CCV_TENSOR_FORMAT_NCHW) |
391 | 2 | { |
392 | 2 | const float rheight = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
393 | 2 | const float rwidth = align_corners ? (float)(adim[3] - 1) / ccv_max(1, bdim[3] - 1) : (float)adim[3] / bdim[3]; |
394 | 2 | assert(rheight <= 1); |
395 | 2 | assert(rwidth <= 1); |
396 | 2 | ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[2] + bdim[3]), CCV_TENSOR_CPU_MEMORY); |
397 | 2 | ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[2]; |
398 | 2 | _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rheight, ycoeff, align_corners); |
399 | 2 | _ccv_nnc_init_bi_coeffs(adim[3], bdim[3], rwidth, xcoeff, align_corners); |
400 | 2 | assert(adim[0] == bdim[0]); |
401 | 2 | assert(adim[1] == bdim[1]); |
402 | 4 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
403 | 2 | { |
404 | 2 | float* ap0 = ap + i[0] * astride[0]; |
405 | 2 | const float* const bp0 = bp + i[0] * bstride[0]; |
406 | 20 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
407 | 18 | { |
408 | 18 | int pysi0 = 0; |
409 | 18 | float* ap1 = ap0; |
410 | 18 | const float* bp1 = bp0 + i[1] * bstride[1]; |
411 | 1.72k | for (yd = 0; yd < bdim[2]; yd++) |
412 | 1.71k | { |
413 | 1.71k | const int ysi0 = ycoeff[yd].si[0]; |
414 | 1.71k | const int ysi1 = ycoeff[yd].si[1] - ysi0; |
415 | 1.71k | const float ysc0 = ycoeff[yd].sc[0]; |
416 | 1.71k | const float ysc1 = ycoeff[yd].sc[1]; |
417 | 1.71k | if (pysi0 < ysi0) // Advance ap1 to source row ysi0. |
418 | 837 | { |
419 | 837 | ap1 += (ysi0 - pysi0) * astride[2]; |
420 | 837 | pysi0 = ysi0; |
421 | 837 | } |
422 | 752k | for (xd = 0; xd < bdim[3]; xd++) |
423 | 751k | { |
424 | 751k | const ccv_nnc_bi_coeffs_t cof = xcoeff[xd]; |
425 | 751k | ap1[cof.si[0]] += bp1[xd] * ysc0 * cof.sc[0]; |
426 | 751k | ap1[cof.si[1]] += bp1[xd] * ysc0 * cof.sc[1]; |
427 | 751k | ap1[cof.si[0] + astride[2] * ysi1] += bp1[xd] * ysc1 * cof.sc[0]; |
428 | 751k | ap1[cof.si[1] + astride[2] * ysi1] += bp1[xd] * ysc1 * cof.sc[1]; |
429 | 751k | } |
430 | 1.71k | bp1 += bstride[2]; |
431 | 1.71k | } |
432 | 18 | ap0 += astride[1]; |
433 | 18 | } |
434 | 2 | } |
435 | 2 | } else { |
436 | | // In any case, this is either NHWC or CHWN. |
437 | 2 | assert(a->info.format == CCV_TENSOR_FORMAT_NHWC || a->info.format == CCV_TENSOR_FORMAT_CHWN); |
438 | 2 | const float rheight = align_corners ? (float)(adim[1] - 1) / ccv_max(1, bdim[1] - 1) : (float)adim[1] / bdim[1]; |
439 | 2 | const float rwidth = align_corners ? (float)(adim[2] - 1) / ccv_max(1, bdim[2] - 1) : (float)adim[2] / bdim[2]; |
440 | 2 | assert(rheight <= 1); |
441 | 2 | assert(rwidth <= 1); |
442 | 2 | ccv_nnc_bi_coeffs_t* const ycoeff = (ccv_nnc_bi_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(ccv_nnc_bi_coeffs_t) * (bdim[1] + bdim[2]), CCV_TENSOR_CPU_MEMORY); |
443 | 2 | ccv_nnc_bi_coeffs_t* const xcoeff = ycoeff + bdim[1]; |
444 | 2 | _ccv_nnc_init_bi_coeffs(adim[1], bdim[1], rheight, ycoeff, align_corners); |
445 | 2 | _ccv_nnc_init_bi_coeffs(adim[2], bdim[2], rwidth, xcoeff, align_corners); |
446 | 2 | assert(adim[0] == bdim[0]); |
447 | 2 | assert(adim[3] == bdim[3]); |
448 | 4 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
449 | 2 | { |
450 | 2 | int pysi0 = 0; |
451 | 2 | float* ap0 = ap; |
452 | 2 | const float* const bp0 = bp + i[0] * bstride[0]; |
453 | 516 | for (yd = 0; yd < bdim[1]; yd++) |
454 | 514 | { |
455 | 514 | const int ysi0 = ycoeff[yd].si[0]; |
456 | 514 | const int ysi1 = ycoeff[yd].si[1] - ysi0; |
457 | 514 | const float ysc0 = ycoeff[yd].sc[0]; |
458 | 514 | const float ysc1 = ycoeff[yd].sc[1]; |
459 | 514 | if (pysi0 < ysi0) // Advance ap0 to source row ysi0. |
460 | 255 | { |
461 | 255 | ap0 += (ysi0 - pysi0) * astride[1]; |
462 | 255 | pysi0 = ysi0; |
463 | 255 | } |
464 | 514 | const float* bp1 = bp0 + yd * bstride[1]; |
465 | 250k | for (xd = 0; xd < bdim[2]; xd++) |
466 | 250k | { |
467 | 250k | const ccv_nnc_bi_coeffs_t cof = xcoeff[xd]; |
468 | 250k | const float c00 = cof.sc[0] * ysc0; |
469 | 250k | const float c01 = cof.sc[1] * ysc0; |
470 | 250k | const float c10 = cof.sc[0] * ysc1; |
471 | 250k | const float c11 = cof.sc[1] * ysc1; |
472 | 250k | float* const ap00 = ap0 + cof.si[0] * astride[2]; |
473 | 250k | float* const ap01 = ap0 + cof.si[1] * astride[2]; |
474 | 250k | float* const ap10 = ap00 + ysi1 * astride[1]; |
475 | 250k | float* const ap11 = ap01 + ysi1 * astride[1]; |
476 | 1.00M | for (cd = 0; cd < bdim[3]; cd++) |
477 | 750k | { |
478 | 750k | ap00[cd] += bp1[cd] * c00; |
479 | 750k | ap01[cd] += bp1[cd] * c01; |
480 | 750k | ap10[cd] += bp1[cd] * c10; |
481 | 750k | ap11[cd] += bp1[cd] * c11; |
482 | 750k | } |
483 | 250k | bp1 += bstride[2]; |
484 | 250k | } |
485 | 514 | } |
486 | 2 | ap += astride[0]; |
487 | 2 | } |
488 | 2 | } |
489 | 4 | return CCV_NNC_EXEC_SUCCESS; |
490 | 4 | } |
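
The bilinear backward pass scatters each output gradient into the same four taps with the same weights c00..c11; since the four weights sum to 1, the gradient mass deposited into a equals the mass received from b. A minimal sketch of that conservation property (toy values):

#include <stdio.h>

int main(void)
{
	float grad_a[2][2] = { { 0.0f, 0.0f }, { 0.0f, 0.0f } };
	const float g = 1.0f; /* gradient of one output pixel */
	const float ysc0 = 0.75f, ysc1 = 0.25f, xsc0 = 0.25f, xsc1 = 0.75f;
	grad_a[0][0] += g * ysc0 * xsc0; /* c00 */
	grad_a[0][1] += g * ysc0 * xsc1; /* c01 */
	grad_a[1][0] += g * ysc1 * xsc0; /* c10 */
	grad_a[1][1] += g * ysc1 * xsc1; /* c11 */
	printf("sum = %g\n", grad_a[0][0] + grad_a[0][1] + grad_a[1][0] + grad_a[1][1]); /* 1 */
	return 0;
}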
491 | | |
492 | | static int _ccv_nnc_upsample_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
493 | 8 | { |
494 | 8 | if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_NEAREST) |
495 | 5 | return _ccv_nnc_upsample_nearest_forw(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
496 | 3 | else if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_BILINEAR) |
497 | 3 | return _ccv_nnc_upsample_bilinear_forw(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
498 | 0 | return CCV_NNC_EXEC_INVALID; |
499 | 8 | } |
500 | | |
501 | | static int _ccv_nnc_upsample_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
502 | 11 | { |
503 | 11 | if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_NEAREST) |
504 | 7 | return _ccv_nnc_upsample_nearest_back(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
505 | 4 | else if (cmd.info.upsample.type == CCV_NNC_UPSAMPLE_BILINEAR) |
506 | 4 | return _ccv_nnc_upsample_bilinear_back(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
507 | 0 | return CCV_NNC_EXEC_INVALID; |
508 | 11 | } |
509 | | |
510 | | REGISTER_COMMAND_BACKEND(CCV_NNC_UPSAMPLE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
511 | 1 | { |
512 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
513 | 1 | registry->tensor_datatypes = CCV_32F; |
514 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
515 | 1 | registry->algorithms = 1; |
516 | 1 | registry->exec = _ccv_nnc_upsample_forw; |
517 | 1 | } |
518 | | |
519 | | REGISTER_COMMAND_BACKEND(CCV_NNC_UPSAMPLE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
520 | 1 | { |
521 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
522 | 1 | registry->tensor_datatypes = CCV_32F; |
523 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
524 | 1 | registry->algorithms = 1; |
525 | 1 | registry->exec = _ccv_nnc_upsample_back; |
526 | 1 | } |
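
A hedged usage sketch for the registered CPU reference backend. The CMD_UPSAMPLE_FORWARD parameter order below is an assumption (check ccv_nnc_easy.h); it is written here as (type, width_scale, height_scale, align_corners):

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
	ccv_nnc_init();
	/* 1x4x4x1 NHWC input upsampled 2x to 1x8x8x1. */
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 4, 4, 1), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 8, 8, 1), 0);
	int i;
	for (i = 0; i < 16; i++)
		a->data.f32[i] = i; /* ramp input */
	/* Assumed macro signature; dispatches to _ccv_nnc_upsample_forw above. */
	ccv_nnc_cmd_exec(CMD_UPSAMPLE_FORWARD(CCV_NNC_UPSAMPLE_BILINEAR, 2, 2, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	return 0;
}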