/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/dropout/ccv_nnc_dropout_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | #include "3rdparty/dsfmt/dSFMT.h" |
13 | | |
14 | | // Shared methods. |
15 | | #include "../_ccv_nnc_cpu_ref.h" |
16 | | |
17 | | static int _ccv_nnc_dropout_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
18 | 4 | { |
19 | 4 | const float p = cmd.info.dropout.p; |
20 | 4 | const float inv_p = 1. / (1. - p); |
21 | 4 | assert(output_size >= 2); |
22 | | // Assuming this is float 32. |
23 | 4 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
24 | 4 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
25 | 4 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
26 | 4 | ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
27 | 4 | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; |
28 | 4 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
29 | 4 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
30 | 4 | ccv_nnc_tensor_view_get_dim(a, dim); |
31 | 4 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
32 | 4 | const int tensor_count = ccv_nnc_tensor_count(inputs[0]->info); |
33 | 4 | uint8_t* const maskdata = outputs[1]->data.u8; |
34 | 4 | dsfmt_t dsfmt; |
35 | 4 | dsfmt_init_gen_rand(&dsfmt, ccv_nnc_stream_context_genrand_uint32(stream_context)); |
36 | 4 | int x; |
37 | 4 | if (cmd.info.dropout.entirety) |
38 | 2 | { |
39 | 2 | const int32_t drop = ((int32_t*)maskdata)[0] = (dsfmt_genrand_open_close(&dsfmt) <= p); |
40 | 2 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b)) |
41 | 2 | { |
42 | | // Super optimal case, just do one for-loop to apply dropout. |
43 | 2.00k | for (x = 0; x < tensor_count; x++) |
44 | 2.00k | b->data.f32[x] = drop ? 0 : a->data.f32[x] * inv_p; |
45 | 2 | return CCV_NNC_EXEC_SUCCESS; |
46 | 2 | } |
47 | 2 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
48 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
49 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
50 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
51 | 0 | float* const ap = a->data.f32; |
52 | 0 | float* const bp = b->data.f32; |
53 | 0 | const int count = dim[2] * dim[3]; |
54 | 0 | if (astride[2] == dim[3] && bstride[2] == dim[3]) |
55 | 0 | { |
56 | | // Special casing when the inner rows are contiguous (astride[2] == dim[3]). |
57 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
58 | 0 | { |
59 | 0 | float* ap0 = ap + i[0] * astride[0]; |
60 | 0 | float* bp0 = bp + i[0] * bstride[0]; |
61 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
62 | 0 | { |
63 | 0 | for (x = 0; x < count; x++) |
64 | 0 | bp0[x] = drop ? 0 : ap0[x] * inv_p; |
65 | 0 | ap0 += astride[1]; |
66 | 0 | bp0 += bstride[1]; |
67 | 0 | } |
68 | 0 | } |
69 | 0 | return CCV_NNC_EXEC_SUCCESS; |
70 | 0 | } |
71 | | // Non-optimal case, need to do skip copy. |
72 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
73 | 0 | { |
74 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
75 | 0 | float* const bp0 = bp + i[0] * bstride[0]; |
76 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
77 | 0 | { |
78 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
79 | 0 | float* bp1 = bp0 + i[1] * bstride[1]; |
80 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
81 | 0 | { |
82 | 0 | for (x = 0; x < dim[3]; x++) |
83 | 0 | bp1[x] = drop ? 0 : ap1[x] * inv_p; |
84 | 0 | ap1 += astride[2]; |
85 | 0 | bp1 += bstride[2]; |
86 | 0 | } |
87 | 0 | } |
88 | 0 | } |
89 | 2 | } else { |
90 | 2 | uint8_t* maskp = maskdata + (tensor_count - 1); |
91 | 2.00k | for (; maskp >= maskdata; --maskp) |
92 | 2.00k | *maskp = (dsfmt_genrand_open_close(&dsfmt) <= p); |
93 | 2 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b)) |
94 | 2 | { |
95 | | // Super optimal case, just do one for-loop to apply dropout. |
96 | 2.00k | for (x = 0; x < tensor_count; x++) |
97 | 2.00k | b->data.f32[x] = maskdata[x] ? 0 : a->data.f32[x] * inv_p; |
98 | 2 | return CCV_NNC_EXEC_SUCCESS; |
99 | 2 | } |
100 | 2 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
101 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
102 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
103 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
104 | 0 | float* const ap = a->data.f32; |
105 | 0 | float* const bp = b->data.f32; |
106 | 0 | const int count = dim[2] * dim[3]; |
107 | 0 | maskp = maskdata; |
108 | 0 | if (astride[2] == dim[3] && bstride[2] == dim[3]) |
109 | 0 | { |
110 | | // Special casing when the inner rows are contiguous (astride[2] == dim[3]). |
111 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
112 | 0 | { |
113 | 0 | float* ap0 = ap + i[0] * astride[0]; |
114 | 0 | float* bp0 = bp + i[0] * bstride[0]; |
115 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
116 | 0 | { |
117 | 0 | for (x = 0; x < count; x++) |
118 | 0 | bp0[x] = maskp[x] ? 0 : ap0[x] * inv_p; |
119 | 0 | ap0 += astride[1]; |
120 | 0 | bp0 += bstride[1]; |
121 | 0 | maskp += count; |
122 | 0 | } |
123 | 0 | } |
124 | 0 | return CCV_NNC_EXEC_SUCCESS; |
125 | 0 | } |
126 | | // Non-optimal case, need to do skip copy. |
127 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
128 | 0 | { |
129 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
130 | 0 | float* const bp0 = bp + i[0] * bstride[0]; |
131 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
132 | 0 | { |
133 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
134 | 0 | float* bp1 = bp0 + i[1] * bstride[1]; |
135 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
136 | 0 | { |
137 | 0 | for (x = 0; x < dim[3]; x++) |
138 | 0 | bp1[x] = maskp[x] ? 0 : ap1[x] * inv_p; |
139 | 0 | maskp += dim[3]; |
140 | 0 | ap1 += astride[2]; |
141 | 0 | bp1 += bstride[2]; |
142 | 0 | } |
143 | 0 | } |
144 | 0 | } |
145 | 0 | } |
146 | 0 | return CCV_NNC_EXEC_SUCCESS; |
147 | 4 | } |
148 | | |
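Note on the forward kernel above: it implements inverted dropout. Each element is dropped with probability p, and survivors are scaled by inv_p = 1 / (1 - p) so the output keeps the input's expected value and no rescaling is needed at inference time. Below is a minimal, self-contained sketch of the per-element path; dropout_forw_sketch is a hypothetical name, and rand() is only a stand-in for the dSFMT generator the real kernel seeds from the stream context.

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the per-element forward rule: zero with probability p, scale survivors by 1 / (1 - p). */
static void dropout_forw_sketch(const float* a, float* b, unsigned char* mask, int count, float p)
{
	const float inv_p = 1.f / (1.f - p);
	int x;
	for (x = 0; x < count; x++)
	{
		mask[x] = ((float)rand() / (float)RAND_MAX) <= p; /* 1 means "dropped", mirrors the kernel's byte mask */
		b[x] = mask[x] ? 0 : a[x] * inv_p;
	}
}

int main(void)
{
	float a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, b[8];
	unsigned char mask[8];
	int x;
	dropout_forw_sketch(a, b, mask, 8, 0.5f);
	for (x = 0; x < 8; x++)
		printf("%g ", b[x]); /* surviving entries are doubled (inv_p = 2), dropped ones are 0 */
	printf("\n");
	return 0;
}
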
149 | | static int _ccv_nnc_dropout_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
150 | 2 | { |
151 | 2 | assert(input_size == 5); |
152 | 2 | const float p = cmd.info.dropout.p; |
153 | 2 | const float inv_p = 1. / (1. - p); |
154 | 2 | uint8_t* const maskdata = inputs[4]->data.u8; |
155 | | // Assuming this is float 32. |
156 | 2 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
157 | 2 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
158 | 2 | int hstride[CCV_NNC_MAX_DIM_ALLOC]; |
159 | 2 | ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; |
160 | 2 | ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; |
161 | 2 | assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2); |
162 | 2 | assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2); |
163 | 2 | ccv_nnc_tensor_view_get_dim(g, dim); |
164 | 2 | assert(ccv_nnc_tensor_view_check_dim(h, dim)); |
165 | 2 | int x; |
166 | 2 | if (cmd.info.dropout.entirety) |
167 | 1 | { |
168 | 1 | const int32_t drop = ((int32_t*)maskdata)[0]; |
169 | 1 | if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(h)) |
170 | 1 | { |
171 | | // Super optimal case, just do one for-loop to apply the saved mask to the gradient. |
172 | 1 | const int tensor_count = ccv_nnc_tensor_count(inputs[0]->info); |
173 | 1.00k | for (x = 0; x < tensor_count; x++) |
174 | 1.00k | h->data.f32[x] = drop ? 0 : g->data.f32[x] * inv_p; |
175 | 1 | return CCV_NNC_EXEC_SUCCESS; |
176 | 1 | } |
177 | 1 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
178 | 0 | ccv_nnc_tensor_view_get_stride(g, gstride); |
179 | 0 | ccv_nnc_tensor_view_get_stride(h, hstride); |
180 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
181 | 0 | float* const gp = g->data.f32; |
182 | 0 | float* const hp = h->data.f32; |
183 | 0 | const int count = dim[2] * dim[3]; |
184 | 0 | if (gstride[2] == dim[3] && hstride[2] == dim[3]) |
185 | 0 | { |
186 | | // Special casing when the inner rows are contiguous (gstride[2] == dim[3]). |
187 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
188 | 0 | { |
189 | 0 | float* gp0 = gp + i[0] * gstride[0]; |
190 | 0 | float* hp0 = hp + i[0] * hstride[0]; |
191 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
192 | 0 | { |
193 | 0 | for (x = 0; x < count; x++) |
194 | 0 | hp0[x] = drop ? 0 : gp0[x] * inv_p; |
195 | 0 | gp0 += gstride[1]; |
196 | 0 | hp0 += hstride[1]; |
197 | 0 | } |
198 | 0 | } |
199 | 0 | return CCV_NNC_EXEC_SUCCESS; |
200 | 0 | } |
201 | | // Non-optimal case, need to do skip copy. |
202 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
203 | 0 | { |
204 | 0 | float* const gp0 = gp + i[0] * gstride[0]; |
205 | 0 | float* const hp0 = hp + i[0] * hstride[0]; |
206 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
207 | 0 | { |
208 | 0 | float* gp1 = gp0 + i[1] * gstride[1]; |
209 | 0 | float* hp1 = hp0 + i[1] * hstride[1]; |
210 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
211 | 0 | { |
212 | 0 | for (x = 0; x < dim[3]; x++) |
213 | 0 | hp1[x] = drop ? 0 : gp1[x] * inv_p; |
214 | 0 | gp1 += gstride[2]; |
215 | 0 | hp1 += hstride[2]; |
216 | 0 | } |
217 | 0 | } |
218 | 0 | } |
219 | 1 | } else { |
220 | 1 | if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(h)) |
221 | 1 | { |
222 | | // Super optimal case, just do one for-loop to apply the saved mask to the gradient. |
223 | 1 | const int tensor_count = ccv_nnc_tensor_count(inputs[0]->info); |
224 | 1.00k | for (x = 0; x < tensor_count; x++) |
225 | 1.00k | h->data.f32[x] = maskdata[x] ? 0 : g->data.f32[x] * inv_p; |
226 | 1 | return CCV_NNC_EXEC_SUCCESS; |
227 | 1 | } |
228 | 1 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
229 | 0 | ccv_nnc_tensor_view_get_stride(g, gstride); |
230 | 0 | ccv_nnc_tensor_view_get_stride(h, hstride); |
231 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
232 | 0 | float* const gp = g->data.f32; |
233 | 0 | float* const hp = h->data.f32; |
234 | 0 | const int count = dim[2] * dim[3]; |
235 | 0 | uint8_t* maskp = maskdata; |
236 | 0 | if (gstride[2] == dim[3] && hstride[2] == dim[3]) |
237 | 0 | { |
238 | | // Special casing when the inner rows are contiguous (gstride[2] == dim[3]). |
239 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
240 | 0 | { |
241 | 0 | float* gp0 = gp + i[0] * gstride[0]; |
242 | 0 | float* hp0 = hp + i[0] * hstride[0]; |
243 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
244 | 0 | { |
245 | 0 | for (x = 0; x < count; x++) |
246 | 0 | hp0[x] = maskp[x] ? 0 : gp0[x] * inv_p; |
247 | 0 | gp0 += gstride[1]; |
248 | 0 | hp0 += hstride[1]; |
249 | 0 | maskp += count; |
250 | 0 | } |
251 | 0 | } |
252 | 0 | return CCV_NNC_EXEC_SUCCESS; |
253 | 0 | } |
254 | | // Non-optimal case, need to do skip copy. |
255 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
256 | 0 | { |
257 | 0 | float* const gp0 = gp + i[0] * gstride[0]; |
258 | 0 | float* const hp0 = hp + i[0] * hstride[0]; |
259 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
260 | 0 | { |
261 | 0 | float* gp1 = gp0 + i[1] * gstride[1]; |
262 | 0 | float* hp1 = hp0 + i[1] * hstride[1]; |
263 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
264 | 0 | { |
265 | 0 | for (x = 0; x < dim[3]; x++) |
266 | 0 | hp1[x] = maskp[x] ? 0 : gp1[x] * inv_p; |
267 | 0 | maskp += dim[3]; |
268 | 0 | gp1 += gstride[2]; |
269 | 0 | hp1 += hstride[2]; |
270 | 0 | } |
271 | 0 | } |
272 | 0 | } |
273 | 0 | } |
274 | 0 | return CCV_NNC_EXEC_SUCCESS; |
275 | 2 | } |
276 | | |
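The backward kernel above does not re-sample: it reuses the mask the forward kernel stored (outputs[1] there, inputs[4] here) and applies the same 1 / (1 - p) scaling to the incoming gradient. A minimal sketch of that rule, with a hypothetical helper name:

#include <stdio.h>

/* Sketch of the per-element backward rule: the gradient flows only through elements
 * that survived the forward pass, scaled by the same 1 / (1 - p) factor. The mask
 * plays the role of the byte mask saved by the forward kernel. */
static void dropout_back_sketch(const float* g, float* h, const unsigned char* mask, int count, float p)
{
	const float inv_p = 1.f / (1.f - p);
	int x;
	for (x = 0; x < count; x++)
		h[x] = mask[x] ? 0 : g[x] * inv_p;
}

int main(void)
{
	const float g[4] = { 1, 1, 1, 1 };
	const unsigned char mask[4] = { 0, 1, 0, 1 }; /* 1 means the element was dropped */
	float h[4];
	dropout_back_sketch(g, h, mask, 4, 0.5f);
	printf("%g %g %g %g\n", h[0], h[1], h[2], h[3]); /* expect 2 0 2 0 */
	return 0;
}
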
277 | | REGISTER_COMMAND_BACKEND(CCV_NNC_DROPOUT_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
278 | 1 | { |
279 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
280 | 1 | registry->tensor_datatypes = CCV_32F; |
281 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
282 | 1 | registry->algorithms = 1; |
283 | 1 | registry->exec = _ccv_nnc_dropout_forw; |
284 | 1 | } |
285 | | |
286 | | REGISTER_COMMAND_BACKEND(CCV_NNC_DROPOUT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
287 | 1 | { |
288 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
289 | 1 | registry->tensor_datatypes = CCV_32F; |
290 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
291 | 1 | registry->algorithms = 1; |
292 | 1 | registry->exec = _ccv_nnc_dropout_back; |
293 | 1 | } |
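
The two registrations above expose this reference kernel as the CPU_REF backend for the dropout commands. A rough usage sketch follows, assuming CMD_DROPOUT_FORWARD, CPU_TENSOR_NHWC, TENSOR_LIST and ccv_nnc_cmd_exec behave as in the library's test suite; the mask tensor's shape and datatype here are an assumption, chosen only so it has at least one byte per element (a single int32 would suffice when entirety is set).

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
	ccv_nnc_init();
	/* 20x50 float input; b receives the dropped-and-scaled output, mask the per-element mask. */
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 20, 50), 0);
	/* Assumption: a 32S tensor of the same shape gives the mask at least one byte per element. */
	ccv_nnc_tensor_t* const mask = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 20, 50), 0);
	int i;
	for (i = 0; i < 20 * 50; i++)
		a->data.f32[i] = 1;
	/* Assumed macro: CMD_DROPOUT_FORWARD(0.4) builds the forward command with p = 0.4. */
	ccv_nnc_cmd_exec(CMD_DROPOUT_FORWARD(0.4), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b, mask), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(mask);
	return 0;
}

With all inputs equal to 1 and p = 0.4, surviving entries of b come out as 1 / 0.6 (about 1.67) and the rest are 0, which is the expectation-preserving scaling the forward kernel applies.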