/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/norm/ccv_nnc_layer_norm_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | static int _ccv_nnc_layer_norm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
17 | 10 | { |
18 | 10 | assert(input_size == 3 || input_size == 1); |
19 | 10 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0]; |
20 | 10 | ccv_nnc_tensor_view_t* const scale = input_size >= 2 ? (ccv_nnc_tensor_view_t*)inputs[1] : 0; |
21 | 10 | ccv_nnc_tensor_view_t* const bias = input_size >= 3 ? (ccv_nnc_tensor_view_t*)inputs[2] : 0; |
22 | 10 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0]; |
23 | 10 | ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)outputs[1]; |
24 | 10 | ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)outputs[2]; |
25 | 10 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
26 | 10 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
27 | | // Assuming this is float 32. |
28 | 10 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
29 | 10 | int rdim[CCV_NNC_MAX_DIM_ALLOC]; |
30 | 10 | ccv_nnc_tensor_view_get_dim(a, adim); |
31 | 10 | ccv_nnc_tensor_view_get_dim(saved_mean, rdim); |
32 | 10 | assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim)); |
33 | 10 | assert(ccv_nnc_tensor_view_check_dim(b, adim)); |
34 | 10 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
35 | 10 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
36 | 10 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
37 | 10 | int scale_stride[CCV_NNC_MAX_DIM_ALLOC]; |
38 | 10 | int bias_stride[CCV_NNC_MAX_DIM_ALLOC]; |
39 | 10 | ccv_nnc_tensor_view_get_stride(a, astride); |
40 | 10 | if (scale) |
41 | 5 | ccv_nnc_tensor_view_get_stride(scale, scale_stride); |
42 | 10 | if (bias) |
43 | 5 | ccv_nnc_tensor_view_get_stride(bias, bias_stride); |
44 | 10 | ccv_nnc_tensor_view_get_stride(b, bstride); |
45 | | // The epsilon is used a little differently from batch norm; it is outside of the sqrt in this case. |
46 | 10 | const float epsilon = cmd.info.lnorm.epsilon; |
47 | 10 | int saved_mean_stride[CCV_NNC_MAX_DIM_ALLOC]; |
48 | 10 | int saved_inv_std_stride[CCV_NNC_MAX_DIM_ALLOC]; |
49 | 10 | ccv_nnc_tensor_view_get_stride(saved_mean, saved_mean_stride); |
50 | 10 | ccv_nnc_tensor_view_get_stride(saved_inv_std, saved_inv_std_stride); |
51 | 10 | int x; |
52 | 10 | int n = 1; |
53 | 50 | for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++) |
54 | 40 | n *= adim[x]; |
55 | 50 | for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++) |
56 | 40 | n /= rdim[x]; |
57 | 10 | const float inv_n = 1. / n; |
58 | 10 | _ccv_nnc_reduce_sum_forw_cpu_ref(a, saved_mean); |
59 | 10 | _ccv_nnc_mul_forw_cpu_ref(inv_n, saved_mean, 0, saved_mean); |
60 | 10 | ccv_nnc_tensor_zero(saved_inv_std); |
61 | 10 | float* const ap = a->data.f32; |
62 | 10 | float* const meanp = saved_mean->data.f32; |
63 | 10 | float* const varp = saved_inv_std->data.f32; |
64 | 10 | int i[CCV_NNC_MAX_DIM + 2]; |
65 | 54 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
66 | 44 | { |
67 | 44 | float* const ap0 = ap + i[0] * astride[0]; |
68 | 44 | float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_stride[0]; |
69 | 44 | float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0]; |
70 | 196 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
71 | 152 | { |
72 | 152 | float* ap1 = ap0 + i[1] * astride[1]; |
73 | 152 | float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_stride[1]; |
74 | 152 | float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1]; |
75 | 712 | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
76 | 560 | { |
77 | 560 | float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_stride[2]; |
78 | 560 | float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2]; |
79 | 560 | if (rdim[3] == 1) |
80 | 6.16k | for (x = 0; x < adim[3]; x++) |
81 | 5.60k | { |
82 | 5.60k | float w = ap1[x * astride[3]] - meanp2[0]; |
83 | 5.60k | varp2[0] += w * w; |
84 | 5.60k | } |
85 | 0 | else |
86 | 0 | for (x = 0; x < adim[3]; x++) |
87 | 0 | { |
88 | 0 | float w = ap1[x * astride[3]] - meanp2[x]; |
89 | 0 | varp2[x] += w * w; |
90 | 0 | } |
91 | 560 | ap1 += astride[2]; |
92 | 560 | } |
93 | 152 | } |
94 | 44 | } |
95 | 54 | for (i[0] = 0; i[0] < rdim[0]; i[0]++) |
96 | 44 | { |
97 | 44 | float* const varp0 = varp + i[0] * saved_inv_std_stride[0]; |
98 | 88 | for (i[1] = 0; i[1] < rdim[1]; i[1]++) |
99 | 44 | { |
100 | 44 | float* const varp1 = varp0 + i[1] * saved_inv_std_stride[1]; |
101 | 88 | for (i[2] = 0; i[2] < rdim[2]; i[2]++) |
102 | 44 | { |
103 | 44 | float* const varp2 = varp1 + i[2] * saved_inv_std_stride[2]; |
104 | 88 | for (x = 0; x < rdim[3]; x++) |
105 | 44 | varp2[x] = 1. / sqrtf(varp2[x] * inv_n + epsilon); |
106 | 44 | } |
107 | 44 | } |
108 | 44 | } |
109 | 10 | if (cmd.info.lnorm.elementwise_affine) |
110 | 5 | { |
111 | 5 | assert(scale && bias && "Should have both scale and bias"); |
112 | 5 | float* const scalep = scale->data.f32; |
113 | 5 | float* const biasp = bias->data.f32; |
114 | 5 | int sdim[CCV_NNC_MAX_DIM_ALLOC]; |
115 | 5 | ccv_nnc_tensor_view_get_dim(scale, sdim); |
116 | 5 | int bias_dim[CCV_NNC_MAX_DIM_ALLOC]; |
117 | 5 | ccv_nnc_tensor_view_get_dim(bias, bias_dim); |
118 | | // Do the straightforward one, y = (x - mean) * inv_std * scale + bias; we cannot allocate extra memory to help. |
119 | | // There is no need to precompute since scale / bias is per element. |
120 | 5 | float* const bp = b->data.f32; |
121 | 27 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
122 | 22 | { |
123 | 22 | float* const ap0 = ap + i[0] * astride[0]; |
124 | 22 | float* const bp0 = bp + i[0] * bstride[0]; |
125 | 22 | float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_stride[0]; |
126 | 22 | float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0]; |
127 | 22 | float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0]; |
128 | 22 | float* const biasp0 = bias_dim[0] == 1 ? biasp : biasp + i[0] * bias_stride[0]; |
129 | 98 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
130 | 76 | { |
131 | 76 | float* ap1 = ap0 + i[1] * astride[1]; |
132 | 76 | float* bp1 = bp0 + i[1] * bstride[1]; |
133 | 76 | float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_stride[1]; |
134 | 76 | float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1]; |
135 | 76 | float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1]; |
136 | 76 | float* const biasp1 = bias_dim[1] == 1 ? biasp0 : biasp0 + i[1] * bias_stride[1]; |
137 | 356 | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
138 | 280 | { |
139 | 280 | float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_stride[2]; |
140 | 280 | float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2]; |
141 | 280 | float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2]; |
142 | 280 | float* const biasp2 = bias_dim[2] == 1 ? biasp1 : biasp1 + i[2] * bias_stride[2]; |
143 | 280 | if (rdim[3] == 1) |
144 | 3.08k | for (x = 0; x < adim[3]; x++) |
145 | 2.80k | bp1[x] = (ap1[x * astride[3]] - meanp2[0]) * varp2[0] * scalep2[sdim[3] == 1 ? 0 : x] + biasp2[bias_dim[3] == 1 ? 0 : x]; |
146 | 0 | else |
147 | 0 | for (x = 0; x < adim[3]; x++) |
148 | 0 | bp1[x] = (ap1[x * astride[3]] - meanp2[x]) * varp2[x] * scalep2[sdim[3] == 1 ? 0 : x] + biasp2[bias_dim[3] == 1 ? 0 : x]; |
149 | 280 | ap1 += astride[2]; |
150 | 280 | bp1 += bstride[2]; |
151 | 280 | } |
152 | 76 | } |
153 | 22 | } |
154 | 5 | } else { |
155 | | // Do the straightforward one, y = (x - mean) * inv_std; we cannot allocate extra memory to help. |
156 | 5 | float* const bp = b->data.f32; |
157 | 27 | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
158 | 22 | { |
159 | 22 | float* const ap0 = ap + i[0] * astride[0]; |
160 | 22 | float* const bp0 = bp + i[0] * bstride[0]; |
161 | 22 | float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_stride[0]; |
162 | 22 | float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0]; |
163 | 98 | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
164 | 76 | { |
165 | 76 | float* ap1 = ap0 + i[1] * astride[1]; |
166 | 76 | float* bp1 = bp0 + i[1] * bstride[1]; |
167 | 76 | float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_stride[1]; |
168 | 76 | float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1]; |
169 | 356 | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
170 | 280 | { |
171 | 280 | float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_stride[2]; |
172 | 280 | float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2]; |
173 | 280 | if (rdim[3] == 1) |
174 | 3.08k | for (x = 0; x < adim[3]; x++) |
175 | 2.80k | bp1[x] = (ap1[x * astride[3]] - meanp2[0]) * varp2[0]; |
176 | 0 | else |
177 | 0 | for (x = 0; x < adim[3]; x++) |
178 | 0 | bp1[x] = (ap1[x * astride[3]] - meanp2[x]) * varp2[x]; |
179 | 280 | ap1 += astride[2]; |
180 | 280 | bp1 += bstride[2]; |
181 | 280 | } |
182 | 76 | } |
183 | 22 | } |
184 | 5 | } |
185 | 10 | return CCV_NNC_EXEC_SUCCESS; |
186 | 10 | } |
187 | | |
188 | | static int _ccv_nnc_layer_norm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
189 | 6 | { |
190 | 6 | assert(input_size == 9 || input_size == 7); |
191 | 6 | assert(output_size >= 1); |
192 | 6 | const int elementwise_affine = cmd.info.lnorm.elementwise_affine; |
193 | 6 | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
194 | 6 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[3]; |
195 | 6 | ccv_nnc_tensor_view_t* const scale = elementwise_affine ? (ccv_nnc_tensor_view_t*)inputs[4] : 0; |
196 | 6 | ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)inputs[elementwise_affine ? 7 : 5]; |
197 | 6 | ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)inputs[elementwise_affine ? 8 : 6]; |
198 | 6 | ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0]; |
199 | 6 | ccv_nnc_tensor_view_t* const dscale = output_size > 1 ? (ccv_nnc_tensor_view_t*)outputs[1] : 0; |
200 | 6 | ccv_nnc_tensor_view_t* const dbias = output_size > 2 ? (ccv_nnc_tensor_view_t*)outputs[2] : 0; |
201 | 6 | assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2); |
202 | 6 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
203 | 6 | assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2); |
204 | | // Assuming this is float 32. |
205 | 6 | int gdim[CCV_NNC_MAX_DIM_ALLOC]; |
206 | 6 | int rdim[CCV_NNC_MAX_DIM_ALLOC]; |
207 | 6 | ccv_nnc_tensor_view_get_dim(g, gdim); |
208 | 6 | ccv_nnc_tensor_view_get_dim(saved_mean, rdim); |
209 | 6 | assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim)); |
210 | 6 | int sdim[CCV_NNC_MAX_DIM_ALLOC]; |
211 | 6 | if (scale) |
212 | 3 | ccv_nnc_tensor_view_get_dim(scale, sdim); |
213 | 6 | if (dscale) |
214 | 2 | { assert(ccv_nnc_tensor_view_check_dim(dscale, sdim)); } |
215 | 6 | assert(ccv_nnc_tensor_view_check_dim(a, gdim)); |
216 | 6 | assert(ccv_nnc_tensor_view_check_dim(h, gdim)); |
217 | 6 | if (dbias) |
218 | 2 | _ccv_nnc_reduce_sum_forw_cpu_ref(g, dbias); |
219 | 6 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
220 | 6 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
221 | 6 | int hstride[CCV_NNC_MAX_DIM_ALLOC]; |
222 | 6 | int scale_stride[CCV_NNC_MAX_DIM_ALLOC]; |
223 | 6 | int mean_stride[CCV_NNC_MAX_DIM_ALLOC]; |
224 | 6 | int inv_std_stride[CCV_NNC_MAX_DIM_ALLOC]; |
225 | 6 | int dscale_stride[CCV_NNC_MAX_DIM_ALLOC]; |
226 | 6 | ccv_nnc_tensor_view_get_stride(a, astride); |
227 | 6 | ccv_nnc_tensor_view_get_stride(g, gstride); |
228 | 6 | ccv_nnc_tensor_view_get_stride(h, hstride); |
229 | 6 | if (scale) |
230 | 3 | ccv_nnc_tensor_view_get_stride(scale, scale_stride); |
231 | 6 | ccv_nnc_tensor_view_get_stride(saved_mean, mean_stride); |
232 | 6 | ccv_nnc_tensor_view_get_stride(saved_inv_std, inv_std_stride); |
233 | 6 | if (dscale) |
234 | 2 | ccv_nnc_tensor_view_get_stride(dscale, dscale_stride); |
235 | | // Need to allocate two additional buffers: |
236 | | // 1. normalized a; |
237 | | // 2. scale * inv_std / n; |
238 | 6 | assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC)); |
239 | 6 | int x; |
240 | 6 | int n = 1; |
241 | 30 | for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++) |
242 | 24 | n *= gdim[x]; |
243 | 30 | for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++) |
244 | 24 | n /= rdim[x]; |
245 | 6 | int gcount = 1, rcount = 1; |
246 | 30 | for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++) |
247 | 24 | gcount *= gdim[x], rcount *= rdim[x]; |
248 | 6 | float* const ah = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * gcount * 2 + sizeof(float) * rcount * 2, CCV_TENSOR_CPU_MEMORY); |
249 | 6 | float* const gss = ah + gcount; // g * scale * inv_std |
250 | 6 | float* const gssr = gss + gcount; // gss reduced to inv_std dimension |
251 | 6 | float* const ahgssr = gssr + rcount; // ah * gss then reduced to inv_std dimension. |
252 | 6 | int i[CCV_NNC_MAX_DIM + 2]; |
253 | 6 | float* ahp = ah; |
254 | 6 | const float* const meanp = saved_mean->data.f32; |
255 | 6 | const float* const inv_stdp = saved_inv_std->data.f32; |
256 | 6 | const float* const ap = a->data.f32; |
257 | 30 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
258 | 24 | { |
259 | 24 | const float* const ap0 = ap + i[0] * astride[0]; |
260 | 24 | const float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * mean_stride[0]; |
261 | 24 | const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0]; |
262 | 104 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
263 | 80 | { |
264 | 80 | const float* ap1 = ap0 + i[1] * astride[1]; |
265 | 80 | const float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * mean_stride[1]; |
266 | 80 | const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1]; |
267 | 368 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
268 | 288 | { |
269 | 288 | const float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * mean_stride[2]; |
270 | 288 | const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2]; |
271 | 288 | if (rdim[3] == 1) |
272 | 3.16k | for (x = 0; x < gdim[3]; x++) |
273 | 2.88k | ahp[x] = (ap1[x] - meanp2[0]) * inv_stdp2[0]; |
274 | 0 | else |
275 | 0 | for (x = 0; x < gdim[3]; x++) |
276 | 0 | ahp[x] = (ap1[x] - meanp2[x]) * inv_stdp2[x]; |
277 | 288 | ap1 += astride[2]; |
278 | 288 | ahp += gdim[3]; |
279 | 288 | } |
280 | 80 | } |
281 | 24 | } |
282 | 6 | if (dscale) |
283 | 2 | { |
284 | 2 | ccv_nnc_tensor_zero(dscale); |
285 | 2 | ahp = ah; |
286 | 2 | float* gssp = gss; |
287 | 2 | const float* const gp = g->data.f32; |
288 | 2 | const float* const scalep = scale->data.f32; |
289 | 2 | float* const dscalep = dscale->data.f32; |
290 | 12 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
291 | 10 | { |
292 | 10 | const float* const gp0 = gp + i[0] * gstride[0]; |
293 | 10 | const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0]; |
294 | 10 | const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0]; |
295 | 10 | float* const dscalep0 = sdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_stride[0]; |
296 | 46 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
297 | 36 | { |
298 | 36 | const float* gp1 = gp0 + i[1] * gstride[1]; |
299 | 36 | const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1]; |
300 | 36 | const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1]; |
301 | 36 | float* const dscalep1 = sdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_stride[1]; |
302 | 172 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
303 | 136 | { |
304 | 136 | const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2]; |
305 | 136 | const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2]; |
306 | 136 | float* const dscalep2 = sdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_stride[2]; |
307 | 136 | if (sdim[3] == 1) |
308 | 0 | for (x = 0; x < gdim[3]; x++) |
309 | 0 | { |
310 | 0 | gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x]; |
311 | 0 | dscalep2[0] += ahp[x] * gp1[x]; |
312 | 0 | } |
313 | 136 | else |
314 | 1.49k | for (x = 0; x < gdim[3]; x++) |
315 | 1.36k | { |
316 | 1.36k | gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x]; |
317 | 1.36k | dscalep2[x] += ahp[x] * gp1[x]; |
318 | 1.36k | } |
319 | 136 | gp1 += gstride[2]; |
320 | 136 | ahp += gdim[3]; |
321 | 136 | gssp += gdim[3]; |
322 | 136 | } |
323 | 36 | } |
324 | 10 | } |
325 | 4 | } else { |
326 | 4 | float* gssp = gss; |
327 | 4 | const float* const gp = g->data.f32; |
328 | 4 | if (elementwise_affine) |
329 | 1 | { |
330 | 1 | const float* const scalep = scale->data.f32; |
331 | 3 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
332 | 2 | { |
333 | 2 | const float* const gp0 = gp + i[0] * gstride[0]; |
334 | 2 | const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0]; |
335 | 2 | const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0]; |
336 | 6 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
337 | 4 | { |
338 | 4 | const float* gp1 = gp0 + i[1] * gstride[1]; |
339 | 4 | const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1]; |
340 | 4 | const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1]; |
341 | 12 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
342 | 8 | { |
343 | 8 | const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2]; |
344 | 8 | const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2]; |
345 | 8 | if (sdim[3] == 1) |
346 | 0 | for (x = 0; x < gdim[3]; x++) |
347 | 0 | gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x]; |
348 | 8 | else |
349 | 88 | for (x = 0; x < gdim[3]; x++) |
350 | 80 | gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x]; |
351 | 8 | gp1 += gstride[2]; |
352 | 8 | gssp += gdim[3]; |
353 | 8 | } |
354 | 4 | } |
355 | 2 | } |
356 | 3 | } else { |
357 | 15 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
358 | 12 | { |
359 | 12 | const float* const gp0 = gp + i[0] * gstride[0]; |
360 | 12 | const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0]; |
361 | 52 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
362 | 40 | { |
363 | 40 | const float* gp1 = gp0 + i[1] * gstride[1]; |
364 | 40 | const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1]; |
365 | 184 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
366 | 144 | { |
367 | 144 | const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2]; |
368 | 1.58k | for (x = 0; x < gdim[3]; x++) |
369 | 1.44k | gssp[x] = gp1[x] * inv_stdp2[rdim[3] == 1 ? 0 : x]; |
370 | 144 | gp1 += gstride[2]; |
371 | 144 | gssp += gdim[3]; |
372 | 144 | } |
373 | 40 | } |
374 | 12 | } |
375 | 3 | } |
376 | 4 | } |
377 | 6 | ccv_nnc_tensor_t gsst = ccv_nnc_tensor(gss, g->info, 0); |
378 | 6 | ccv_nnc_tensor_t gssrt = ccv_nnc_tensor(gssr, saved_mean->info, 0); |
379 | 6 | _ccv_nnc_reduce_sum_forw_cpu_ref((ccv_nnc_tensor_view_t*)&gsst, (ccv_nnc_tensor_view_t*)&gssrt); |
380 | 6 | ahp = ah; |
381 | 6 | float* gssp = gss; |
382 | 6 | ccv_nnc_tensor_t ahgssrt = ccv_nnc_tensor(ahgssr, saved_mean->info, 0); |
383 | 6 | ccv_nnc_tensor_zero(&ahgssrt); |
384 | 6 | float* const ahgssrp = ahgssr; |
385 | 30 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
386 | 24 | { |
387 | 24 | float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3]; |
388 | 104 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
389 | 80 | { |
390 | 80 | float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3]; |
391 | 368 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
392 | 288 | { |
393 | 288 | float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3]; |
394 | 288 | if (rdim[3] == 1) |
395 | 3.16k | for (x = 0; x < gdim[3]; x++) |
396 | 2.88k | ahgssrp2[0] += ahp[x] * gssp[x]; |
397 | 0 | else |
398 | 0 | for (x = 0; x < gdim[3]; x++) |
399 | 0 | ahgssrp2[x] += ahp[x] * gssp[x]; |
400 | 288 | ahp += gdim[3]; |
401 | 288 | gssp += gdim[3]; |
402 | 288 | } |
403 | 80 | } |
404 | 24 | } |
405 | | // Now the part to compute dx (h). |
406 | 6 | float* const hp = h->data.f32; |
407 | 6 | ahp = ah; |
408 | 6 | const float inv_n = 1. / n; |
409 | 6 | gssp = gss; |
410 | 6 | const float* const gssrp = gssr; |
411 | 30 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
412 | 24 | { |
413 | 24 | float* const hp0 = hp + i[0] * hstride[0]; |
414 | 24 | const float* const gssrp0 = rdim[0] == 1 ? gssrp : gssrp + i[0] * rdim[1] * rdim[2] * rdim[3]; |
415 | 24 | const float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3]; |
416 | 104 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
417 | 80 | { |
418 | 80 | float* hp1 = hp0 + i[1] * hstride[1]; |
419 | 80 | const float* const gssrp1 = rdim[1] == 1 ? gssrp0 : gssrp0 + i[1] * rdim[2] * rdim[3]; |
420 | 80 | const float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3]; |
421 | 368 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
422 | 288 | { |
423 | 288 | const float* const gssrp2 = rdim[2] == 1 ? gssrp1 : gssrp1 + i[2] * rdim[3]; |
424 | 288 | const float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3]; |
425 | 288 | if (rdim[3] == 1) |
426 | 3.16k | for (x = 0; x < gdim[3]; x++) |
427 | 2.88k | hp1[x] = gssp[x] - inv_n * (gssrp2[0] + ahp[x] * ahgssrp2[0]); |
428 | 0 | else |
429 | 0 | for (x = 0; x < gdim[3]; x++) |
430 | 0 | hp1[x] = gssp[x] - inv_n * (gssrp2[x] + ahp[x] * ahgssrp2[x]); |
431 | 288 | hp1 += hstride[2]; |
432 | 288 | ahp += gdim[3]; |
433 | 288 | gssp += gdim[3]; |
434 | 288 | } |
435 | 80 | } |
436 | 24 | } |
437 | 6 | return CCV_NNC_EXEC_SUCCESS; |
438 | 6 | } |
439 | | |
440 | | REGISTER_COMMAND_BACKEND(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
441 | 1 | { |
442 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
443 | 1 | registry->tensor_datatypes = CCV_32F; |
444 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
445 | 1 | registry->algorithms = 1; |
446 | 1 | registry->exec = _ccv_nnc_layer_norm_forw; |
447 | 1 | } |
448 | | |
449 | | REGISTER_COMMAND_BACKEND(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
450 | 1 | { |
451 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
452 | 1 | registry->tensor_datatypes = CCV_32F; |
453 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
454 | 1 | registry->algorithms = 1; |
455 | 1 | registry->exec = _ccv_nnc_layer_norm_back; |
456 | 1 | } |
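
For reference, the forward kernel above computes, per normalization group, y = (x - mean) * inv_std * scale + bias with inv_std = 1 / sqrtf(variance + epsilon) (line 105), and keeps saved_mean and saved_inv_std for the backward pass. The snippet below is a minimal, hypothetical single-group sketch of that math; it is not part of ccv, and it collapses the strided 4-D indexing of the real kernel into one flat group.

#include <math.h>
#include <stdio.h>

// Normalize n contiguous floats: y = (x - mean) * inv_std * scale + bias.
// scale / bias may be 0, mirroring the non-affine path of the kernel above.
static void layer_norm_1d(const float* x, float* y, const int n, const float* scale, const float* bias, const float epsilon)
{
	float mean = 0;
	int i;
	for (i = 0; i < n; i++)
		mean += x[i];
	mean /= n;
	float var = 0;
	for (i = 0; i < n; i++)
	{
		const float w = x[i] - mean;
		var += w * w;
	}
	// Epsilon is added inside the sqrt, matching line 105 of the listing.
	const float inv_std = 1.f / sqrtf(var / n + epsilon);
	for (i = 0; i < n; i++)
		y[i] = (x[i] - mean) * inv_std * (scale ? scale[i] : 1.f) + (bias ? bias[i] : 0.f);
}

int main(void)
{
	const float x[4] = { 1, 2, 3, 4 };
	float y[4];
	layer_norm_1d(x, y, 4, 0, 0, 1e-5f);
	printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
	return 0;
}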
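The backward kernel follows the same structure: it materializes the normalized input ah = (a - mean) * inv_std and gss = g * scale * inv_std, reduces gss and ah * gss over the normalized dimensions, and then computes h = gss - (sum(gss) + ah * sum(ah * gss)) / n (line 427); dbias is the plain reduction of g (line 218) and dscale accumulates ah * g (line 317). Below is a minimal, hypothetical single-group sketch of the input gradient; the function name and flat layout are illustrative only and not part of ccv.

// Gradient w.r.t. the input for one group of n floats, given the upstream
// gradient g, the saved mean and inv_std, and the optional scale.
static void layer_norm_grad_1d(const float* g, const float* x, float* dx, const int n, const float* scale, const float mean, const float inv_std)
{
	float gss_sum = 0, ahgss_sum = 0;
	int i;
	for (i = 0; i < n; i++)
	{
		const float ah = (x[i] - mean) * inv_std; // normalized input
		const float gss = g[i] * (scale ? scale[i] : 1.f) * inv_std;
		gss_sum += gss;
		ahgss_sum += ah * gss;
	}
	for (i = 0; i < n; i++)
	{
		const float ah = (x[i] - mean) * inv_std;
		const float gss = g[i] * (scale ? scale[i] : 1.f) * inv_std;
		dx[i] = gss - (gss_sum + ah * ahgss_sum) / n;
	}
}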