/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/norm/ccv_nnc_rmsnorm_cpu_ref.c
#include "ccv.h"
#include "ccv_internal.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include "nnc/ccv_nnc_internal.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif

// Shared methods.
#include "../_ccv_nnc_cpu_ref.h"
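
// What the two kernels below compute, in index-free form:
//   forward:  inv_std = 1 / sqrt(mean(x^2) + epsilon), y = x * inv_std * scale
//   backward: with x_hat = x * inv_std and gss = g * scale * inv_std,
//             dx = gss - x_hat * mean(x_hat * gss), dscale = sum(g * x_hat),
// where the mean runs over the dimensions reduced away in saved_inv_std and the
// dscale sum runs over the dimensions in which scale is broadcast.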
static int _ccv_nnc_rmsnorm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 2);
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
	ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)outputs[1];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
	// Assuming this is float 32.
	int adim[CCV_NNC_MAX_DIM_ALLOC];
	int rdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(a, adim);
	ccv_nnc_tensor_view_get_dim(saved_inv_std, rdim);
	assert(ccv_nnc_tensor_view_check_dim(b, adim));
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic if CCV_NNC_MAX_DIM is another number.
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	int bstride[CCV_NNC_MAX_DIM_ALLOC];
	int scale_stride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	ccv_nnc_tensor_view_get_stride(scale, scale_stride);
	ccv_nnc_tensor_view_get_stride(b, bstride);
	// The epsilon is added to the mean of the squares inside the sqrt below.
	const float epsilon = cmd.info.rmsnorm.epsilon;
	int saved_inv_std_stride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(saved_inv_std, saved_inv_std_stride);
	int x;
	int n = 1;
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n *= adim[x];
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n /= rdim[x];
	const float inv_n = 1. / n;
	ccv_nnc_tensor_zero(saved_inv_std);
	float* const ap = a->data.f32;
	float* const varp = saved_inv_std->data.f32;
	int i[CCV_NNC_MAX_DIM + 2];
	for (i[0] = 0; i[0] < adim[0]; i[0]++)
	{
		float* const ap0 = ap + i[0] * astride[0];
		float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
		for (i[1] = 0; i[1] < adim[1]; i[1]++)
		{
			float* ap1 = ap0 + i[1] * astride[1];
			float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
			for (i[2] = 0; i[2] < adim[2]; i[2]++)
			{
				float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
				if (rdim[3] == 1)
					for (x = 0; x < adim[3]; x++)
					{
						float w = ap1[x * astride[3]];
						varp2[0] += w * w;
					}
				else
					for (x = 0; x < adim[3]; x++)
					{
						float w = ap1[x * astride[3]];
						varp2[x] += w * w;
					}
				ap1 += astride[2];
			}
		}
	}
	for (i[0] = 0; i[0] < rdim[0]; i[0]++)
	{
		float* const varp0 = varp + i[0] * saved_inv_std_stride[0];
		for (i[1] = 0; i[1] < rdim[1]; i[1]++)
		{
			float* const varp1 = varp0 + i[1] * saved_inv_std_stride[1];
			for (i[2] = 0; i[2] < rdim[2]; i[2]++)
			{
				float* const varp2 = varp1 + i[2] * saved_inv_std_stride[2];
				for (x = 0; x < rdim[3]; x++)
					varp2[x] = 1. / sqrtf(varp2[x] * inv_n + epsilon);
			}
		}
	}
	float* const scalep = scale->data.f32;
	int sdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(scale, sdim);
	// Do the straightforward one, y = x * inv_std * scale; we cannot allocate extra memory to help.
	// There is no need to precompute anything since scale is per element.
	float* const bp = b->data.f32;
	for (i[0] = 0; i[0] < adim[0]; i[0]++)
	{
		float* const ap0 = ap + i[0] * astride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
		float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
		for (i[1] = 0; i[1] < adim[1]; i[1]++)
		{
			float* ap1 = ap0 + i[1] * astride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
			float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
			for (i[2] = 0; i[2] < adim[2]; i[2]++)
			{
				float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
				float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
				if (rdim[3] == 1)
					for (x = 0; x < adim[3]; x++)
						bp1[x] = ap1[x * astride[3]] * varp2[0] * scalep2[sdim[3] == 1 ? 0 : x];
				else
					for (x = 0; x < adim[3]; x++)
						bp1[x] = ap1[x * astride[3]] * varp2[x] * scalep2[sdim[3] == 1 ? 0 : x];
				ap1 += astride[2];
				bp1 += bstride[2];
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
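
// A minimal sketch of the same forward math on a contiguous row-major array. It
// is shown only to make the strided kernel above easier to follow; it is not
// part of the registered backend, and it ignores tensor views, strides and
// broadcasting. sqrtf comes from <math.h>, which this file already pulls in
// through ccv.h.
static void _rmsnorm_forw_sketch(const float* const x, const float* const scale, float* const y, const int rows, const int cols, const float epsilon)
{
	int r, c;
	for (r = 0; r < rows; r++)
	{
		const float* const xr = x + r * cols;
		float* const yr = y + r * cols;
		float sumsq = 0;
		for (c = 0; c < cols; c++)
			sumsq += xr[c] * xr[c];
		// inv_std = 1 / sqrt(mean(x^2) + epsilon), with the epsilon inside the sqrt.
		const float inv_std = 1.f / sqrtf(sumsq / cols + epsilon);
		for (c = 0; c < cols; c++)
			yr[c] = xr[c] * inv_std * scale[c];
	}
}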

static int _ccv_nnc_rmsnorm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 6);
	assert(output_size >= 1);
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[2];
	ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[3];
	ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)inputs[5];
	ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
	ccv_nnc_tensor_view_t* const dscale = output_size > 1 ? (ccv_nnc_tensor_view_t*)outputs[1] : 0;
	assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
	// Assuming this is float 32.
	int gdim[CCV_NNC_MAX_DIM_ALLOC];
	int rdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(g, gdim);
	ccv_nnc_tensor_view_get_dim(saved_inv_std, rdim);
	int sdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(scale, sdim);
	if (dscale)
		{ assert(ccv_nnc_tensor_view_check_dim(dscale, sdim)); }
	assert(ccv_nnc_tensor_view_check_dim(a, gdim));
	assert(ccv_nnc_tensor_view_check_dim(h, gdim));
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	int gstride[CCV_NNC_MAX_DIM_ALLOC];
	int hstride[CCV_NNC_MAX_DIM_ALLOC];
	int scale_stride[CCV_NNC_MAX_DIM_ALLOC];
	int inv_std_stride[CCV_NNC_MAX_DIM_ALLOC];
	int dscale_stride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	ccv_nnc_tensor_view_get_stride(g, gstride);
	ccv_nnc_tensor_view_get_stride(h, hstride);
	ccv_nnc_tensor_view_get_stride(scale, scale_stride);
	ccv_nnc_tensor_view_get_stride(saved_inv_std, inv_std_stride);
	if (dscale)
		ccv_nnc_tensor_view_get_stride(dscale, dscale_stride);
	// Need to allocate three additional buffers:
	// 1. normalized a (ah);
	// 2. g * scale * inv_std (gss);
	// 3. ah * gss reduced to the saved_inv_std dimensions (ahgssr).
	assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC));
	int x;
	int n = 1;
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n *= gdim[x];
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n /= rdim[x];
	int gcount = 1, rcount = 1;
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		gcount *= gdim[x], rcount *= rdim[x];
	float* const ah = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * gcount * 2 + sizeof(float) * rcount, CCV_TENSOR_CPU_MEMORY);
	float* const gss = ah + gcount; // g * scale * inv_std
	float* const ahgssr = gss + gcount; // ah * gss then reduced to inv_std dimension.
	int i[CCV_NNC_MAX_DIM + 2];
	float* ahp = ah;
	const float* const inv_stdp = saved_inv_std->data.f32;
	const float* const ap = a->data.f32;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		const float* const ap0 = ap + i[0] * astride[0];
		const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			const float* ap1 = ap0 + i[1] * astride[1];
			const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
				if (rdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
						ahp[x] = ap1[x] * inv_stdp2[0];
				else
					for (x = 0; x < gdim[3]; x++)
						ahp[x] = ap1[x] * inv_stdp2[x];
				ap1 += astride[2];
				ahp += gdim[3];
			}
		}
	}
	if (dscale)
	{
		ccv_nnc_tensor_zero(dscale);
		ahp = ah;
		float* gssp = gss;
		const float* const gp = g->data.f32;
		const float* const scalep = scale->data.f32;
		float* const dscalep = dscale->data.f32;
		for (i[0] = 0; i[0] < gdim[0]; i[0]++)
		{
			const float* const gp0 = gp + i[0] * gstride[0];
			const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
			const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
			float* const dscalep0 = sdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_stride[0];
			for (i[1] = 0; i[1] < gdim[1]; i[1]++)
			{
				const float* gp1 = gp0 + i[1] * gstride[1];
				const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
				const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
				float* const dscalep1 = sdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_stride[1];
				for (i[2] = 0; i[2] < gdim[2]; i[2]++)
				{
					const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
					const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
					float* const dscalep2 = sdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_stride[2];
					if (sdim[3] == 1)
						for (x = 0; x < gdim[3]; x++)
						{
							gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
							dscalep2[0] += ahp[x] * gp1[x];
						}
					else
						for (x = 0; x < gdim[3]; x++)
						{
							gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
							dscalep2[x] += ahp[x] * gp1[x];
						}
					gp1 += gstride[2];
					ahp += gdim[3];
					gssp += gdim[3];
				}
			}
		}
	} else {
		float* gssp = gss;
		const float* const gp = g->data.f32;
		const float* const scalep = scale->data.f32;
		for (i[0] = 0; i[0] < gdim[0]; i[0]++)
		{
			const float* const gp0 = gp + i[0] * gstride[0];
			const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
			const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
			for (i[1] = 0; i[1] < gdim[1]; i[1]++)
			{
				const float* gp1 = gp0 + i[1] * gstride[1];
				const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
				const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
				for (i[2] = 0; i[2] < gdim[2]; i[2]++)
				{
					const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
					const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
					if (sdim[3] == 1)
						for (x = 0; x < gdim[3]; x++)
							gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
					else
						for (x = 0; x < gdim[3]; x++)
							gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
					gp1 += gstride[2];
					gssp += gdim[3];
				}
			}
		}
	}
	ahp = ah;
	float* gssp = gss;
	ccv_nnc_tensor_t ahgssrt = ccv_nnc_tensor(ahgssr, saved_inv_std->info, 0);
	ccv_nnc_tensor_zero(&ahgssrt);
	float* const ahgssrp = ahgssr;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
				if (rdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
						ahgssrp2[0] += ahp[x] * gssp[x];
				else
					for (x = 0; x < gdim[3]; x++)
						ahgssrp2[x] += ahp[x] * gssp[x];
				ahp += gdim[3];
				gssp += gdim[3];
			}
		}
	}
	// Now the part to compute dx (h).
	float* const hp = h->data.f32;
	ahp = ah;
	const float inv_n = 1. / n;
	gssp = gss;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		float* const hp0 = hp + i[0] * hstride[0];
		const float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			float* hp1 = hp0 + i[1] * hstride[1];
			const float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				const float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
				if (rdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
						hp1[x] = gssp[x] - inv_n * ahp[x] * ahgssrp2[0];
				else
					for (x = 0; x < gdim[3]; x++)
						hp1[x] = gssp[x] - inv_n * ahp[x] * ahgssrp2[x];
				hp1 += hstride[2];
				ahp += gdim[3];
				gssp += gdim[3];
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
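
// The backward pass in the same contiguous sketch form: with x_hat = x * inv_std
// and gss = g * scale * inv_std, dx = gss - x_hat * mean(x_hat * gss) and
// dscale = sum over rows of g * x_hat, where the mean runs over the normalized
// axis. The flat layout and this helper are illustrative assumptions only, not
// part of the backend above.
static void _rmsnorm_back_sketch(const float* const g, const float* const x, const float* const scale, const float* const inv_std, float* const dx, float* const dscale, const int rows, const int cols)
{
	int r, c;
	for (c = 0; c < cols; c++)
		dscale[c] = 0;
	for (r = 0; r < rows; r++)
	{
		const float* const gr = g + r * cols;
		const float* const xr = x + r * cols;
		float* const dxr = dx + r * cols;
		float sum = 0;
		for (c = 0; c < cols; c++)
		{
			const float x_hat = xr[c] * inv_std[r];
			const float gss = gr[c] * scale[c] * inv_std[r];
			dscale[c] += gr[c] * x_hat;
			sum += x_hat * gss;
			dxr[c] = gss; // Keep gss for now; the reduced term is subtracted below.
		}
		const float mean = sum / cols;
		for (c = 0; c < cols; c++)
			dxr[c] -= xr[c] * inv_std[r] * mean;
	}
}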

REGISTER_COMMAND_BACKEND(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_rmsnorm_forw;
}

REGISTER_COMMAND_BACKEND(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_rmsnorm_back;
}
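
// A hypothetical invocation of these backends through the generic command API,
// for orientation only. ccv_nnc_cmd_exec, ccv_nnc_no_hint, TENSOR_LIST and the
// tensor constructors are real nnc API; the CMD_RMSNORM_FORWARD convenience
// macro is assumed here to take the epsilon followed by the axes to normalize
// over, and the shapes are made up, so check the nnc headers and tests before
// relying on this. Kept under #if 0 because it is a sketch, not build code.
#if 0
static void _rmsnorm_usage_sketch(void)
{
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
	ccv_nnc_tensor_t* const scale = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
	// The saved inverse std only keeps the dimensions that are not normalized over.
	ccv_nnc_tensor_t* const saved_inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), 0);
	ccv_nnc_cmd_exec(CMD_RMSNORM_FORWARD(1e-6, 1, 2, 3), ccv_nnc_no_hint, 0, TENSOR_LIST(a, scale), TENSOR_LIST(b, saved_inv_std), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(scale);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(saved_inv_std);
}
#endif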