/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/loss/ccv_nnc_mse_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_mse_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 131 | { |
15 | 131 | assert(input_size == 2); |
16 | 131 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
17 | 131 | assert(ccv_nnc_tensor_nd(a->info.dim) <= 2); |
18 | 131 | const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[1]; |
19 | 131 | assert(output_size == 1); |
20 | 131 | ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0]; |
21 | 131 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
22 | 131 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
23 | 131 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
24 | 131 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
25 | 131 | ccv_nnc_tensor_view_get_dim(a, dim); |
26 | 131 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
27 | 131 | ccv_nnc_tensor_view_get_stride(a, astride); |
28 | 131 | ccv_nnc_tensor_view_get_stride(b, bstride); |
29 | 131 | ccv_nnc_tensor_view_get_stride(c, cstride); |
30 | 131 | assert(ccv_nnc_tensor_nd(a->info.dim) <= 2); |
31 | 131 | const int batch_size = dim[CCV_NNC_MAX_DIM]; |
32 | 131 | assert(ccv_nnc_tensor_count(c->info) == batch_size); |
33 | 131 | const int count = dim[CCV_NNC_MAX_DIM + 1]; |
34 | 131 | const int astep = astride[CCV_NNC_MAX_DIM]; |
35 | 131 | const int bstep = bstride[CCV_NNC_MAX_DIM]; |
36 | 131 | const int cstep = ccv_nnc_tensor_nd(c->info.dim) == 1 ? 1 : cstride[CCV_NNC_MAX_DIM]; |
37 | 131 | if (cmd.info.mse.reduce_op == CCV_NNC_MSE_REDUCE_MEAN) |
38 | 127 | { |
39 | 127 | const float inv_mean = 1.0 / (float)count; |
40 | 163 | parallel_for(i, batch_size) { |
41 | 163 | int j; |
42 | 163 | const float* const ap = a->data.f32 + i * astep; |
43 | 163 | const float* const bp = b->data.f32 + i * bstep; |
44 | 163 | float cp = 0; |
45 | 3.08k | for (j = 0; j < count; j++) |
46 | 2.92k | cp += (bp[j] - ap[j]) * (bp[j] - ap[j]); |
47 | 163 | c->data.f32[i * cstep] = cp * inv_mean; |
48 | 163 | } parallel_endfor |
49 | 127 | } else { |
50 | 4 | assert(cmd.info.mse.reduce_op == CCV_NNC_MSE_REDUCE_SUM); |
51 | 40 | parallel_for(i, batch_size) { |
52 | 40 | int j; |
53 | 40 | const float* const ap = a->data.f32 + i * astep; |
54 | 40 | const float* const bp = b->data.f32 + i * bstep; |
55 | 40 | float cp = 0; |
56 | 2.24k | for (j = 0; j < count; j++) |
57 | 2.20k | cp += (bp[j] - ap[j]) * (bp[j] - ap[j]); |
58 | 40 | c->data.f32[i * cstep] = cp; |
59 | 40 | } parallel_endfor |
60 | 4 | } |
61 | 131 | return CCV_NNC_EXEC_SUCCESS; |
62 | 131 | } |
63 | | |
64 | | static int _ccv_nnc_mse_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
65 | 127 | { |
66 | 127 | assert(input_size >= 3); |
67 | 127 | assert(output_size >= 1); |
68 | 127 | const ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
69 | 127 | assert(!g || !CCV_IS_TENSOR_VIEW(g)); |
70 | 127 | const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1]; |
71 | 127 | const ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
72 | 127 | ccv_nnc_tensor_view_t* const ha = (ccv_nnc_tensor_view_t*)outputs[0]; |
73 | 127 | ccv_nnc_tensor_view_t* const hb = output_size >= 2 ? (ccv_nnc_tensor_view_t*)outputs[1] : 0; |
74 | 127 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
75 | 127 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
76 | 127 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
77 | 127 | int hastride[CCV_NNC_MAX_DIM_ALLOC]; |
78 | 127 | int hbstride[CCV_NNC_MAX_DIM_ALLOC]; |
79 | 127 | ccv_nnc_tensor_view_get_dim(a, dim); |
80 | 127 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
81 | 127 | if (ha) |
82 | 127 | { assert(ccv_nnc_tensor_view_check_dim(ha, dim)); } |
83 | 127 | if (hb) |
84 | 4 | { assert(ccv_nnc_tensor_view_check_dim(hb, dim)); } |
85 | 127 | ccv_nnc_tensor_view_get_stride(a, astride); |
86 | 127 | ccv_nnc_tensor_view_get_stride(b, bstride); |
87 | 127 | if (ha) |
88 | 127 | ccv_nnc_tensor_view_get_stride(ha, hastride); |
89 | 127 | if (hb) |
90 | 4 | ccv_nnc_tensor_view_get_stride(hb, hbstride); |
91 | 127 | assert(ccv_nnc_tensor_nd(a->info.dim) <= 2); |
92 | 127 | const int batch_size = dim[CCV_NNC_MAX_DIM]; |
93 | 127 | const int count = dim[CCV_NNC_MAX_DIM + 1]; |
94 | 127 | const float inv_mean_2 = cmd.info.mse.reduce_op == CCV_NNC_MSE_REDUCE_MEAN ? 2.0 / (float)count : 2.0; |
95 | 127 | assert(cmd.info.mse.reduce_op == CCV_NNC_MSE_REDUCE_MEAN || cmd.info.mse.reduce_op == CCV_NNC_MSE_REDUCE_SUM); |
96 | 127 | const int astep = astride[CCV_NNC_MAX_DIM]; |
97 | 127 | const int bstep = bstride[CCV_NNC_MAX_DIM]; |
98 | 127 | const int hastep = hastride[CCV_NNC_MAX_DIM]; |
99 | 127 | const int hbstep = hbstride[CCV_NNC_MAX_DIM]; |
100 | 127 | if (g) |
101 | 127 | { |
102 | 127 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
103 | 127 | ccv_nnc_tensor_view_get_stride(g, gstride); |
104 | 127 | assert(ccv_nnc_tensor_count(g->info) == batch_size); |
105 | 127 | const int gstep = ccv_nnc_tensor_nd(g->info.dim) == 1 ? 1 : gstride[CCV_NNC_MAX_DIM]; |
106 | 127 | if (ha) |
107 | 127 | { |
108 | 163 | parallel_for(i, batch_size) { |
109 | 163 | int j; |
110 | 163 | const float* const ap = a->data.f32 + i * astep; |
111 | 163 | const float* const bp = b->data.f32 + i * bstep; |
112 | 163 | float* const hp = ha->data.f32 + i * hastep; |
113 | 163 | const float gp = inv_mean_2 * g->data.f32[i * gstep]; |
114 | 3.08k | for (j = 0; j < count; j++) |
115 | 2.92k | hp[j] = gp * (ap[j] - bp[j]); |
116 | 163 | } parallel_endfor |
117 | 127 | } |
118 | 127 | if (hb) |
119 | 4 | { |
120 | 40 | parallel_for(i, batch_size) { |
121 | 40 | int j; |
122 | 40 | const float* const ap = a->data.f32 + i * astep; |
123 | 40 | const float* const bp = b->data.f32 + i * bstep; |
124 | 40 | float* const hp = hb->data.f32 + i * hbstep; |
125 | 40 | const float gp = inv_mean_2 * g->data.f32[i * gstep]; |
126 | 2.24k | for (j = 0; j < count; j++) |
127 | 2.20k | hp[j] = gp * (bp[j] - ap[j]); |
128 | 40 | } parallel_endfor |
129 | 4 | } |
130 | 127 | } else { |
131 | 0 | if (ha) |
132 | 0 | { |
133 | 0 | parallel_for(i, batch_size) { |
134 | 0 | int j; |
135 | 0 | const float* const ap = a->data.f32 + i * astep; |
136 | 0 | const float* const bp = b->data.f32 + i * bstep; |
137 | 0 | float* const hp = ha->data.f32 + i * hastep; |
138 | 0 | for (j = 0; j < count; j++) |
139 | 0 | hp[j] = inv_mean_2 * (ap[j] - bp[j]); |
140 | 0 | } parallel_endfor |
141 | 0 | } |
142 | 0 | if (hb) |
143 | 0 | { |
144 | 0 | parallel_for(i, batch_size) { |
145 | 0 | int j; |
146 | 0 | const float* const ap = a->data.f32 + i * astep; |
147 | 0 | const float* const bp = b->data.f32 + i * bstep; |
148 | 0 | float* const hp = hb->data.f32 + i * hbstep; |
149 | 0 | for (j = 0; j < count; j++) |
150 | 0 | hp[j] = inv_mean_2 * (bp[j] - ap[j]); |
151 | 0 | } parallel_endfor |
152 | 0 | } |
153 | 0 | } |
154 | 127 | return CCV_NNC_EXEC_SUCCESS; |
155 | 127 | } |
156 | | |
157 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
158 | 1 | { |
159 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
160 | 1 | registry->tensor_datatypes = CCV_32F; |
161 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
162 | 1 | registry->algorithms = 1; |
163 | 1 | registry->exec = _ccv_nnc_mse_forw; |
164 | 1 | } |
165 | | |
166 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
167 | 1 | { |
168 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
169 | 1 | registry->tensor_datatypes = CCV_32F; |
170 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
171 | 1 | registry->algorithms = 1; |
172 | 1 | registry->exec = _ccv_nnc_mse_back; |
173 | 1 | } |
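
For reference, the forward kernel above (_ccv_nnc_mse_forw) reduces each batch row of the squared difference between the two inputs, dividing by the per-row element count when cmd.info.mse.reduce_op is CCV_NNC_MSE_REDUCE_MEAN. A minimal standalone sketch of that per-row reduction (the helper name mse_row and the flat float-pointer interface are illustrative, not part of the library API; the real kernel walks rows through the tensor-view strides astep and bstep):

/* Sketch: out = sum_j (b[j] - a[j])^2, divided by count for mean reduction. */
static float mse_row(const float* a, const float* b, const int count, const int mean)
{
	float sum = 0;
	int j;
	for (j = 0; j < count; j++)
		sum += (b[j] - a[j]) * (b[j] - a[j]);
	return mean ? sum / (float)count : sum;
}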
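
The backward kernel (_ccv_nnc_mse_back) applies the chain rule to that reduction: with an incoming gradient g for a row, the per-element gradient with respect to a is g * (2 / count) * (a[j] - b[j]) for mean reduction (g * 2 * (a[j] - b[j]) for sum reduction), and the gradient with respect to b is its negation; when no incoming gradient tensor is supplied, g is effectively 1. A sketch under the same illustrative naming (mse_row_grad is not a library function):

/* Sketch: scale = 2 / count for mean reduction, 2 for sum reduction;
 * ha and hb may each be null when that gradient is not requested. */
static void mse_row_grad(const float* a, const float* b, float* ha, float* hb,
	const int count, const float g, const int mean)
{
	const float scale = mean ? 2.0f / (float)count : 2.0f;
	const float gp = g * scale;
	int j;
	for (j = 0; j < count; j++)
	{
		if (ha)
			ha[j] = gp * (a[j] - b[j]);
		if (hb)
			hb[j] = gp * (b[j] - a[j]);
	}
}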