/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/loss/ccv_nnc_smooth_l1_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_smooth_l1_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 4 | { |
15 | 4 | assert(input_size == 2); |
16 | 4 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
17 | 4 | assert(ccv_nnc_tensor_nd(a->info.dim) <= 2); |
18 | 4 | const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[1]; |
19 | 4 | assert(output_size == 1); |
20 | 4 | ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0]; |
21 | 4 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
22 | 4 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
23 | 4 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
24 | 4 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
25 | 4 | ccv_nnc_tensor_view_get_dim(a, dim); |
26 | 4 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
27 | 4 | ccv_nnc_tensor_view_get_stride(a, astride); |
28 | 4 | ccv_nnc_tensor_view_get_stride(b, bstride); |
29 | 4 | ccv_nnc_tensor_view_get_stride(c, cstride); |
30 | 4 | assert(ccv_nnc_tensor_nd(a->info.dim) <= 2); |
31 | 4 | const int batch_size = dim[CCV_NNC_MAX_DIM]; |
32 | 4 | assert(ccv_nnc_tensor_count(c->info) == batch_size); |
33 | 4 | const int count = dim[CCV_NNC_MAX_DIM + 1]; |
34 | 4 | const int astep = astride[CCV_NNC_MAX_DIM]; |
35 | 4 | const int bstep = bstride[CCV_NNC_MAX_DIM]; |
36 | 4 | const int cstep = ccv_nnc_tensor_nd(c->info.dim) == 1 ? 1 : cstride[CCV_NNC_MAX_DIM]; |
37 | 4 | const float beta = cmd.info.smooth_l1.beta; |
38 | 4 | const float beta_inv_2 = 0.5 / beta; |
39 | 4 | const float beta_2 = 0.5 * beta; |
40 | 32 | parallel_for(i, batch_size) { |
41 | 32 | int j; |
42 | 32 | const float* const ap = a->data.f32 + i * astep; |
43 | 32 | const float* const bp = b->data.f32 + i * bstep; |
44 | 32 | float cp = 0; |
45 | 3.03k | for (j = 0; j < count; j++) |
46 | 3.00k | cp += fabs(bp[j] - ap[j]); |
47 | 32 | if (cp < beta) |
48 | 31 | { |
49 | 31 | cp = 0; |
50 | 3.03k | for (j = 0; j < count; j++) |
51 | 3.00k | cp += (bp[j] - ap[j]) * (bp[j] - ap[j]); |
52 | 31 | cp *= beta_inv_2; |
53 | 31 | } else |
54 | 1 | cp -= beta_2; |
55 | 32 | c->data.f32[i * cstep] = cp; |
56 | 32 | } parallel_endfor |
57 | 4 | return CCV_NNC_EXEC_SUCCESS; |
58 | 4 | } |
59 | | |
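Note (annotation, not part of the source file): the forward kernel reduces each batch row to a single scalar. With $d_i = \sum_j |b_{ij} - a_{ij}|$ the row-level L1 distance and $\beta$ = cmd.info.smooth_l1.beta, the value stored in c is

$$c_i = \begin{cases} \dfrac{1}{2\beta} \sum_j (b_{ij} - a_{ij})^2 & \text{if } d_i < \beta \\ d_i - \dfrac{\beta}{2} & \text{otherwise} \end{cases}$$

i.e. the Huber-style switch is applied to the whole-row L1 sum, not elementwise.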
60 | | static int _ccv_nnc_smooth_l1_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
61 | 3 | { |
62 | 3 | assert(input_size >= 3); |
63 | 3 | assert(output_size >= 1); |
64 | 3 | const ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
65 | 3 | assert(!g || !CCV_IS_TENSOR_VIEW(g)); |
66 | 3 | const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1]; |
67 | 3 | const ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
68 | 3 | const ccv_nnc_tensor_view_t* const c = (ccv_nnc_tensor_view_t*)inputs[3]; |
69 | 3 | ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0]; |
70 | 3 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
71 | 3 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
72 | 3 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
73 | 3 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
74 | 3 | int hstride[CCV_NNC_MAX_DIM_ALLOC]; |
75 | 3 | ccv_nnc_tensor_view_get_dim(a, dim); |
76 | 3 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
77 | 3 | assert(ccv_nnc_tensor_view_check_dim(h, dim)); |
78 | 3 | ccv_nnc_tensor_view_get_stride(a, astride); |
79 | 3 | ccv_nnc_tensor_view_get_stride(b, bstride); |
80 | 3 | ccv_nnc_tensor_view_get_stride(c, cstride); |
81 | 3 | ccv_nnc_tensor_view_get_stride(h, hstride); |
82 | 3 | assert(ccv_nnc_tensor_nd(a->info.dim) <= 2); |
83 | 3 | const int batch_size = dim[CCV_NNC_MAX_DIM]; |
84 | 3 | assert(ccv_nnc_tensor_count(c->info) == batch_size); |
85 | 3 | const int count = dim[CCV_NNC_MAX_DIM + 1]; |
86 | 3 | const int astep = astride[CCV_NNC_MAX_DIM]; |
87 | 3 | const int bstep = bstride[CCV_NNC_MAX_DIM]; |
88 | 3 | const int hstep = hstride[CCV_NNC_MAX_DIM]; |
89 | 3 | const int cstep = ccv_nnc_tensor_nd(c->info.dim) == 1 ? 1 : cstride[CCV_NNC_MAX_DIM]; |
90 | 3 | const float beta = cmd.info.smooth_l1.beta; |
91 | 3 | const float beta_2 = 0.5 * beta; |
92 | 3 | const float inv_beta = 1.0 / beta; |
93 | 3 | if (g) |
94 | 2 | { |
95 | 2 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
96 | 2 | ccv_nnc_tensor_view_get_stride(g, gstride); |
97 | 2 | assert(ccv_nnc_tensor_count(g->info) == batch_size); |
98 | 2 | const int gstep = ccv_nnc_tensor_nd(g->info.dim) == 1 ? 1 : gstride[CCV_NNC_MAX_DIM]; |
99 | 12 | parallel_for(i, batch_size) { |
100 | 12 | int j; |
101 | 12 | const float cp = c->data.f32[i * cstep]; |
102 | 12 | const float* const ap = a->data.f32 + i * astep; |
103 | 12 | const float* const bp = b->data.f32 + i * bstep; |
104 | 12 | float* const hp = h->data.f32 + i * hstep; |
105 | 12 | if (cp < beta_2) |
106 | 11 | { |
107 | 11 | const float gp = inv_beta * g->data.f32[i * gstep]; |
108 | 1.01k | for (j = 0; j < count; j++) |
109 | 1.00k | hp[j] = gp * (ap[j] - bp[j]); |
110 | 11 | } else { |
111 | 1 | const float gp = g->data.f32[i * gstep]; |
112 | 4 | for (j = 0; j < count; j++) |
113 | 3 | hp[j] = ((ap[j] - bp[j]) > 0 ? 1 : -1) * gp; |
114 | 1 | } |
115 | 12 | } parallel_endfor |
116 | 2 | } else { |
117 | 10 | parallel_for(i, batch_size) { |
118 | 10 | int j; |
119 | 10 | const float cp = c->data.f32[i * cstep]; |
120 | 10 | const float* const ap = a->data.f32 + i * astep; |
121 | 10 | const float* const bp = b->data.f32 + i * bstep; |
122 | 10 | float* const hp = h->data.f32 + i * hstep; |
123 | 10 | if (cp < beta_2) |
124 | 1.01k | for (j = 0; j < count; j++) |
125 | 1.00k | hp[j] = inv_beta * (ap[j] - bp[j]); |
126 | 0 | else |
127 | 0 | for (j = 0; j < count; j++) |
128 | 0 | hp[j] = (ap[j] - bp[j]) > 0 ? 1 : -1; |
129 | 10 | } parallel_endfor |
130 | 1 | } |
131 | 3 | return CCV_NNC_EXEC_SUCCESS; |
132 | 3 | } |
133 | | |
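Note (annotation, not part of the source file): the backward kernel recovers the forward branch from c alone. In the quadratic region $c_i = \frac{1}{2\beta}\sum_j (b_{ij}-a_{ij})^2 \le \frac{d_i^2}{2\beta} < \frac{\beta}{2}$, while in the linear region $c_i = d_i - \frac{\beta}{2} \ge \frac{\beta}{2}$, so testing cp < beta_2 is equivalent to $d_i < \beta$. The gradient written into h is

$$\frac{\partial c_i}{\partial a_{ij}} = \begin{cases} \dfrac{1}{\beta}\,(a_{ij} - b_{ij}) & \text{if } c_i < \beta/2 \\ \operatorname{sign}(a_{ij} - b_{ij}) & \text{otherwise} \end{cases}$$

multiplied by the incoming gradient g[i] when g is supplied; a NULL g is treated as an all-ones gradient.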
134 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SMOOTH_L1_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
135 | 1 | { |
136 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
137 | 1 | registry->tensor_datatypes = CCV_32F; |
138 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
139 | 1 | registry->algorithms = 1; |
140 | 1 | registry->exec = _ccv_nnc_smooth_l1_forw; |
141 | 1 | } |
142 | | |
143 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SMOOTH_L1_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
144 | 1 | { |
145 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
146 | 1 | registry->tensor_datatypes = CCV_32F; |
147 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
148 | 1 | registry->algorithms = 1; |
149 | 1 | registry->exec = _ccv_nnc_smooth_l1_back; |
150 | 1 | } |
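
For reference, a minimal standalone sketch of the same per-row reduction the forward reference kernel performs, assuming a contiguous batch_size x count float layout (the function name smooth_l1_rows and the example values are illustrative only, not part of ccv):

#include <math.h>
#include <stdio.h>

static void smooth_l1_rows(const float* a, const float* b, float* c, const int batch_size, const int count, const float beta)
{
	int i, j;
	for (i = 0; i < batch_size; i++)
	{
		const float* const ap = a + i * count;
		const float* const bp = b + i * count;
		float cp = 0;
		for (j = 0; j < count; j++)
			cp += fabsf(bp[j] - ap[j]); /* row-level L1 distance */
		if (cp < beta)
		{
			/* quadratic region: 1 / (2 * beta) * sum of squared differences */
			cp = 0;
			for (j = 0; j < count; j++)
				cp += (bp[j] - ap[j]) * (bp[j] - ap[j]);
			cp *= 0.5f / beta;
		} else
			cp -= 0.5f * beta; /* linear region */
		c[i] = cp;
	}
}

int main(void)
{
	const float a[2][3] = { { 0.1f, 0.2f, 0.3f }, { 1.0f, 2.0f, 3.0f } };
	const float b[2][3] = { { 0.2f, 0.1f, 0.3f }, { 0.0f, 0.0f, 0.0f } };
	float c[2];
	smooth_l1_rows(&a[0][0], &b[0][0], c, 2, 3, 1.0f);
	printf("%g %g\n", c[0], c[1]); /* 0.01 (quadratic region) and 5.5 (linear region) */
	return 0;
}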