/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/compression/ccv_nnc_lssc_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_lssc_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 6 | { |
15 | 6 | assert(output_size <= input_size); |
16 | 6 | int n; |
17 | 6 | ccv_float16_t a16[16]; |
18 | 6 | float a32[16]; |
19 | 6 | float bm[2]; |
20 | 12 | for (n = 0; n < output_size; n++6 ) |
21 | 6 | { |
22 | 6 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[n]; |
23 | 6 | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[n]; |
24 | 6 | int i[CCV_NNC_MAX_DIM]; |
25 | 6 | int j[CCV_NNC_MAX_DIM]; |
26 | 6 | int c, k; |
27 | 6 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
28 | 6 | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
29 | 6 | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim2 : a->info.dim + 14 ; |
30 | 6 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
31 | 6 | assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2); |
32 | 6 | const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim2 : b->info.dim + 14 ; |
33 | 6 | ccv_float16_t* ap = a->data.f16; |
34 | 6 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
35 | 6 | ccv_nnc_tensor_view_get_stride(a, astride); |
36 | 6 | ccv_float16_t* bp = b->data.f16; |
37 | 6 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
38 | 6 | ccv_nnc_tensor_view_get_stride(b, bstride); |
39 | 6 | const int nxc = ccv_nnc_tensor_get_n(a->info) * ccv_nnc_tensor_get_c(a->info); |
40 | 6 | assert(nxc == ccv_nnc_tensor_get_n(b->info) * ccv_nnc_tensor_get_c(b->info)); |
41 | 172k | for (k = 0; 6 k < nxc; k++172k ) |
42 | 172k | { |
43 | 1.03M | for (i[0] = 0; i[0] < bdim[1]; i[0]++860k ) |
44 | 860k | { |
45 | 860k | assert(bdim[CCV_NNC_MAX_DIM] % 4 == 0); |
46 | 860k | const int bw = bdim[CCV_NNC_MAX_DIM] / 4; |
47 | 10.7M | for (i[1] = 0; i[1] < bw; i[1]++9.89M ) |
48 | 9.89M | { |
49 | 9.89M | ccv_float16_t* apz = ap + i[0] * 4 * astride[CCV_NNC_MAX_DIM] + i[1] * 4; |
50 | 9.89M | const int h = ccv_min(i[0] * 4 + 4, adim[1]) - i[0] * 4; |
51 | 9.89M | const int w = ccv_min(i[1] * 4 + 4, adim[CCV_NNC_MAX_DIM]) - i[1] * 4; |
52 | 168M | for (c = 0; c < 16; c++158M ) |
53 | 158M | a16[c] = apz[0]; |
54 | 48.1M | for (j[0] = 0; j[0] < h; j[0]++38.2M ) |
55 | 186M | for (j[1] = 0; 38.2M j[1] < w; j[1]++148M ) |
56 | 148M | a16[j[0] * 4 + j[1]] = apz[j[0] * astride[CCV_NNC_MAX_DIM] + j[1]]; |
57 | 9.89M | ccv_half_precision_to_float((uint16_t*)a16, a32, 16); |
58 | 9.89M | float amax = a32[0]; |
59 | 9.89M | float amin = a32[0]; |
60 | 158M | for (c = 1; c < 16; c++148M ) |
61 | 148M | amax = ccv_max(a32[c], amax), amin = ccv_min(a32[c], amin); |
62 | 9.89M | bm[0] = amin; |
63 | 9.89M | bm[1] = amax; |
64 | 9.89M | ccv_float16_t* bpz = bp + i[0] * bstride[CCV_NNC_MAX_DIM] + i[1] * 4; |
65 | 9.89M | uint16_t* const bpz16 = (uint16_t*)bpz; |
66 | 9.89M | ccv_float_to_half_precision(bm, bpz16, 2); |
67 | 9.89M | const float abottom = amin * 7 / 6 - amax / 6; |
68 | 9.89M | const float ascale = 3 / ccv_max(amax - amin, 1e-6); |
69 | 9.89M | bpz16[2] = 0; |
70 | 89.0M | for (c = 0; c < 8; c++79.1M ) |
71 | 79.1M | bpz16[2] |= ((ccv_clamp((int)((a32[c] - abottom) * ascale), 0, 3)) << (c << 1)); |
72 | 9.89M | bpz16[3] = 0; |
73 | 89.0M | for (c = 0; c < 8; c++79.1M ) |
74 | 79.1M | bpz16[3] |= ((ccv_clamp((int)((a32[8 + c] - abottom) * ascale), 0, 3)) << (c << 1)); |
75 | 9.89M | } |
76 | 860k | } |
77 | 172k | bp += bstride[CCV_NNC_MAX_DIM - 1]; |
78 | 172k | ap += astride[CCV_NNC_MAX_DIM - 1]; |
79 | 172k | } |
80 | 6 | } |
81 | 6 | return CCV_NNC_EXEC_SUCCESS; |
82 | 6 | } |
83 | | |
84 | | static int _ccv_nnc_lssc_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
85 | 6 | { |
86 | 6 | assert(output_size <= input_size); |
87 | 6 | int n; |
88 | 6 | ccv_float16_t a16[16]; |
89 | 6 | float a32[16]; |
90 | 6 | float bm[4]; |
91 | 12 | for (n = 0; n < output_size; n++6 ) |
92 | 6 | { |
93 | 6 | const ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[n]; |
94 | 6 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[n]; |
95 | 6 | int i[CCV_NNC_MAX_DIM]; |
96 | 6 | int j[CCV_NNC_MAX_DIM]; |
97 | 6 | int c, k; |
98 | 6 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
99 | 6 | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
100 | 6 | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim2 : a->info.dim + 14 ; |
101 | 6 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
102 | 6 | assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2); |
103 | 6 | const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim2 : b->info.dim + 14 ; |
104 | 6 | ccv_float16_t* ap = a->data.f16; |
105 | 6 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
106 | 6 | ccv_nnc_tensor_view_get_stride(a, astride); |
107 | 6 | ccv_float16_t* bp = b->data.f16; |
108 | 6 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
109 | 6 | ccv_nnc_tensor_view_get_stride(b, bstride); |
110 | 6 | const int nxc = ccv_nnc_tensor_get_n(a->info) * ccv_nnc_tensor_get_c(a->info); |
111 | 6 | assert(nxc == ccv_nnc_tensor_get_n(b->info) * ccv_nnc_tensor_get_c(b->info)); |
112 | 172k | for (k = 0; 6 k < nxc; k++172k ) |
113 | 172k | { |
114 | 1.03M | for (i[0] = 0; i[0] < bdim[1]; i[0]++860k ) |
115 | 860k | { |
116 | 860k | assert(bdim[CCV_NNC_MAX_DIM] % 4 == 0); |
117 | 860k | const int bw = bdim[CCV_NNC_MAX_DIM] / 4; |
118 | 10.7M | for (i[1] = 0; i[1] < bw; i[1]++9.89M ) |
119 | 9.89M | { |
120 | 9.89M | ccv_float16_t* bpz = bp + i[0] * bstride[CCV_NNC_MAX_DIM] + i[1] * 4; |
121 | 9.89M | uint16_t* const bpz16 = (uint16_t*)bpz; |
122 | 9.89M | ccv_half_precision_to_float(bpz16, bm, 2); |
123 | 9.89M | bm[3] = bm[1]; |
124 | 9.89M | bm[1] = bm[3] / 3 + bm[0] * 2 / 3; |
125 | 9.89M | bm[2] = bm[3] * 2 / 3 + bm[0] / 3; |
126 | 89.0M | for (c = 0; c < 8; c++79.1M ) |
127 | 79.1M | a32[c] = bm[((bpz16[2] >> (c << 1)) & 3)]; |
128 | 89.0M | for (c = 0; c < 8; c++79.1M ) |
129 | 79.1M | a32[8 + c] = bm[((bpz16[3] >> (c << 1)) & 3)]; |
130 | 9.89M | ccv_float_to_half_precision(a32, (uint16_t*)a16, 16); |
131 | 9.89M | ccv_float16_t* apz = ap + i[0] * 4 * astride[CCV_NNC_MAX_DIM] + i[1] * 4; |
132 | 9.89M | const int h = ccv_min(i[0] * 4 + 4, adim[1]) - i[0] * 4; |
133 | 9.89M | const int w = ccv_min(i[1] * 4 + 4, adim[CCV_NNC_MAX_DIM]) - i[1] * 4; |
134 | 48.1M | for (j[0] = 0; j[0] < h; j[0]++38.2M ) |
135 | 186M | for (j[1] = 0; 38.2M j[1] < w; j[1]++148M ) |
136 | 148M | apz[j[0] * astride[CCV_NNC_MAX_DIM] + j[1]] = a16[j[0] * 4 + j[1]]; |
137 | 9.89M | } |
138 | 860k | } |
139 | 172k | bp += bstride[CCV_NNC_MAX_DIM - 1]; |
140 | 172k | ap += astride[CCV_NNC_MAX_DIM - 1]; |
141 | 172k | } |
142 | 6 | } |
143 | 6 | return CCV_NNC_EXEC_SUCCESS; |
144 | 0 | return CCV_NNC_EXEC_SUCCESS; |
145 | 6 | } |
146 | | |
147 | | REGISTER_COMMAND_BACKEND(CCV_NNC_COMPRESSION_LSSC_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
148 | 1 | { |
149 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW; |
150 | 1 | registry->tensor_datatypes = CCV_16F; |
151 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
152 | 1 | registry->algorithms = 1; |
153 | 1 | registry->exec = _ccv_nnc_lssc_forw; |
154 | 1 | } |
155 | | |
156 | | REGISTER_COMMAND_BACKEND(CCV_NNC_COMPRESSION_LSSC_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
157 | 1 | { |
158 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW; |
159 | 1 | registry->tensor_datatypes = CCV_16F; |
160 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
161 | 1 | registry->algorithms = 1; |
162 | 1 | registry->exec = _ccv_nnc_lssc_back; |
163 | 1 | } |