/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/softmax/ccv_nnc_softmax_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_softmax_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 508 | { |
15 | 508 | assert(input_size == 1); |
16 | 508 | const ccv_nnc_tensor_t* a = inputs[0]; |
17 | 508 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
18 | 508 | assert(output_size == 1); |
19 | 508 | ccv_nnc_tensor_t* b = outputs[0]; |
20 | 508 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
21 | 508 | const int axis_count = ccv_nnc_tensor_nd(a->info.dim); |
22 | 508 | const int batch_size = axis_count < 2 ? 1498 : a->info.dim[0]10 ; |
23 | 508 | const int count = ccv_nnc_tensor_count(a->info) / batch_size; |
24 | 508 | int i; |
25 | 1.02k | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++520 ) |
26 | 520 | { assert(a->info.dim[i] == b->info.dim[i]); } |
27 | 98.9k | parallel_for508 (i, batch_size) { |
28 | 98.9k | int j; |
29 | 98.9k | float* const ap = a->data.f32 + i * count; |
30 | 98.9k | float* const bp = b->data.f32 + i * count; |
31 | 98.9k | double maxval = ap[0]; |
32 | 13.8M | for (j = 1; j < count; j++13.7M ) |
33 | 13.7M | if (ap[j] > maxval) |
34 | 442k | maxval = ap[j]; |
35 | 98.9k | double sumval = 0; |
36 | 13.9M | for (j = 0; j < count; j++13.8M ) |
37 | 13.8M | sumval += (bp[j] = expf(ap[j] - maxval)); |
38 | 98.9k | sumval = 1.0 / sumval; |
39 | 13.9M | for (j = 0; j < count; j++13.8M ) |
40 | 13.8M | bp[j] *= sumval; |
41 | 98.9k | } parallel_endfor |
42 | 508 | return CCV_NNC_EXEC_SUCCESS; |
43 | 508 | } |
44 | | |
45 | | static int _ccv_nnc_softmax_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
46 | 6 | { |
47 | 6 | assert(input_size == 3); |
48 | 6 | assert(output_size == 1); |
49 | 6 | const ccv_nnc_tensor_t* g = inputs[0]; |
50 | 6 | assert(CCV_IS_TENSOR_CONTIGUOUS(g)); |
51 | 6 | const ccv_nnc_tensor_t* b = inputs[2]; |
52 | 6 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
53 | 6 | ccv_nnc_tensor_t* h = outputs[0]; |
54 | 6 | assert(CCV_IS_TENSOR_CONTIGUOUS(h)); |
55 | 6 | const int axis_count = ccv_nnc_tensor_nd(g->info.dim); |
56 | 6 | const int batch_size = axis_count < 2 ? 11 : g->info.dim[0]5 ; |
57 | 6 | const int count = ccv_nnc_tensor_count(g->info) / batch_size; |
58 | 6 | int i; |
59 | 17 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && g->info.dim[i] > 0; i++11 ) |
60 | 11 | { assert(g->info.dim[i] == h->info.dim[i] && h->info.dim[i] == b->info.dim[i]); } |
61 | 32.7k | parallel_for6 (i, batch_size) { |
62 | 32.7k | int j; |
63 | 32.7k | float* const gp = g->data.f32 + i * count; |
64 | 32.7k | float* const bp = b->data.f32 + i * count; |
65 | 32.7k | float* const hp = h->data.f32 + i * count; |
66 | 32.7k | float sumval = 0; |
67 | 4.22M | for (j = 0; j < count; j++4.19M ) |
68 | 4.19M | sumval += gp[j] * bp[j]; |
69 | 4.22M | for (j = 0; j < count; j++4.19M ) |
70 | 4.19M | hp[j] = (gp[j] - sumval) * bp[j]; |
71 | 32.7k | } parallel_endfor |
72 | 6 | return CCV_NNC_EXEC_SUCCESS; |
73 | 6 | } |
74 | | |
75 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
76 | 1 | { |
77 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
78 | 1 | registry->tensor_datatypes = CCV_32F; |
79 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
80 | 1 | registry->algorithms = 1; |
81 | 1 | registry->exec = _ccv_nnc_softmax_forw; |
82 | 1 | } |
83 | | |
84 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SOFTMAX_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
85 | 1 | { |
86 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
87 | 1 | registry->tensor_datatypes = CCV_32F; |
88 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
89 | 1 | registry->algorithms = 1; |
90 | 1 | registry->exec = _ccv_nnc_softmax_back; |
91 | 1 | } |