/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/gelu/ccv_nnc_gelu_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_gelu_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 211 | { |
15 | 211 | assert(input_size == 1); |
16 | 211 | const ccv_nnc_tensor_t* a = inputs[0]; |
17 | 211 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
18 | 211 | assert(output_size == 1); |
19 | 211 | ccv_nnc_tensor_t* b = outputs[0]; |
20 | 211 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
21 | 211 | const int count = ccv_nnc_tensor_count(a->info); |
22 | 211 | int i; |
23 | 430 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && a->info.dim[i] > 0; i++219 ) |
24 | 219 | { |
25 | 219 | assert(a->info.dim[i] == b->info.dim[i]); |
26 | 219 | } |
27 | 211 | float* ap = a->data.f32; |
28 | 211 | float* bp = b->data.f32; |
29 | 211 | if (cmd.info.gelu.tanh) |
30 | 2.41k | for (i = 0; 5 i < count; i++2.41k ) |
31 | 2.41k | { |
32 | 2.41k | const float x = ap[i]; |
33 | 2.41k | bp[i] = 0.5 * x * (1 + tanh(0.797884560802865355 * (x + 0.044715 * x * x * x))); |
34 | 2.41k | } |
35 | 206 | else |
36 | 4.62k | for (i = 0; 206 i < count; i++4.42k ) |
37 | 4.42k | { |
38 | 4.42k | const float x = ap[i]; |
39 | 4.42k | bp[i] = x * 0.5 * (1. + erf(x * 0.70710678118654752440)); |
40 | 4.42k | } |
41 | 211 | return CCV_NNC_EXEC_SUCCESS; |
42 | 211 | } |
43 | | |
44 | | static int _ccv_nnc_gelu_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
45 | 106 | { |
46 | 106 | assert(input_size >= 2); |
47 | 106 | const ccv_nnc_tensor_t* g = inputs[0]; // gradient |
48 | 106 | assert(CCV_IS_TENSOR_CONTIGUOUS(g)); |
49 | 106 | const ccv_nnc_tensor_t* a = inputs[1]; |
50 | 106 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
51 | 106 | assert(output_size == 1); |
52 | 106 | ccv_nnc_tensor_t* h = outputs[0]; |
53 | 106 | assert(CCV_IS_TENSOR_CONTIGUOUS(h)); |
54 | 106 | const int count = ccv_nnc_tensor_count(g->info); |
55 | 106 | int i; |
56 | 216 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && g->info.dim[i] > 0; i++110 ) |
57 | 110 | { |
58 | 110 | assert(a->info.dim[i] == g->info.dim[i]); |
59 | 110 | assert(g->info.dim[i] == h->info.dim[i]); |
60 | 110 | } |
61 | 106 | float* ap = a->data.f32; |
62 | 106 | float* gp = g->data.f32; |
63 | 106 | float* hp = h->data.f32; |
64 | 106 | if (cmd.info.gelu.tanh) |
65 | 3 | { |
66 | 2.01k | for (i = 0; i < count; i++2.01k ) |
67 | 2.01k | { |
68 | 2.01k | const float x = ap[i]; |
69 | 2.01k | const float x_sq = x * x; |
70 | 2.01k | const float x_cube = x_sq * x; |
71 | 2.01k | const float inner = 0.797884560802865355 * (x + 0.044715 * x_cube); |
72 | 2.01k | const float tanh_inner = tanh(inner); |
73 | 2.01k | const float left = 0.5 * x; |
74 | 2.01k | const float right = 1 + tanh_inner; |
75 | 2.01k | const float left_derivative = 0.5 * right; |
76 | 2.01k | const float tanh_derivative = 1 - tanh_inner * tanh_inner; |
77 | 2.01k | const float inner_derivative = 0.797884560802865355 * (1 + 3 * 0.044715 * x_sq); |
78 | 2.01k | const float right_derivative = left * tanh_derivative * inner_derivative; |
79 | 2.01k | hp[i] = gp[i] * (left_derivative + right_derivative); |
80 | 2.01k | } |
81 | 103 | } else { |
82 | 3.11k | for (i = 0; i < count; i++3.01k ) |
83 | 3.01k | { |
84 | 3.01k | const float x = ap[i]; |
85 | 3.01k | const float cdf = 0.5 * (1. + erf(x * 0.70710678118654752440)); |
86 | 3.01k | const float pdf = exp(-0.5 * x * x) * 0.797884560802865355; |
87 | 3.01k | hp[i] = gp[i] * (cdf + x * pdf); |
88 | 3.01k | } |
89 | 103 | } |
90 | 106 | return CCV_NNC_EXEC_SUCCESS; |
91 | 106 | } |
92 | | |
93 | | REGISTER_COMMAND_BACKEND(CCV_NNC_GELU_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
94 | 1 | { |
95 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
96 | 1 | registry->tensor_datatypes = CCV_32F; |
97 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
98 | 1 | registry->algorithms = 1; |
99 | 1 | registry->exec = _ccv_nnc_gelu_forw; |
100 | 1 | } |
101 | | |
102 | | REGISTER_COMMAND_BACKEND(CCV_NNC_GELU_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
103 | 1 | { |
104 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
105 | 1 | registry->tensor_datatypes = CCV_32F; |
106 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
107 | 1 | registry->algorithms = 1; |
108 | 1 | registry->exec = _ccv_nnc_gelu_back; |
109 | 1 | } |