1 | // This file is part of OpenCV project. |
2 | // It is subject to the license terms in the LICENSE file found in the top-level directory |
3 | // of this distribution and at http://opencv.org/license.html |
4 | |
5 | |
/* Universal intrinsics implementations of sin, cos, exp, log and erf

   The sin, cos, exp and log kernels are inspired by the Intel Approximate
   Math library and based on the corresponding algorithms of the cephes math
   library; erf is adapted from PyTorch (see the note above v_erf_default_32f).
*/
11 | |
12 | /* Copyright (C) 2010,2011 RJVB - extensions */ |
13 | /* Copyright (C) 2011 Julien Pommier |
14 | |
15 | This software is provided 'as-is', without any express or implied |
16 | warranty. In no event will the authors be held liable for any damages |
17 | arising from the use of this software. |
18 | |
19 | Permission is granted to anyone to use this software for any purpose, |
20 | including commercial applications, and to alter it and redistribute it |
21 | freely, subject to the following restrictions: |
22 | |
23 | 1. The origin of this software must not be misrepresented; you must not |
24 | claim that you wrote the original software. If you use this software |
25 | in a product, an acknowledgment in the product documentation would be |
26 | appreciated but is not required. |
27 | 2. Altered source versions must be plainly marked as such, and must not be |
28 | misrepresented as being the original software. |
29 | 3. This notice may not be removed or altered from any source distribution. |
30 | |
31 | (this is the zlib license) |
32 | */ |
33 | #ifndef OPENCV_HAL_INTRIN_MATH_HPP |
34 | #define OPENCV_HAL_INTRIN_MATH_HPP |
35 | |
36 | //! @name Exponential |
37 | //! @{ |
// The half-precision implementation mirrors the float32 version below; only the
// clamp bounds and the fp16 layout (exponent bias 15, mantissa shift 10) differ.
39 | template<typename _TpVec16F, typename _TpVec16S> |
40 | inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) { |
41 | const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f); |
42 | const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f); |
    const _TpVec16F _vexp_half_f16 = v_setall_<_TpVec16F>(0.5f);
    const _TpVec16F _vexp_one_f16 = v_setall_<_TpVec16F>(1.f);
45 | const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f); |
46 | const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f); |
47 | const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f); |
48 | const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f); |
49 | const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f); |
50 | const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f); |
51 | const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f); |
52 | const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f); |
53 | const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f); |
54 | |
55 | _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx; |
56 | _TpVec16S _vexp_mm; |
    const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf); // fp16 exponent bias (15)
58 | |
    // clamp x to the range where fp16 exp() neither overflows nor underflows
60 | _vexp_x = v_max(x, _vexp_lo_f16); |
61 | _vexp_x = v_min(_vexp_x, _vexp_hi_f16); |
62 | |
    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_f16);
64 | _vexp_mm = v_floor(_vexp_); |
65 | _vexp_ = v_cvt_f16(_vexp_mm); |
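    // build 2^n: (n + 15) is shifted into the five fp16 exponent bits (10..14)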
66 | _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16); |
67 | _vexp_mm = v_shl(_vexp_mm, 10); |
68 | |
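    // r = x - n*ln(2): C1 and C2 sum to -ln(2) (a Cody-Waite split), so the
    // subtraction is performed in two steps to preserve precision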
69 | _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x); |
70 | _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x); |
71 | _vexp_xx = v_mul(_vexp_x, _vexp_x); |
72 | |
73 | _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16); |
74 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16); |
75 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16); |
76 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16); |
77 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16); |
78 | |
79 | _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); |
    _vexp_y = v_add(_vexp_y, _vexp_one_f16);
81 | _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm)); |
82 | |
    // exp(NaN) -> NaN: the clamp above may have replaced NaN lanes, restore them
84 | _TpVec16F mask_not_nan = v_not_nan(x); |
85 | return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); |
86 | } |
87 | |
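// Single-precision variant: exponent bias 127, mantissa shift 23.
//
// Hypothetical usage sketch, assuming the v_float32/v_int32 typedefs and the
// vx_load/v_store helpers from the wider universal intrinsics API (intrin.hpp)
// are in scope:
//   v_float32 vx = vx_load(src);                               // one register of floats
//   v_float32 vy = v_exp_default_32f<v_float32, v_int32>(vx);  // lane-wise exp
//   v_store(dst, vy);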
88 | template<typename _TpVec32F, typename _TpVec32S> |
89 | inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) { |
90 | const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f); |
91 | const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f); |
    const _TpVec32F _vexp_half_f32 = v_setall_<_TpVec32F>(0.5f);
    const _TpVec32F _vexp_one_f32 = v_setall_<_TpVec32F>(1.f);
94 | const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f); |
95 | const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f); |
96 | const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f); |
97 | const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f); |
98 | const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f); |
99 | const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f); |
100 | const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f); |
101 | const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f); |
102 | const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f); |
103 | |
104 | _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx; |
105 | _TpVec32S _vexp_mm; |
    const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f); // fp32 exponent bias (127)
107 | |
    // clamp x to the range where fp32 exp() neither overflows nor underflows
109 | _vexp_x = v_max(x, _vexp_lo_f32); |
110 | _vexp_x = v_min(_vexp_x, _vexp_hi_f32); |
111 | |
    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_f32);
113 | _vexp_mm = v_floor(_vexp_); |
114 | _vexp_ = v_cvt_f32(_vexp_mm); |
115 | _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32); |
116 | _vexp_mm = v_shl(_vexp_mm, 23); |
117 | |
118 | _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x); |
119 | _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x); |
120 | _vexp_xx = v_mul(_vexp_x, _vexp_x); |
121 | |
122 | _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32); |
123 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32); |
124 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32); |
125 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32); |
126 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32); |
127 | |
128 | _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); |
    _vexp_y = v_add(_vexp_y, _vexp_one_f32);
130 | _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); |
131 | |
    // exp(NaN) -> NaN: the clamp above may have replaced NaN lanes, restore them
133 | _TpVec32F mask_not_nan = v_not_nan(x); |
134 | return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); |
135 | } |
136 | |
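// Double-precision variant. Instead of a single polynomial it uses the cephes
// rational form e^r = 1 + 2*r*P(r^2) / (Q(r^2) - r*P(r^2)), which reaches
// double accuracy with fewer coefficients.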
137 | template<typename _TpVec64F, typename _TpVec64S> |
138 | inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) { |
139 | const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428); |
140 | const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.); |
141 | const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5); |
142 | const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0); |
143 | const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0); |
144 | const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736); |
145 | const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1); |
146 | const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6); |
147 | const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4); |
148 | const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2); |
149 | const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1); |
150 | const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6); |
151 | const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3); |
152 | const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1); |
153 | const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0); |
154 | |
155 | _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx; |
156 | _TpVec64S _vexp_mm; |
    const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff); // fp64 exponent bias (1023)
158 | |
    // clamp x to the range where fp64 exp() neither overflows nor underflows
160 | _vexp_x = v_max(x, _vexp_lo_f64); |
161 | _vexp_x = v_min(_vexp_x, _vexp_hi_f64); |
162 | |
163 | _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64); |
164 | _vexp_mm = v_expand_low(v_floor(_vexp_)); |
165 | _vexp_ = v_cvt_f64(_vexp_mm); |
166 | _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64); |
167 | _vexp_mm = v_shl(_vexp_mm, 52); |
168 | |
169 | _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x); |
170 | _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x); |
171 | _vexp_xx = v_mul(_vexp_x, _vexp_x); |
172 | |
173 | _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64); |
174 | _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64); |
175 | _vexp_y = v_mul(_vexp_y, _vexp_x); |
176 | |
177 | _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64); |
178 | _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64); |
179 | _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64); |
180 | |
181 | _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y)); |
182 | _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64); |
183 | _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm)); |
184 | |
    // exp(NaN) -> NaN: the clamp above may have replaced NaN lanes, restore them
186 | _TpVec64F mask_not_nan = v_not_nan(x); |
187 | return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000))); |
188 | } |
189 | //! @} |
190 | |
191 | //! @name Natural Logarithm |
192 | //! @{ |
193 | template<typename _TpVec16F, typename _TpVec16S> |
194 | inline _TpVec16F v_log_default_16f(const _TpVec16F &x) { |
195 | const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f); |
196 | const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f); |
197 | const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f); |
198 | const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f); |
199 | const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f); |
200 | const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f); |
201 | const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f); |
202 | const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f); |
203 | const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f); |
204 | const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f); |
205 | const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f); |
206 | const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f); |
207 | const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f); |
208 | |
209 | _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; |
210 | _TpVec16S _vlog_ux, _vlog_emm0; |
    const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00); // clears the fp16 exponent bits
212 | |
213 | _vlog_ux = v_reinterpret_as_s16(x); |
    _vlog_emm0 = v_shr(_vlog_ux, 10); // extract the biased exponent
215 | |
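    // clear the exponent bits and substitute those of 0.5: _vlog_x becomes
    // the mantissa scaled into [0.5, 1)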
216 | _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16); |
217 | _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f))); |
218 | _vlog_x = v_reinterpret_as_f16(_vlog_ux); |
219 | |
220 | _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf)); |
221 | _vlog_e = v_cvt_f16(_vlog_emm0); |
222 | |
223 | _vlog_e = v_add(_vlog_e, _vlog_one_fp16); |
224 | |
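    // if the mantissa is below sqrt(0.5), use 2*m - 1 instead of m - 1 and
    // decrement the exponent; this keeps the polynomial argument close to 0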
225 | _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16); |
226 | _vlog_tmp = v_and(_vlog_x, _vlog_mask); |
227 | _vlog_x = v_sub(_vlog_x, _vlog_one_fp16); |
228 | _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask)); |
229 | _vlog_x = v_add(_vlog_x, _vlog_tmp); |
230 | |
231 | _vlog_z = v_mul(_vlog_x, _vlog_x); |
232 | |
233 | _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16); |
234 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16); |
235 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16); |
236 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16); |
237 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16); |
238 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16); |
239 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16); |
240 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16); |
241 | _vlog_y = v_mul(_vlog_y, _vlog_x); |
242 | _vlog_y = v_mul(_vlog_y, _vlog_z); |
243 | |
244 | _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y); |
245 | |
246 | _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f))); |
247 | |
248 | _vlog_x = v_add(_vlog_x, _vlog_y); |
249 | _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x); |
250 | // log(0) -> -INF |
251 | _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>()); |
252 | _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x); |
253 | // log(NEG), log(NAN) -> NAN |
254 | _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>()); |
255 | _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); |
256 | // log(INF) -> INF |
257 | _TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); |
258 | _vlog_x = v_select(mask_inf, x, _vlog_x); |
259 | return _vlog_x; |
260 | } |
261 | |
262 | template<typename _TpVec32F, typename _TpVec32S> |
263 | inline _TpVec32F v_log_default_32f(const _TpVec32F &x) { |
264 | const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f); |
265 | const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f); |
266 | const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f); |
267 | const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f); |
268 | const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f); |
269 | const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f); |
270 | const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f); |
271 | const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f); |
272 | const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f); |
273 | const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f); |
274 | const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f); |
275 | const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f); |
276 | const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f); |
277 | |
278 | _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; |
279 | _TpVec32S _vlog_ux, _vlog_emm0; |
    const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000); // clears the fp32 exponent bits
281 | |
282 | _vlog_ux = v_reinterpret_as_s32(x); |
    _vlog_emm0 = v_shr(_vlog_ux, 23); // extract the biased exponent
284 | |
285 | _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32); |
286 | _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f))); |
287 | _vlog_x = v_reinterpret_as_f32(_vlog_ux); |
288 | |
289 | _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f)); |
290 | _vlog_e = v_cvt_f32(_vlog_emm0); |
291 | |
292 | _vlog_e = v_add(_vlog_e, _vlog_one_fp32); |
293 | |
294 | _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32); |
295 | _vlog_tmp = v_and(_vlog_x, _vlog_mask); |
296 | _vlog_x = v_sub(_vlog_x, _vlog_one_fp32); |
297 | _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask)); |
298 | _vlog_x = v_add(_vlog_x, _vlog_tmp); |
299 | |
300 | _vlog_z = v_mul(_vlog_x, _vlog_x); |
301 | |
302 | _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32); |
303 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32); |
304 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32); |
305 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32); |
306 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32); |
307 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32); |
308 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32); |
309 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32); |
310 | _vlog_y = v_mul(_vlog_y, _vlog_x); |
311 | _vlog_y = v_mul(_vlog_y, _vlog_z); |
312 | |
313 | _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y); |
314 | |
315 | _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f))); |
316 | |
317 | _vlog_x = v_add(_vlog_x, _vlog_y); |
318 | _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x); |
319 | // log(0) -> -INF |
320 | _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>()); |
321 | _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x); |
322 | // log(NEG), log(NAN) -> NAN |
323 | _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>()); |
324 | _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); |
325 | // log(INF) -> INF |
326 | _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); |
327 | _vlog_x = v_select(mask_inf, x, _vlog_x); |
328 | return _vlog_x; |
329 | } |
330 | |
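// Double-precision variant: the series in (m - 1) is replaced by the cephes
// rational form (m - 1)^3 * P(m - 1) / Q(m - 1), and e*ln(2) is added back
// via the split constants C1 and -C0.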
331 | template<typename _TpVec64F, typename _TpVec64S> |
332 | inline _TpVec64F v_log_default_64f(const _TpVec64F &x) { |
333 | const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0); |
334 | const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244); |
335 | const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4); |
336 | const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1); |
337 | const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854); |
338 | const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1); |
339 | const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1); |
340 | const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666); |
341 | const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1); |
342 | const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1); |
343 | const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1); |
344 | const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1); |
345 | const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1); |
346 | |
347 | const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4); |
348 | const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375); |
349 | |
350 | _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx; |
351 | _TpVec64S _vlog_ux, _vlog_emm0; |
    const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000); // clears the fp64 exponent bits
353 | |
354 | _vlog_ux = v_reinterpret_as_s64(x); |
    _vlog_emm0 = v_shr(_vlog_ux, 52); // extract the biased exponent
356 | |
357 | _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64); |
358 | _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5))); |
359 | _vlog_x = v_reinterpret_as_f64(_vlog_ux); |
360 | |
361 | _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff)); |
362 | _vlog_e = v_cvt_f64(_vlog_emm0); |
363 | |
364 | _vlog_e = v_add(_vlog_e, _vlog_one_fp64); |
365 | |
366 | _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64); |
367 | _vlog_tmp = v_and(_vlog_x, _vlog_mask); |
368 | _vlog_x = v_sub(_vlog_x, _vlog_one_fp64); |
369 | _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask)); |
370 | _vlog_x = v_add(_vlog_x, _vlog_tmp); |
371 | |
372 | _vlog_xx = v_mul(_vlog_x, _vlog_x); |
373 | |
374 | _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64); |
375 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64); |
376 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64); |
377 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64); |
378 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64); |
379 | _vlog_y = v_mul(_vlog_y, _vlog_x); |
380 | _vlog_y = v_mul(_vlog_y, _vlog_xx); |
381 | |
382 | _vlog_z = v_add(_vlog_x, _vlog_q0_fp64); |
383 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64); |
384 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64); |
385 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64); |
386 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64); |
387 | |
388 | _vlog_z = v_div(_vlog_y, _vlog_z); |
389 | _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64)); |
390 | _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5))); |
391 | |
392 | _vlog_z = v_add(_vlog_z, _vlog_x); |
393 | _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z); |
394 | |
395 | // log(0) -> -INF |
396 | _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>()); |
397 | _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z); |
398 | // log(NEG), log(NAN) -> NAN |
399 | _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>()); |
400 | _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000))); |
401 | // log(INF) -> INF |
402 | _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); |
403 | _vlog_z = v_select(mask_inf, x, _vlog_z); |
404 | return _vlog_z; |
405 | } |
406 | //! @} |
407 | |
408 | //! @name Sine and Cosine |
409 | //! @{ |
410 | template<typename _TpVec16F, typename _TpVec16S> |
411 | inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) { |
412 | const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI |
413 | const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f)); |
414 | const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f)); |
415 | const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f)); |
416 | const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f)); |
417 | const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f)); |
418 | const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f)); |
419 | const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f)); |
420 | const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f)); |
421 | const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f)); |
422 | const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)); |
423 | const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f)); |
424 | |
425 | _TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos; |
426 | _TpVec16S emm2; |
427 | |
428 | sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>()); |
429 | _vx = v_abs(x); |
430 | _vy = v_mul(_vx, v_cephes_FOPI); |
431 | |
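    // octant index, rounded up to an even value: j = (trunc(y) + 1) & ~1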
432 | emm2 = v_trunc(_vy); |
433 | emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1)); |
434 | emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1)); |
435 | _vy = v_cvt_f16(emm2); |
436 | |
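    // lanes whose octant is 0 (mod 4) take the sine polynomial for ysin and
    // the cosine polynomial for ycos; the other lanes swap the two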
437 | _TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0))); |
438 | |
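    // extended-precision reduction: x -= j * (pi/4), with pi/4 split into
    // three constants so the result stays accurate for larger |x|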
439 | _vx = v_fma(_vy, v_minus_DP1, _vx); |
440 | _vx = v_fma(_vy, v_minus_DP2, _vx); |
441 | _vx = v_fma(_vy, v_minus_DP3, _vx); |
442 | |
443 | sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)))); |
444 | sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))); |
445 | |
446 | _TpVec16F _vxx = v_mul(_vx, _vx); |
447 | _TpVec16F y1, y2; |
448 | |
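    // y1: cosine series 1 - x^2/2 + x^4*C(x^2); y2: sine series x + x^3*S(x^2)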
449 | y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1); |
450 | y1 = v_fma(y1, _vxx, v_coscof_p2); |
451 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f))); |
452 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f))); |
453 | |
454 | y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1); |
455 | y2 = v_fma(y2, _vxx, v_sincof_p2); |
456 | y2 = v_mul(y2, _vxx); |
457 | y2 = v_fma(y2, _vx, _vx); |
458 | |
459 | ysin = v_select(poly_mask, y2, y1); |
460 | ycos = v_select(poly_mask, y1, y2); |
461 | ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); |
462 | ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); |
463 | |
464 | // sincos(NAN) -> NAN, sincos(±INF) -> NAN |
465 | _TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); |
466 | _TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x)); |
467 | ysin = v_select(mask_nan, v_nan, ysin); |
468 | ycos = v_select(mask_nan, v_nan, ycos); |
469 | } |
470 | |
471 | template<typename _TpVec16F, typename _TpVec16S> |
472 | inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) { |
473 | _TpVec16F ysin, ycos; |
474 | v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos); |
475 | return ysin; |
476 | } |
477 | |
478 | template<typename _TpVec16F, typename _TpVec16S> |
479 | inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) { |
480 | _TpVec16F ysin, ycos; |
481 | v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos); |
482 | return ycos; |
483 | } |
484 | |
485 | |
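// Hypothetical usage sketch, assuming the v_float32/v_int32 typedefs and the
// vx_load/v_store helpers from the wider universal intrinsics API (intrin.hpp):
//   v_float32 vs, vc;
//   v_sincos_default_32f<v_float32, v_int32>(vx_load(angles), vs, vc);
//   v_store(sines, vs);
//   v_store(cosines, vc);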
486 | template<typename _TpVec32F, typename _TpVec32S> |
487 | inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) { |
488 | const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI |
489 | const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f); |
490 | const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f); |
491 | const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f); |
492 | const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f); |
493 | const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f); |
494 | const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f); |
495 | const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f); |
496 | const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f); |
497 | const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f); |
498 | const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)); |
499 | const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f); |
500 | |
501 | _TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos; |
502 | _TpVec32S emm2; |
503 | |
504 | sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>()); |
505 | _vx = v_abs(x); |
506 | _vy = v_mul(_vx, v_cephes_FOPI); |
507 | |
508 | emm2 = v_trunc(_vy); |
509 | emm2 = v_add(emm2, v_setall_<_TpVec32S>(1)); |
510 | emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1)); |
511 | _vy = v_cvt_f32(emm2); |
512 | |
513 | _TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0))); |
514 | |
515 | _vx = v_fma(_vy, v_minus_DP1, _vx); |
516 | _vx = v_fma(_vy, v_minus_DP2, _vx); |
517 | _vx = v_fma(_vy, v_minus_DP3, _vx); |
518 | |
519 | sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)))); |
520 | sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))); |
521 | |
522 | _TpVec32F _vxx = v_mul(_vx, _vx); |
523 | _TpVec32F y1, y2; |
524 | |
525 | y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1); |
526 | y1 = v_fma(y1, _vxx, v_coscof_p2); |
527 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f)); |
528 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f)); |
529 | |
530 | y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1); |
531 | y2 = v_fma(y2, _vxx, v_sincof_p2); |
532 | y2 = v_mul(y2, _vxx); |
533 | y2 = v_fma(y2, _vx, _vx); |
534 | |
535 | ysin = v_select(poly_mask, y2, y1); |
536 | ycos = v_select(poly_mask, y1, y2); |
537 | ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); |
538 | ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); |
539 | |
540 | // sincos(NAN) -> NAN, sincos(±INF) -> NAN |
541 | _TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); |
542 | _TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x)); |
543 | ysin = v_select(mask_nan, v_nan, ysin); |
544 | ycos = v_select(mask_nan, v_nan, ycos); |
545 | } |
546 | |
547 | template<typename _TpVec32F, typename _TpVec32S> |
548 | inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) { |
549 | _TpVec32F ysin, ycos; |
550 | v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos); |
551 | return ysin; |
552 | } |
553 | |
554 | template<typename _TpVec32F, typename _TpVec32S> |
555 | inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) { |
556 | _TpVec32F ysin, ycos; |
557 | v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos); |
558 | return ycos; |
559 | } |
560 | |
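// Double-precision variant: same reduction scheme with higher-degree sine and
// cosine polynomials.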
561 | template<typename _TpVec64F, typename _TpVec64S> |
562 | inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) { |
563 | const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI |
564 | const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1); |
565 | const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9); |
566 | const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17); |
567 | const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10); |
568 | const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8); |
569 | const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6); |
570 | const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4); |
571 | const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3); |
572 | const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1); |
573 | const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11); |
574 | const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9); |
575 | const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7); |
576 | const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5); |
577 | const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3); |
578 | const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2); |
579 | const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)); |
580 | const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0); |
581 | |
582 | _TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos; |
583 | _TpVec64S emm2; |
584 | |
585 | sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>()); |
586 | _vx = v_abs(x); |
587 | _vy = v_mul(_vx, v_cephes_FOPI); |
588 | |
589 | emm2 = v_expand_low(v_trunc(_vy)); |
590 | emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1)); |
591 | emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1)); |
592 | _vy = v_cvt_f64(emm2); |
593 | |
594 | _TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0))); |
595 | |
596 | _vx = v_fma(_vy, v_minus_DP1, _vx); |
597 | _vx = v_fma(_vy, v_minus_DP2, _vx); |
598 | _vx = v_fma(_vy, v_minus_DP3, _vx); |
599 | |
600 | sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)))); |
601 | sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))); |
602 | |
603 | _TpVec64F _vxx = v_mul(_vx, _vx); |
604 | _TpVec64F y1, y2; |
605 | |
606 | y1 = v_fma(v_cos_C1, _vxx, v_cos_C2); |
607 | y1 = v_fma(y1, _vxx, v_cos_C3); |
608 | y1 = v_fma(y1, _vxx, v_cos_C4); |
609 | y1 = v_fma(y1, _vxx, v_cos_C5); |
610 | y1 = v_fma(y1, _vxx, v_cos_C6); |
611 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5)); |
612 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0)); |
613 | |
614 | y2 = v_fma(v_sin_C1, _vxx, v_sin_C2); |
615 | y2 = v_fma(y2, _vxx, v_sin_C3); |
616 | y2 = v_fma(y2, _vxx, v_sin_C4); |
617 | y2 = v_fma(y2, _vxx, v_sin_C5); |
618 | y2 = v_fma(y2, _vxx, v_sin_C6); |
619 | y2 = v_mul(y2, _vxx); |
620 | y2 = v_fma(y2, _vx, _vx); |
621 | |
622 | ysin = v_select(poly_mask, y2, y1); |
623 | ycos = v_select(poly_mask, y1, y2); |
624 | ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); |
625 | ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); |
626 | |
627 | // sincos(NAN) -> NAN, sincos(±INF) -> NAN |
628 | _TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); |
629 | _TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x)); |
630 | ysin = v_select(mask_nan, v_nan, ysin); |
631 | ycos = v_select(mask_nan, v_nan, ycos); |
632 | } |
633 | |
634 | template<typename _TpVec64F, typename _TpVec64S> |
635 | inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) { |
636 | _TpVec64F ysin, ycos; |
637 | v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos); |
638 | return ysin; |
639 | } |
640 | |
641 | template<typename _TpVec64F, typename _TpVec64S> |
642 | inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) { |
643 | _TpVec64F ysin, ycos; |
644 | v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos); |
645 | return ycos; |
646 | } |
647 | //! @} |
648 | |
649 | |
/* This implementation is derived from the erf approximation used by PyTorch:
   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/
653 | |
654 | //! @name Error Function |
655 | //! @{ |
656 | template<typename _TpVec32F, typename _TpVec32S> |
657 | inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) { |
658 | const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f), |
659 | coef1 = v_setall_<_TpVec32F>(1.061405429f), |
660 | coef2 = v_setall_<_TpVec32F>(-1.453152027f), |
661 | coef3 = v_setall_<_TpVec32F>(1.421413741f), |
662 | coef4 = v_setall_<_TpVec32F>(-0.284496736f), |
663 | coef5 = v_setall_<_TpVec32F>(0.254829592f), |
664 | ones = v_setall_<_TpVec32F>(1.0f), |
665 | neg_zeros = v_setall_<_TpVec32F>(-0.f); |
666 | _TpVec32F t = v_abs(v); |
    // sign bit of v: erf is odd, so the sign is reapplied to the result at the end
    _TpVec32F sign_mask = v_and(neg_zeros, v);
669 | |
670 | t = v_div(ones, v_fma(coef0, t, ones)); |
671 | _TpVec32F r = v_fma(coef1, t, coef2); |
672 | r = v_fma(r, t, coef3); |
673 | r = v_fma(r, t, coef4); |
674 | r = v_fma(r, t, coef5); |
675 | // - v * v |
676 | _TpVec32F v2 = v_mul(v, v); |
677 | _TpVec32F mv2 = v_xor(neg_zeros, v2); |
678 | // - exp(- v * v) |
679 | _TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2); |
680 | _TpVec32F neg_exp = v_xor(neg_zeros, exp); |
681 | _TpVec32F res = v_mul(t, neg_exp); |
682 | res = v_fma(r, res, ones); |
683 | return v_xor(sign_mask, res); |
684 | } |
685 | //! @} |
686 | |
687 | #endif // OPENCV_HAL_INTRIN_MATH_HPP |
688 | |