1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5
6/* Universal Intrinsics implementation of sin, cos, exp and log
7
8 Inspired by Intel Approximate Math library, and based on the
9 corresponding algorithms of the cephes math library
10*/
11
12/* Copyright (C) 2010,2011 RJVB - extensions */
13/* Copyright (C) 2011 Julien Pommier
14
15 This software is provided 'as-is', without any express or implied
16 warranty. In no event will the authors be held liable for any damages
17 arising from the use of this software.
18
19 Permission is granted to anyone to use this software for any purpose,
20 including commercial applications, and to alter it and redistribute it
21 freely, subject to the following restrictions:
22
23 1. The origin of this software must not be misrepresented; you must not
24 claim that you wrote the original software. If you use this software
25 in a product, an acknowledgment in the product documentation would be
26 appreciated but is not required.
27 2. Altered source versions must be plainly marked as such, and must not be
28 misrepresented as being the original software.
29 3. This notice may not be removed or altered from any source distribution.
30
31 (this is the zlib license)
32*/
33#ifndef OPENCV_HAL_INTRIN_MATH_HPP
34#define OPENCV_HAL_INTRIN_MATH_HPP
35
36//! @name Exponential
37//! @{
38// Implementation is the same as float32 vector.
39template<typename _TpVec16F, typename _TpVec16S>
40inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
41 const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
42 const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
43 const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
44 const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
45 const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
46 const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
47 const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
48 const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
49 const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
50 const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
51 const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
52 const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
53 const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
54
55 _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
56 _TpVec16S _vexp_mm;
57 const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf);
58
59 // compute exponential of x
60 _vexp_x = v_max(x, _vexp_lo_f16);
61 _vexp_x = v_min(_vexp_x, _vexp_hi_f16);
62
63 _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
64 _vexp_mm = v_floor(_vexp_);
65 _vexp_ = v_cvt_f16(_vexp_mm);
66 _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
67 _vexp_mm = v_shl(_vexp_mm, 10);
68
69 _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
70 _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
71 _vexp_xx = v_mul(_vexp_x, _vexp_x);
72
73 _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
74 _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
75 _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
76 _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
77 _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
78
79 _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
80 _vexp_y = v_add(_vexp_y, _vexp_one_fp16);
81 _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
82
83 // exp(NAN) -> NAN
84 _TpVec16F mask_not_nan = v_not_nan(x);
85 return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
86}
87
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) {
    // Vectorized exp() for float32 lanes, following the cephes expf algorithm
    // (see the file header): exp(x) = 2^n * exp(r) with n = round(x/ln(2)).
    const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f);
    const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f);
    const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
    const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
    const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f); // log2(e)
    // -ln(2) split into a high part (C1) and a low correction (C2) so that
    // r = x - n*ln(2) keeps extra precision.
    const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f);
    const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
    // Polynomial coefficients for exp(r) on the reduced interval.
    const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f);
    const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
    const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
    const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
    const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
    const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);

    _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
    _TpVec32S _vexp_mm;
    const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f); // float32 exponent bias (127)

    // compute exponential of x
    // Clamp the argument to the range where float32 exp() neither underflows
    // nor overflows.
    _vexp_x = v_max(x, _vexp_lo_f32);
    _vexp_x = v_min(_vexp_x, _vexp_hi_f32);

    // n = floor(x * log2(e) + 0.5)  (round-to-nearest)
    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
    _vexp_mm = v_floor(_vexp_);
    _vexp_ = v_cvt_f32(_vexp_mm);
    // Build 2^n by placing (n + bias) into the float32 exponent field
    // (23 mantissa bits).
    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
    _vexp_mm = v_shl(_vexp_mm, 23);

    // r = x - n*ln(2), using the split constant for accuracy.
    _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
    _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
    _vexp_xx = v_mul(_vexp_x, _vexp_x);

    // Horner evaluation of the polynomial part of exp(r).
    _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);

    // exp(r) ~= 1 + r + r^2 * poly(r), then scale by 2^n.
    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
    _vexp_y = v_add(_vexp_y, _vexp_one_fp32);
    _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));

    // exp(NAN) -> NAN (0x7fc00000 is the float32 quiet-NaN bit pattern)
    _TpVec32F mask_not_nan = v_not_nan(x);
    return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
}
136
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
    // Vectorized exp() for float64 lanes, following the cephes exp algorithm:
    // exp(x) = 2^n * exp(r) with n = round(x/ln(2)), where exp(r) is evaluated
    // through the rational form 1 + 2*P(r)/(Q(r) - P(r)).
    const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428);
    const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.);
    const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
    const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
    const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
    const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736); // log2(e)
    // -ln(2) split into a high part (C1) and a low correction (C2).
    const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1);
    const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
    // Numerator P (odd part, applied as x*P(x^2)) and denominator Q coefficients.
    const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4);
    const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
    const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
    const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6);
    const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
    const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
    const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);

    _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
    _TpVec64S _vexp_mm;
    const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff); // float64 exponent bias (1023)

    // compute exponential of x
    // Clamp the argument to the range where float64 exp() neither underflows
    // nor overflows.
    _vexp_x = v_max(x, _vexp_lo_f64);
    _vexp_x = v_min(_vexp_x, _vexp_hi_f64);

    // n = floor(x * log2(e) + 0.5)  (round-to-nearest).
    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
    // v_floor on a float64 vector yields 32-bit integers; v_expand_low widens
    // them to 64-bit lanes so the exponent can be assembled below.
    _vexp_mm = v_expand_low(v_floor(_vexp_));
    _vexp_ = v_cvt_f64(_vexp_mm);
    // Build 2^n by placing (n + bias) into the float64 exponent field
    // (52 mantissa bits).
    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
    _vexp_mm = v_shl(_vexp_mm, 52);

    // r = x - n*ln(2), using the split constant for accuracy.
    _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
    _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
    _vexp_xx = v_mul(_vexp_x, _vexp_x);

    // y = r * P(r^2)  (Horner on r^2, then one multiply by r).
    _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
    _vexp_y = v_mul(_vexp_y, _vexp_x);

    // z = Q(r^2)  (Horner on r^2).
    _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
    _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
    _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);

    // exp(r) = 1 + 2*y/(z - y), then scale by 2^n.
    _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
    _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
    _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));

    // exp(NAN) -> NAN (0x7FF8... is the float64 quiet-NaN bit pattern)
    _TpVec64F mask_not_nan = v_not_nan(x);
    return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
}
189//! @}
190
191//! @name Natural Logarithm
192//! @{
193template<typename _TpVec16F, typename _TpVec16S>
194inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
195 const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
196 const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
197 const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
198 const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
199 const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
200 const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
201 const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
202 const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
203 const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
204 const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
205 const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
206 const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
207 const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
208
209 _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
210 _TpVec16S _vlog_ux, _vlog_emm0;
211 const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);
212
213 _vlog_ux = v_reinterpret_as_s16(x);
214 _vlog_emm0 = v_shr(_vlog_ux, 10);
215
216 _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
217 _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
218 _vlog_x = v_reinterpret_as_f16(_vlog_ux);
219
220 _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
221 _vlog_e = v_cvt_f16(_vlog_emm0);
222
223 _vlog_e = v_add(_vlog_e, _vlog_one_fp16);
224
225 _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
226 _vlog_tmp = v_and(_vlog_x, _vlog_mask);
227 _vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
228 _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
229 _vlog_x = v_add(_vlog_x, _vlog_tmp);
230
231 _vlog_z = v_mul(_vlog_x, _vlog_x);
232
233 _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
234 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
235 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
236 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
237 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
238 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
239 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
240 _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
241 _vlog_y = v_mul(_vlog_y, _vlog_x);
242 _vlog_y = v_mul(_vlog_y, _vlog_z);
243
244 _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
245
246 _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));
247
248 _vlog_x = v_add(_vlog_x, _vlog_y);
249 _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
250 // log(0) -> -INF
251 _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
252 _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);
253 // log(NEG), log(NAN) -> NAN
254 _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());
255 _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
256 // log(INF) -> INF
257 _TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
258 _vlog_x = v_select(mask_inf, x, _vlog_x);
259 return _vlog_x;
260}
261
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {
    // Vectorized natural log for float32 lanes, following the cephes logf
    // algorithm: decompose x = m * 2^e, evaluate a polynomial in (m - 1),
    // then add back e*ln(2) (split into q2 + q1 for precision).
    const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
    const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f); // sqrt(1/2)
    // ln(2) split: q2 is the high part, q1 the low correction.
    const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);
    const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
    // Polynomial coefficients for log(1 + m) on the reduced interval.
    const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);
    const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
    const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
    const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
    const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
    const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
    const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
    const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
    const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);

    _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
    _TpVec32S _vlog_ux, _vlog_emm0;
    // Mask that clears the float32 exponent field (bits 30..23).
    const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);

    // Extract the biased exponent e from the float32 bit pattern.
    _vlog_ux = v_reinterpret_as_s32(x);
    _vlog_emm0 = v_shr(_vlog_ux, 23);

    // Replace the exponent so the mantissa m lands in [0.5, 1).
    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));
    _vlog_x = v_reinterpret_as_f32(_vlog_ux);

    // Remove the float32 exponent bias (127).
    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));
    _vlog_e = v_cvt_f32(_vlog_emm0);

    // Pre-increment e; it is conditionally decremented again below.
    _vlog_e = v_add(_vlog_e, _vlog_one_fp32);

    // Branch-free renormalization: if m < sqrt(1/2) then e -= 1 and
    // m = 2m - 1, otherwise m = m - 1.
    _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
    _vlog_tmp = v_and(_vlog_x, _vlog_mask);
    _vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
    _vlog_x = v_add(_vlog_x, _vlog_tmp);

    _vlog_z = v_mul(_vlog_x, _vlog_x);

    // Horner evaluation of the degree-8 polynomial, times x^3.
    _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
    _vlog_y = v_mul(_vlog_y, _vlog_x);
    _vlog_y = v_mul(_vlog_y, _vlog_z);

    // Add the low part of e*ln(2).
    _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);

    // Subtract x^2/2 (second-order term of log(1 + x)).
    _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));

    // result = x + y + e*ln(2)_high.
    _vlog_x = v_add(_vlog_x, _vlog_y);
    _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
    // log(0) -> -INF (0xff800000 is the float32 -infinity bit pattern)
    _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
    _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);
    // log(NEG), log(NAN) -> NAN (0x7fc00000 is the float32 quiet-NaN bit pattern)
    _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());
    _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
    // log(INF) -> INF
    _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
    _vlog_x = v_select(mask_inf, x, _vlog_x);
    return _vlog_x;
}
330
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
    // Vectorized natural log for float64 lanes, following the cephes log
    // algorithm: decompose x = m * 2^e, evaluate a rational approximation
    // x^3 * P(x)/Q(x) in (m - 1), then add back e*ln(2) (C1 high part,
    // C0 low correction).
    const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
    const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244); // sqrt(1/2)
    // Numerator P coefficients.
    const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);
    const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
    const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
    const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
    const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
    const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
    // Denominator Q coefficients (leading coefficient is an implicit 1).
    const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);
    const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
    const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
    const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
    const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);

    // ln(2) split: C1 is the high part, C0 the low correction.
    const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);
    const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);

    _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
    _TpVec64S _vlog_ux, _vlog_emm0;
    // Mask that clears the float64 exponent field (bits 62..52).
    const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);

    // Extract the biased exponent e from the float64 bit pattern.
    _vlog_ux = v_reinterpret_as_s64(x);
    _vlog_emm0 = v_shr(_vlog_ux, 52);

    // Replace the exponent so the mantissa m lands in [0.5, 1).
    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));
    _vlog_x = v_reinterpret_as_f64(_vlog_ux);

    // Remove the float64 exponent bias (1023).
    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));
    _vlog_e = v_cvt_f64(_vlog_emm0);

    // Pre-increment e; it is conditionally decremented again below.
    _vlog_e = v_add(_vlog_e, _vlog_one_fp64);

    // Branch-free renormalization: if m < sqrt(1/2) then e -= 1 and
    // m = 2m - 1, otherwise m = m - 1.
    _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
    _vlog_tmp = v_and(_vlog_x, _vlog_mask);
    _vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
    _vlog_x = v_add(_vlog_x, _vlog_tmp);

    _vlog_xx = v_mul(_vlog_x, _vlog_x);

    // y = x^3 * P(x)  (Horner scheme).
    _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
    _vlog_y = v_mul(_vlog_y, _vlog_x);
    _vlog_y = v_mul(_vlog_y, _vlog_xx);

    // z = Q(x)  (monic, Horner scheme).
    _vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);

    // z = y/z - e*C0 - x^2/2  (rational part, low ln(2) part, 2nd-order term).
    _vlog_z = v_div(_vlog_y, _vlog_z);
    _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
    _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));

    // result = z + x + e*ln(2)_high.
    _vlog_z = v_add(_vlog_z, _vlog_x);
    _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);

    // log(0) -> -INF (0xfff0... is the float64 -infinity bit pattern)
    _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
    _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);
    // log(NEG), log(NAN) -> NAN (0x7ff8... is the float64 quiet-NaN bit pattern)
    _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());
    _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));
    // log(INF) -> INF
    _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
    _vlog_z = v_select(mask_inf, x, _vlog_z);
    return _vlog_z;
}
406//! @}
407
408//! @name Sine and Cosine
409//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
    // Simultaneous sine and cosine for float16 lanes, following the cephes
    // sinf/cosf approach: reduce |x| by multiples of pi/4, pick the sin or cos
    // polynomial per octant, and fix the signs afterwards.
    const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
    // pi/4 split into three parts (DP1 + DP2 + DP3 ~= pi/4) for
    // extended-precision argument reduction.
    const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
    const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
    const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
    // Polynomial coefficients for sin(r) and cos(r) on the reduced interval.
    const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
    const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
    const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
    const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
    const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
    const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
    const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)); // fp16 quiet NaN
    const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f)); // sign-bit-only pattern

    _TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
    _TpVec16S emm2;

    // Remember the sign of x for sin (sin is odd), then work with |x|.
    sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
    _vx = v_abs(x);
    _vy = v_mul(_vx, v_cephes_FOPI);

    // Octant index j = trunc(|x| * 4/pi), rounded up to the next even value.
    emm2 = v_trunc(_vy);
    emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
    emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
    _vy = v_cvt_f16(emm2);

    // poly_mask selects the sin polynomial (true) or cos polynomial (false).
    _TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));

    // r = |x| - j*pi/4, subtracted in three parts for precision.
    _vx = v_fma(_vy, v_minus_DP1, _vx);
    _vx = v_fma(_vy, v_minus_DP2, _vx);
    _vx = v_fma(_vy, v_minus_DP3, _vx);

    // Quadrant-dependent sign flips for sin and cos.
    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
    sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));

    _TpVec16F _vxx = v_mul(_vx, _vx);
    _TpVec16F y1, y2;

    // y1: cos(r) polynomial (Horner scheme).
    y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
    y1 = v_fma(y1, _vxx, v_coscof_p2);
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));

    // y2: sin(r) polynomial (Horner scheme).
    y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
    y2 = v_fma(y2, _vxx, v_sincof_p2);
    y2 = v_mul(y2, _vxx);
    y2 = v_fma(y2, _vx, _vx);

    // Select the right polynomial per lane and apply the computed signs.
    ysin = v_select(poly_mask, y2, y1);
    ycos = v_select(poly_mask, y1, y2);
    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);

    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
    _TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
    _TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
    ysin = v_select(mask_nan, v_nan, ysin);
    ycos = v_select(mask_nan, v_nan, ycos);
}
470
471template<typename _TpVec16F, typename _TpVec16S>
472inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
473 _TpVec16F ysin, ycos;
474 v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
475 return ysin;
476}
477
478template<typename _TpVec16F, typename _TpVec16S>
479inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
480 _TpVec16F ysin, ycos;
481 v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
482 return ycos;
483}
484
485
template<typename _TpVec32F, typename _TpVec32S>
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
    // Simultaneous sine and cosine for float32 lanes, following the cephes
    // sinf/cosf approach: reduce |x| by multiples of pi/4, pick the sin or cos
    // polynomial per octant, and fix the signs afterwards.
    const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
    // pi/4 split into three parts (DP1 + DP2 + DP3 ~= pi/4) for
    // extended-precision argument reduction.
    const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
    const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
    const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
    // Polynomial coefficients for sin(r) and cos(r) on the reduced interval.
    const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
    const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
    const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
    const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
    const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
    const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
    const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)); // float32 quiet NaN
    const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f); // sign-bit-only pattern

    _TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
    _TpVec32S emm2;

    // Remember the sign of x for sin (sin is odd), then work with |x|.
    sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
    _vx = v_abs(x);
    _vy = v_mul(_vx, v_cephes_FOPI);

    // Octant index j = trunc(|x| * 4/pi), rounded up to the next even value.
    emm2 = v_trunc(_vy);
    emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
    emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
    _vy = v_cvt_f32(emm2);

    // poly_mask selects the sin polynomial (true) or cos polynomial (false).
    _TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));

    // r = |x| - j*pi/4, subtracted in three parts for precision.
    _vx = v_fma(_vy, v_minus_DP1, _vx);
    _vx = v_fma(_vy, v_minus_DP2, _vx);
    _vx = v_fma(_vy, v_minus_DP3, _vx);

    // Quadrant-dependent sign flips for sin and cos.
    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
    sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));

    _TpVec32F _vxx = v_mul(_vx, _vx);
    _TpVec32F y1, y2;

    // y1: cos(r) polynomial (Horner scheme).
    y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
    y1 = v_fma(y1, _vxx, v_coscof_p2);
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));

    // y2: sin(r) polynomial (Horner scheme).
    y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
    y2 = v_fma(y2, _vxx, v_sincof_p2);
    y2 = v_mul(y2, _vxx);
    y2 = v_fma(y2, _vx, _vx);

    // Select the right polynomial per lane and apply the computed signs.
    ysin = v_select(poly_mask, y2, y1);
    ycos = v_select(poly_mask, y1, y2);
    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);

    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
    _TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
    _TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
    ysin = v_select(mask_nan, v_nan, ysin);
    ycos = v_select(mask_nan, v_nan, ycos);
}
546
547template<typename _TpVec32F, typename _TpVec32S>
548inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
549 _TpVec32F ysin, ycos;
550 v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
551 return ysin;
552}
553
554template<typename _TpVec32F, typename _TpVec32S>
555inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
556 _TpVec32F ysin, ycos;
557 v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
558 return ycos;
559}
560
template<typename _TpVec64F, typename _TpVec64S>
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
    // Simultaneous sine and cosine for float64 lanes, following the cephes
    // sin/cos approach: reduce |x| by multiples of pi/4, pick the sin or cos
    // polynomial per octant, and fix the signs afterwards.
    const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
    // pi/4 split into three parts (DP1 + DP2 + DP3 ~= pi/4) for
    // extended-precision argument reduction.
    const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
    const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
    const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
    // Polynomial coefficients for sin(r) on the reduced interval.
    const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
    const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
    const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
    const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
    const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
    const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
    // Polynomial coefficients for cos(r) on the reduced interval.
    const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
    const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
    const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
    const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
    const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
    const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
    const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)); // float64 quiet NaN
    const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0); // sign-bit-only pattern

    _TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
    _TpVec64S emm2;

    // Remember the sign of x for sin (sin is odd), then work with |x|.
    sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
    _vx = v_abs(x);
    _vy = v_mul(_vx, v_cephes_FOPI);

    // Octant index j = trunc(|x| * 4/pi), rounded up to the next even value.
    // v_trunc on a float64 vector yields 32-bit integers; v_expand_low widens
    // them to 64-bit lanes.
    emm2 = v_expand_low(v_trunc(_vy));
    emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
    emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
    _vy = v_cvt_f64(emm2);

    // poly_mask selects the sin polynomial (true) or cos polynomial (false).
    _TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));

    // r = |x| - j*pi/4, subtracted in three parts for precision.
    _vx = v_fma(_vy, v_minus_DP1, _vx);
    _vx = v_fma(_vy, v_minus_DP2, _vx);
    _vx = v_fma(_vy, v_minus_DP3, _vx);

    // Quadrant-dependent sign flips for sin and cos.
    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
    sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));

    _TpVec64F _vxx = v_mul(_vx, _vx);
    _TpVec64F y1, y2;

    // y1: cos(r) polynomial (Horner scheme).
    y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
    y1 = v_fma(y1, _vxx, v_cos_C3);
    y1 = v_fma(y1, _vxx, v_cos_C4);
    y1 = v_fma(y1, _vxx, v_cos_C5);
    y1 = v_fma(y1, _vxx, v_cos_C6);
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));

    // y2: sin(r) polynomial (Horner scheme).
    y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
    y2 = v_fma(y2, _vxx, v_sin_C3);
    y2 = v_fma(y2, _vxx, v_sin_C4);
    y2 = v_fma(y2, _vxx, v_sin_C5);
    y2 = v_fma(y2, _vxx, v_sin_C6);
    y2 = v_mul(y2, _vxx);
    y2 = v_fma(y2, _vx, _vx);

    // Select the right polynomial per lane and apply the computed signs.
    ysin = v_select(poly_mask, y2, y1);
    ycos = v_select(poly_mask, y1, y2);
    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);

    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
    _TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
    _TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
    ysin = v_select(mask_nan, v_nan, ysin);
    ycos = v_select(mask_nan, v_nan, ycos);
}
633
634template<typename _TpVec64F, typename _TpVec64S>
635inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
636 _TpVec64F ysin, ycos;
637 v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
638 return ysin;
639}
640
641template<typename _TpVec64F, typename _TpVec64S>
642inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
643 _TpVec64F ysin, ycos;
644 v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
645 return ycos;
646}
647//! @}
648
649
650/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch
651 https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
652*/
653
654//! @name Error Function
655//! @{
656template<typename _TpVec32F, typename _TpVec32S>
657inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
658 const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f),
659 coef1 = v_setall_<_TpVec32F>(1.061405429f),
660 coef2 = v_setall_<_TpVec32F>(-1.453152027f),
661 coef3 = v_setall_<_TpVec32F>(1.421413741f),
662 coef4 = v_setall_<_TpVec32F>(-0.284496736f),
663 coef5 = v_setall_<_TpVec32F>(0.254829592f),
664 ones = v_setall_<_TpVec32F>(1.0f),
665 neg_zeros = v_setall_<_TpVec32F>(-0.f);
666 _TpVec32F t = v_abs(v);
667 // sign(v)
668 _TpVec32F sign_mask = v_and(neg_zeros, v);
669
670 t = v_div(ones, v_fma(coef0, t, ones));
671 _TpVec32F r = v_fma(coef1, t, coef2);
672 r = v_fma(r, t, coef3);
673 r = v_fma(r, t, coef4);
674 r = v_fma(r, t, coef5);
675 // - v * v
676 _TpVec32F v2 = v_mul(v, v);
677 _TpVec32F mv2 = v_xor(neg_zeros, v2);
678 // - exp(- v * v)
679 _TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2);
680 _TpVec32F neg_exp = v_xor(neg_zeros, exp);
681 _TpVec32F res = v_mul(t, neg_exp);
682 res = v_fma(r, res, ones);
683 return v_xor(sign_mask, res);
684}
685//! @}
686
687#endif // OPENCV_HAL_INTRIN_MATH_HPP
688

// source code of opencv/modules/core/include/opencv2/core/hal/intrin_math.hpp