1 | // This file is part of OpenCV project. |
2 | // It is subject to the license terms in the LICENSE file found in the top-level directory |
3 | // of this distribution and at http://opencv.org/license.html |
4 | |
5 | |
/* Universal intrinsics implementations of sin, cos, exp, log and erf

   The sin, cos, exp and log kernels are inspired by the Intel Approximate
   Math library and based on the corresponding algorithms of the cephes math
   library; erf is adapted from PyTorch (see the note above v_erf_default_32f).
*/
11 | |
12 | /* Copyright (C) 2010,2011 RJVB - extensions */ |
13 | /* Copyright (C) 2011 Julien Pommier |
14 | |
15 | This software is provided 'as-is', without any express or implied |
16 | warranty. In no event will the authors be held liable for any damages |
17 | arising from the use of this software. |
18 | |
19 | Permission is granted to anyone to use this software for any purpose, |
20 | including commercial applications, and to alter it and redistribute it |
21 | freely, subject to the following restrictions: |
22 | |
23 | 1. The origin of this software must not be misrepresented; you must not |
24 | claim that you wrote the original software. If you use this software |
25 | in a product, an acknowledgment in the product documentation would be |
26 | appreciated but is not required. |
27 | 2. Altered source versions must be plainly marked as such, and must not be |
28 | misrepresented as being the original software. |
29 | 3. This notice may not be removed or altered from any source distribution. |
30 | |
31 | (this is the zlib license) |
32 | */ |
33 | #ifndef OPENCV_HAL_INTRIN_MATH_HPP |
34 | #define OPENCV_HAL_INTRIN_MATH_HPP |
35 | |
36 | //! @name Exponential |
37 | //! @{ |
// The half-precision implementation mirrors the float32 version below; only the
// clamp bounds and the fp16 layout (exponent bias 15, mantissa shift 10) differ.
39 | template<typename _TpVec16F, typename _TpVec16S> |
40 | inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) { |
41 | const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f); |
42 | const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f); |
    const _TpVec16F _vexp_half_f16 = v_setall_<_TpVec16F>(0.5f);
    const _TpVec16F _vexp_one_f16 = v_setall_<_TpVec16F>(1.f);
45 | const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f); |
46 | const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f); |
47 | const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f); |
48 | const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f); |
49 | const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f); |
50 | const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f); |
51 | const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f); |
52 | const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f); |
53 | const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f); |
54 | |
55 | _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx; |
56 | _TpVec16S _vexp_mm; |
    const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf); // fp16 exponent bias (15)
58 | |
    // clamp x to the range where fp16 exp() neither overflows nor underflows
60 | _vexp_x = v_max(x, _vexp_lo_f16); |
61 | _vexp_x = v_min(_vexp_x, _vexp_hi_f16); |
62 | |
    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_f16);
64 | _vexp_mm = v_floor(_vexp_); |
65 | _vexp_ = v_cvt_f16(_vexp_mm); |
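    // build 2^n: (n + 15) is shifted into the five fp16 exponent bits (10..14)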
66 | _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16); |
67 | _vexp_mm = v_shl(_vexp_mm, 10); |
68 | |
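    // r = x - n*ln(2): C1 and C2 sum to -ln(2) (a Cody-Waite split), so the
    // subtraction is performed in two steps to preserve precision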
69 | _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x); |
70 | _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x); |
71 | _vexp_xx = v_mul(_vexp_x, _vexp_x); |
72 | |
73 | _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16); |
74 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16); |
75 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16); |
76 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16); |
77 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16); |
78 | |
79 | _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); |
    _vexp_y = v_add(_vexp_y, _vexp_one_f16);
81 | _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm)); |
82 | |
    // exp(NaN) -> NaN: the clamp above may have replaced NaN lanes, restore them
84 | _TpVec16F mask_not_nan = v_not_nan(x); |
85 | return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); |
86 | } |
87 | |
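// Single-precision variant: exponent bias 127, mantissa shift 23.
//
// Hypothetical usage sketch, assuming the v_float32/v_int32 typedefs and the
// vx_load/v_store helpers from the wider universal intrinsics API (intrin.hpp)
// are in scope:
//   v_float32 vx = vx_load(src);                               // one register of floats
//   v_float32 vy = v_exp_default_32f<v_float32, v_int32>(vx);  // lane-wise exp
//   v_store(dst, vy);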
88 | template<typename _TpVec32F, typename _TpVec32S> |
89 | inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) { |
90 | const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f); |
91 | const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f); |
    const _TpVec32F _vexp_half_f32 = v_setall_<_TpVec32F>(0.5f);
    const _TpVec32F _vexp_one_f32 = v_setall_<_TpVec32F>(1.f);
94 | const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f); |
95 | const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f); |
96 | const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f); |
97 | const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f); |
98 | const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f); |
99 | const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f); |
100 | const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f); |
101 | const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f); |
102 | const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f); |
103 | |
104 | _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx; |
105 | _TpVec32S _vexp_mm; |
    const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f); // fp32 exponent bias (127)
107 | |
    // clamp x to the range where fp32 exp() neither overflows nor underflows
109 | _vexp_x = v_max(x, _vexp_lo_f32); |
110 | _vexp_x = v_min(_vexp_x, _vexp_hi_f32); |
111 | |
    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_f32);
113 | _vexp_mm = v_floor(_vexp_); |
114 | _vexp_ = v_cvt_f32(_vexp_mm); |
115 | _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32); |
116 | _vexp_mm = v_shl(_vexp_mm, 23); |
117 | |
118 | _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x); |
119 | _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x); |
120 | _vexp_xx = v_mul(_vexp_x, _vexp_x); |
121 | |
122 | _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32); |
123 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32); |
124 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32); |
125 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32); |
126 | _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32); |
127 | |
128 | _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); |
    _vexp_y = v_add(_vexp_y, _vexp_one_f32);
130 | _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); |
131 | |
    // exp(NaN) -> NaN: the clamp above may have replaced NaN lanes, restore them
133 | _TpVec32F mask_not_nan = v_not_nan(x); |
134 | return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); |
135 | } |
136 | |
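// Double-precision variant. Instead of a single polynomial it uses the cephes
// rational form e^r = 1 + 2*r*P(r^2) / (Q(r^2) - r*P(r^2)), which reaches
// double accuracy with fewer coefficients.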
137 | template<typename _TpVec64F, typename _TpVec64S> |
138 | inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) { |
139 | const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428); |
140 | const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.); |
141 | const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5); |
142 | const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0); |
143 | const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0); |
144 | const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736); |
145 | const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1); |
146 | const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6); |
147 | const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4); |
148 | const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2); |
149 | const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1); |
150 | const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6); |
151 | const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3); |
152 | const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1); |
153 | const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0); |
154 | |
155 | _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx; |
156 | _TpVec64S _vexp_mm; |
    const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff); // fp64 exponent bias (1023)
158 | |
    // clamp x to the range where fp64 exp() neither overflows nor underflows
160 | _vexp_x = v_max(x, _vexp_lo_f64); |
161 | _vexp_x = v_min(_vexp_x, _vexp_hi_f64); |
162 | |
163 | _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64); |
164 | _vexp_mm = v_expand_low(v_floor(_vexp_)); |
165 | _vexp_ = v_cvt_f64(_vexp_mm); |
166 | _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64); |
167 | _vexp_mm = v_shl(_vexp_mm, 52); |
168 | |
169 | _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x); |
170 | _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x); |
171 | _vexp_xx = v_mul(_vexp_x, _vexp_x); |
172 | |
173 | _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64); |
174 | _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64); |
175 | _vexp_y = v_mul(_vexp_y, _vexp_x); |
176 | |
177 | _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64); |
178 | _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64); |
179 | _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64); |
180 | |
181 | _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y)); |
182 | _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64); |
183 | _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm)); |
184 | |
    // exp(NaN) -> NaN: the clamp above may have replaced NaN lanes, restore them
186 | _TpVec64F mask_not_nan = v_not_nan(x); |
187 | return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000))); |
188 | } |
189 | //! @} |
190 | |
191 | //! @name Natural Logarithm |
192 | //! @{ |
193 | template<typename _TpVec16F, typename _TpVec16S> |
194 | inline _TpVec16F v_log_default_16f(const _TpVec16F &x) { |
195 | const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f); |
196 | const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f); |
197 | const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f); |
198 | const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f); |
199 | const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f); |
200 | const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f); |
201 | const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f); |
202 | const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f); |
203 | const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f); |
204 | const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f); |
205 | const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f); |
206 | const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f); |
207 | const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f); |
208 | |
209 | _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; |
210 | _TpVec16S _vlog_ux, _vlog_emm0; |
    const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00); // clears the fp16 exponent bits
212 | |
213 | _vlog_ux = v_reinterpret_as_s16(x); |
    _vlog_emm0 = v_shr(_vlog_ux, 10); // extract the biased exponent
215 | |
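    // clear the exponent bits and substitute those of 0.5: _vlog_x becomes
    // the mantissa scaled into [0.5, 1)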
216 | _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16); |
217 | _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f))); |
218 | _vlog_x = v_reinterpret_as_f16(_vlog_ux); |
219 | |
220 | _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf)); |
221 | _vlog_e = v_cvt_f16(_vlog_emm0); |
222 | |
223 | _vlog_e = v_add(_vlog_e, _vlog_one_fp16); |
224 | |
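    // if the mantissa is below sqrt(0.5), use 2*m - 1 instead of m - 1 and
    // decrement the exponent; this keeps the polynomial argument close to 0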
225 | _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16); |
226 | _vlog_tmp = v_and(_vlog_x, _vlog_mask); |
227 | _vlog_x = v_sub(_vlog_x, _vlog_one_fp16); |
228 | _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask)); |
229 | _vlog_x = v_add(_vlog_x, _vlog_tmp); |
230 | |
231 | _vlog_z = v_mul(_vlog_x, _vlog_x); |
232 | |
233 | _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16); |
234 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16); |
235 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16); |
236 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16); |
237 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16); |
238 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16); |
239 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16); |
240 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16); |
241 | _vlog_y = v_mul(_vlog_y, _vlog_x); |
242 | _vlog_y = v_mul(_vlog_y, _vlog_z); |
243 | |
244 | _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y); |
245 | |
246 | _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f))); |
247 | |
248 | _vlog_x = v_add(_vlog_x, _vlog_y); |
249 | _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x); |
250 | // log(0) -> -INF |
251 | _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>()); |
252 | _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x); |
253 | // log(NEG), log(NAN) -> NAN |
254 | _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>()); |
255 | _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); |
256 | // log(INF) -> INF |
257 | _TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); |
258 | _vlog_x = v_select(mask_inf, x, _vlog_x); |
259 | return _vlog_x; |
260 | } |
261 | |
262 | template<typename _TpVec32F, typename _TpVec32S> |
263 | inline _TpVec32F v_log_default_32f(const _TpVec32F &x) { |
264 | const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f); |
265 | const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f); |
266 | const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f); |
267 | const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f); |
268 | const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f); |
269 | const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f); |
270 | const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f); |
271 | const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f); |
272 | const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f); |
273 | const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f); |
274 | const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f); |
275 | const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f); |
276 | const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f); |
277 | |
278 | _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; |
279 | _TpVec32S _vlog_ux, _vlog_emm0; |
    const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000); // clears the fp32 exponent bits
281 | |
282 | _vlog_ux = v_reinterpret_as_s32(x); |
    _vlog_emm0 = v_shr(_vlog_ux, 23); // extract the biased exponent
284 | |
285 | _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32); |
286 | _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f))); |
287 | _vlog_x = v_reinterpret_as_f32(_vlog_ux); |
288 | |
289 | _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f)); |
290 | _vlog_e = v_cvt_f32(_vlog_emm0); |
291 | |
292 | _vlog_e = v_add(_vlog_e, _vlog_one_fp32); |
293 | |
294 | _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32); |
295 | _vlog_tmp = v_and(_vlog_x, _vlog_mask); |
296 | _vlog_x = v_sub(_vlog_x, _vlog_one_fp32); |
297 | _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask)); |
298 | _vlog_x = v_add(_vlog_x, _vlog_tmp); |
299 | |
300 | _vlog_z = v_mul(_vlog_x, _vlog_x); |
301 | |
302 | _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32); |
303 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32); |
304 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32); |
305 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32); |
306 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32); |
307 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32); |
308 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32); |
309 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32); |
310 | _vlog_y = v_mul(_vlog_y, _vlog_x); |
311 | _vlog_y = v_mul(_vlog_y, _vlog_z); |
312 | |
313 | _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y); |
314 | |
315 | _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f))); |
316 | |
317 | _vlog_x = v_add(_vlog_x, _vlog_y); |
318 | _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x); |
319 | // log(0) -> -INF |
320 | _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>()); |
321 | _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x); |
322 | // log(NEG), log(NAN) -> NAN |
323 | _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>()); |
324 | _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); |
325 | // log(INF) -> INF |
326 | _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); |
327 | _vlog_x = v_select(mask_inf, x, _vlog_x); |
328 | return _vlog_x; |
329 | } |
330 | |
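// Double-precision variant: the series in (m - 1) is replaced by the cephes
// rational form (m - 1)^3 * P(m - 1) / Q(m - 1), and e*ln(2) is added back
// via the split constants C1 and -C0.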
331 | template<typename _TpVec64F, typename _TpVec64S> |
332 | inline _TpVec64F v_log_default_64f(const _TpVec64F &x) { |
333 | const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0); |
334 | const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244); |
335 | const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4); |
336 | const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1); |
337 | const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854); |
338 | const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1); |
339 | const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1); |
340 | const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666); |
341 | const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1); |
342 | const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1); |
343 | const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1); |
344 | const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1); |
345 | const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1); |
346 | |
347 | const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4); |
348 | const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375); |
349 | |
350 | _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx; |
351 | _TpVec64S _vlog_ux, _vlog_emm0; |
    const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000); // clears the fp64 exponent bits
353 | |
354 | _vlog_ux = v_reinterpret_as_s64(x); |
    _vlog_emm0 = v_shr(_vlog_ux, 52); // extract the biased exponent
356 | |
357 | _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64); |
358 | _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5))); |
359 | _vlog_x = v_reinterpret_as_f64(_vlog_ux); |
360 | |
361 | _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff)); |
362 | _vlog_e = v_cvt_f64(_vlog_emm0); |
363 | |
364 | _vlog_e = v_add(_vlog_e, _vlog_one_fp64); |
365 | |
366 | _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64); |
367 | _vlog_tmp = v_and(_vlog_x, _vlog_mask); |
368 | _vlog_x = v_sub(_vlog_x, _vlog_one_fp64); |
369 | _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask)); |
370 | _vlog_x = v_add(_vlog_x, _vlog_tmp); |
371 | |
372 | _vlog_xx = v_mul(_vlog_x, _vlog_x); |
373 | |
374 | _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64); |
375 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64); |
376 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64); |
377 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64); |
378 | _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64); |
379 | _vlog_y = v_mul(_vlog_y, _vlog_x); |
380 | _vlog_y = v_mul(_vlog_y, _vlog_xx); |
381 | |
382 | _vlog_z = v_add(_vlog_x, _vlog_q0_fp64); |
383 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64); |
384 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64); |
385 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64); |
386 | _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64); |
387 | |
388 | _vlog_z = v_div(_vlog_y, _vlog_z); |
389 | _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64)); |
390 | _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5))); |
391 | |
392 | _vlog_z = v_add(_vlog_z, _vlog_x); |
393 | _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z); |
394 | |
395 | // log(0) -> -INF |
396 | _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>()); |
397 | _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z); |
398 | // log(NEG), log(NAN) -> NAN |
399 | _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>()); |
400 | _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000))); |
401 | // log(INF) -> INF |
402 | _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); |
403 | _vlog_z = v_select(mask_inf, x, _vlog_z); |
404 | return _vlog_z; |
405 | } |
406 | //! @} |
407 | |
408 | //! @name Sine and Cosine |
409 | //! @{ |
410 | template<typename _TpVec16F, typename _TpVec16S> |
411 | inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) { |
412 | const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI |
413 | const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f)); |
414 | const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f)); |
415 | const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f)); |
416 | const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f)); |
417 | const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f)); |
418 | const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f)); |
419 | const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f)); |
420 | const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f)); |
421 | const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f)); |
422 | const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)); |
423 | const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f)); |
424 | |
425 | _TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos; |
426 | _TpVec16S emm2; |
427 | |
428 | sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>()); |
429 | _vx = v_abs(x); |
430 | _vy = v_mul(_vx, v_cephes_FOPI); |
431 | |
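    // octant index, rounded up to an even value: j = (trunc(y) + 1) & ~1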
432 | emm2 = v_trunc(_vy); |
433 | emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1)); |
434 | emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1)); |
435 | _vy = v_cvt_f16(emm2); |
436 | |
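    // lanes whose octant is 0 (mod 4) take the sine polynomial for ysin and
    // the cosine polynomial for ycos; the other lanes swap the two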
437 | _TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0))); |
438 | |
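    // extended-precision reduction: x -= j * (pi/4), with pi/4 split into
    // three constants so the result stays accurate for larger |x|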
439 | _vx = v_fma(_vy, v_minus_DP1, _vx); |
440 | _vx = v_fma(_vy, v_minus_DP2, _vx); |
441 | _vx = v_fma(_vy, v_minus_DP3, _vx); |
442 | |
443 | sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)))); |
444 | sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))); |
445 | |
446 | _TpVec16F _vxx = v_mul(_vx, _vx); |
447 | _TpVec16F y1, y2; |
448 | |
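    // y1: cosine series 1 - x^2/2 + x^4*C(x^2); y2: sine series x + x^3*S(x^2)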
449 | y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1); |
450 | y1 = v_fma(y1, _vxx, v_coscof_p2); |
451 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f))); |
452 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f))); |
453 | |
454 | y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1); |
455 | y2 = v_fma(y2, _vxx, v_sincof_p2); |
456 | y2 = v_mul(y2, _vxx); |
457 | y2 = v_fma(y2, _vx, _vx); |
458 | |
459 | ysin = v_select(poly_mask, y2, y1); |
460 | ycos = v_select(poly_mask, y1, y2); |
461 | ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); |
462 | ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); |
463 | |
464 | // sincos(NAN) -> NAN, sincos(±INF) -> NAN |
465 | _TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); |
466 | _TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x)); |
467 | ysin = v_select(mask_nan, v_nan, ysin); |
468 | ycos = v_select(mask_nan, v_nan, ycos); |
469 | } |
470 | |
471 | template<typename _TpVec16F, typename _TpVec16S> |
472 | inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) { |
473 | _TpVec16F ysin, ycos; |
474 | v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos); |
475 | return ysin; |
476 | } |
477 | |
478 | template<typename _TpVec16F, typename _TpVec16S> |
479 | inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) { |
480 | _TpVec16F ysin, ycos; |
481 | v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos); |
482 | return ycos; |
483 | } |
484 | |
485 | |
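// Hypothetical usage sketch, assuming the v_float32/v_int32 typedefs and the
// vx_load/v_store helpers from the wider universal intrinsics API (intrin.hpp):
//   v_float32 vs, vc;
//   v_sincos_default_32f<v_float32, v_int32>(vx_load(angles), vs, vc);
//   v_store(sines, vs);
//   v_store(cosines, vc);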
486 | template<typename _TpVec32F, typename _TpVec32S> |
487 | inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) { |
488 | const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI |
489 | const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f); |
490 | const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f); |
491 | const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f); |
492 | const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f); |
493 | const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f); |
494 | const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f); |
495 | const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f); |
496 | const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f); |
497 | const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f); |
498 | const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)); |
499 | const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f); |
500 | |
501 | _TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos; |
502 | _TpVec32S emm2; |
503 | |
504 | sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>()); |
505 | _vx = v_abs(x); |
506 | _vy = v_mul(_vx, v_cephes_FOPI); |
507 | |
508 | emm2 = v_trunc(_vy); |
509 | emm2 = v_add(emm2, v_setall_<_TpVec32S>(1)); |
510 | emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1)); |
511 | _vy = v_cvt_f32(emm2); |
512 | |
513 | _TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0))); |
514 | |
515 | _vx = v_fma(_vy, v_minus_DP1, _vx); |
516 | _vx = v_fma(_vy, v_minus_DP2, _vx); |
517 | _vx = v_fma(_vy, v_minus_DP3, _vx); |
518 | |
519 | sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)))); |
520 | sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))); |
521 | |
522 | _TpVec32F _vxx = v_mul(_vx, _vx); |
523 | _TpVec32F y1, y2; |
524 | |
525 | y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1); |
526 | y1 = v_fma(y1, _vxx, v_coscof_p2); |
527 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f)); |
528 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f)); |
529 | |
530 | y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1); |
531 | y2 = v_fma(y2, _vxx, v_sincof_p2); |
532 | y2 = v_mul(y2, _vxx); |
533 | y2 = v_fma(y2, _vx, _vx); |
534 | |
535 | ysin = v_select(poly_mask, y2, y1); |
536 | ycos = v_select(poly_mask, y1, y2); |
537 | ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); |
538 | ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); |
539 | |
540 | // sincos(NAN) -> NAN, sincos(±INF) -> NAN |
541 | _TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); |
542 | _TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x)); |
543 | ysin = v_select(mask_nan, v_nan, ysin); |
544 | ycos = v_select(mask_nan, v_nan, ycos); |
545 | } |
546 | |
547 | template<typename _TpVec32F, typename _TpVec32S> |
548 | inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) { |
549 | _TpVec32F ysin, ycos; |
550 | v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos); |
551 | return ysin; |
552 | } |
553 | |
554 | template<typename _TpVec32F, typename _TpVec32S> |
555 | inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) { |
556 | _TpVec32F ysin, ycos; |
557 | v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos); |
558 | return ycos; |
559 | } |
560 | |
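// Double-precision variant: same reduction scheme with higher-degree sine and
// cosine polynomials.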
561 | template<typename _TpVec64F, typename _TpVec64S> |
562 | inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) { |
563 | const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI |
564 | const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1); |
565 | const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9); |
566 | const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17); |
567 | const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10); |
568 | const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8); |
569 | const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6); |
570 | const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4); |
571 | const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3); |
572 | const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1); |
573 | const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11); |
574 | const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9); |
575 | const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7); |
576 | const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5); |
577 | const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3); |
578 | const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2); |
579 | const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)); |
580 | const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0); |
581 | |
582 | _TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos; |
583 | _TpVec64S emm2; |
584 | |
585 | sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>()); |
586 | _vx = v_abs(x); |
587 | _vy = v_mul(_vx, v_cephes_FOPI); |
588 | |
589 | emm2 = v_expand_low(v_trunc(_vy)); |
590 | emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1)); |
591 | emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1)); |
592 | _vy = v_cvt_f64(emm2); |
593 | |
594 | _TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0))); |
595 | |
596 | _vx = v_fma(_vy, v_minus_DP1, _vx); |
597 | _vx = v_fma(_vy, v_minus_DP2, _vx); |
598 | _vx = v_fma(_vy, v_minus_DP3, _vx); |
599 | |
600 | sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)))); |
601 | sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))); |
602 | |
603 | _TpVec64F _vxx = v_mul(_vx, _vx); |
604 | _TpVec64F y1, y2; |
605 | |
606 | y1 = v_fma(v_cos_C1, _vxx, v_cos_C2); |
607 | y1 = v_fma(y1, _vxx, v_cos_C3); |
608 | y1 = v_fma(y1, _vxx, v_cos_C4); |
609 | y1 = v_fma(y1, _vxx, v_cos_C5); |
610 | y1 = v_fma(y1, _vxx, v_cos_C6); |
611 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5)); |
612 | y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0)); |
613 | |
614 | y2 = v_fma(v_sin_C1, _vxx, v_sin_C2); |
615 | y2 = v_fma(y2, _vxx, v_sin_C3); |
616 | y2 = v_fma(y2, _vxx, v_sin_C4); |
617 | y2 = v_fma(y2, _vxx, v_sin_C5); |
618 | y2 = v_fma(y2, _vxx, v_sin_C6); |
619 | y2 = v_mul(y2, _vxx); |
620 | y2 = v_fma(y2, _vx, _vx); |
621 | |
622 | ysin = v_select(poly_mask, y2, y1); |
623 | ycos = v_select(poly_mask, y1, y2); |
624 | ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); |
625 | ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); |
626 | |
627 | // sincos(NAN) -> NAN, sincos(±INF) -> NAN |
628 | _TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); |
629 | _TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x)); |
630 | ysin = v_select(mask_nan, v_nan, ysin); |
631 | ycos = v_select(mask_nan, v_nan, ycos); |
632 | } |
633 | |
634 | template<typename _TpVec64F, typename _TpVec64S> |
635 | inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) { |
636 | _TpVec64F ysin, ycos; |
637 | v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos); |
638 | return ysin; |
639 | } |
640 | |
641 | template<typename _TpVec64F, typename _TpVec64S> |
642 | inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) { |
643 | _TpVec64F ysin, ycos; |
644 | v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos); |
645 | return ycos; |
646 | } |
647 | //! @} |
648 | |
649 | |
/* This implementation is derived from the erf approximation used by PyTorch:
   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/
653 | |
654 | //! @name Error Function |
655 | //! @{ |
656 | template<typename _TpVec32F, typename _TpVec32S> |
657 | inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) { |
658 | const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f), |
659 | coef1 = v_setall_<_TpVec32F>(1.061405429f), |
660 | coef2 = v_setall_<_TpVec32F>(-1.453152027f), |
661 | coef3 = v_setall_<_TpVec32F>(1.421413741f), |
662 | coef4 = v_setall_<_TpVec32F>(-0.284496736f), |
663 | coef5 = v_setall_<_TpVec32F>(0.254829592f), |
664 | ones = v_setall_<_TpVec32F>(1.0f), |
665 | neg_zeros = v_setall_<_TpVec32F>(-0.f); |
666 | _TpVec32F t = v_abs(v); |
    // sign bit of v: erf is odd, so the sign is reapplied to the result at the end
    _TpVec32F sign_mask = v_and(neg_zeros, v);
669 | |
670 | t = v_div(ones, v_fma(coef0, t, ones)); |
671 | _TpVec32F r = v_fma(coef1, t, coef2); |
672 | r = v_fma(r, t, coef3); |
673 | r = v_fma(r, t, coef4); |
674 | r = v_fma(r, t, coef5); |
675 | // - v * v |
676 | _TpVec32F v2 = v_mul(v, v); |
677 | _TpVec32F mv2 = v_xor(neg_zeros, v2); |
678 | // - exp(- v * v) |
679 | _TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2); |
680 | _TpVec32F neg_exp = v_xor(neg_zeros, exp); |
681 | _TpVec32F res = v_mul(t, neg_exp); |
682 | res = v_fma(r, res, ones); |
683 | return v_xor(sign_mask, res); |
684 | } |
685 | //! @} |
686 | |
687 | #endif // OPENCV_HAL_INTRIN_MATH_HPP |
688 | |