1 | // Simd Abi specific implementations -*- C++ -*- |
2 | |
3 | // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_ |
26 | #define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_ |
27 | |
28 | #if __cplusplus >= 201703L |
29 | |
30 | #include <array> |
31 | #include <cmath> |
32 | #include <cstdlib> |
33 | |
34 | _GLIBCXX_SIMD_BEGIN_NAMESPACE |
35 | // _S_allbits{{{ |
36 | template <typename _V> |
37 | static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits |
38 | = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>()); |
39 | |
40 | // }}} |
41 | // _S_signmask, _S_absmask{{{ |
42 | template <typename _V, typename = _VectorTraits<_V>> |
43 | static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask |
44 | = __xor(_V() + 1, _V() - 1); |
45 | |
46 | template <typename _V, typename = _VectorTraits<_V>> |
47 | static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask |
48 | = __andnot(_S_signmask<_V>, _S_allbits<_V>); |
49 | |
50 | //}}} |
51 | // __vector_permute<Indices...>{{{ |
52 | // Index == -1 requests zeroing of the output element |
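// E.g. for a 4-element vector __x = {a, b, c, d}:
// __vector_permute<3, 2, 1, 0>(__x) -> {d, c, b, a},
// __vector_permute<-1, 0, -1, 1>(__x) -> {0, a, 0, b}.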
53 | template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
54 | constexpr _Tp |
55 | __vector_permute(_Tp __x) |
56 | { |
57 | static_assert(sizeof...(_Indices) == _TVT::_S_full_size); |
58 | return __make_vector<typename _TVT::value_type>( |
59 | (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...); |
60 | } |
61 | |
62 | // }}} |
63 | // __vector_shuffle<Indices...>{{{ |
64 | // Index == -1 requests zeroing of the output element |
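// E.g. for 4-element vectors __x = {a, b, c, d} and __y = {e, f, g, h}:
// __vector_shuffle<0, 4, 1, 5>(__x, __y) -> {a, e, b, f}.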
65 | template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
66 | constexpr _Tp |
67 | __vector_shuffle(_Tp __x, _Tp __y) |
68 | { |
69 | return _Tp{(_Indices == -1 ? 0 |
70 | : _Indices < _TVT::_S_full_size |
71 | ? __x[_Indices] |
72 | : __y[_Indices - _TVT::_S_full_size])...}; |
73 | } |
74 | |
75 | // }}} |
76 | // __make_wrapper{{{ |
77 | template <typename _Tp, typename... _Args> |
78 | _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)> |
79 | __make_wrapper(const _Args&... __args) |
80 | { return __make_vector<_Tp>(__args...); } |
81 | |
82 | // }}} |
83 | // __wrapper_bitcast{{{ |
84 | template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M, |
85 | size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)> |
86 | _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np> |
87 | __wrapper_bitcast(_SimdWrapper<_Up, _M> __x) |
88 | { |
89 | static_assert(_Np > 1); |
90 | return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data); |
91 | } |
92 | |
93 | // }}} |
94 | // __shift_elements_right{{{ |
95 | // if (__shift % 2ⁿ == 0) => the low n Bytes are correct |
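// __shift is a byte count. E.g. for a 16-byte vector of four ints
// {a, b, c, d}, __shift_elements_right<8>(__v) yields {c, d, 0, 0};
// vacated bytes are zero-filled.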
96 | template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
97 | _GLIBCXX_SIMD_INTRINSIC _Tp |
98 | __shift_elements_right(_Tp __v) |
99 | { |
100 | [[maybe_unused]] const auto __iv = __to_intrin(__v); |
101 | static_assert(__shift <= sizeof(_Tp)); |
102 | if constexpr (__shift == 0) |
103 | return __v; |
104 | else if constexpr (__shift == sizeof(_Tp)) |
105 | return _Tp(); |
106 | #if _GLIBCXX_SIMD_X86INTRIN // {{{ |
107 | else if constexpr (__have_sse && __shift == 8 |
108 | && _TVT::template _S_is<float, 4>) |
109 | return _mm_movehl_ps(__iv, __iv); |
110 | else if constexpr (__have_sse2 && __shift == 8 |
111 | && _TVT::template _S_is<double, 2>) |
112 | return _mm_unpackhi_pd(__iv, __iv); |
113 | else if constexpr (__have_sse2 && sizeof(_Tp) == 16) |
114 | return reinterpret_cast<typename _TVT::type>( |
115 | _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift)); |
116 | else if constexpr (__shift == 16 && sizeof(_Tp) == 32) |
117 | { |
118 | /*if constexpr (__have_avx && _TVT::template _S_is<double, 4>) |
119 | return _mm256_permute2f128_pd(__iv, __iv, 0x81); |
120 | else if constexpr (__have_avx && _TVT::template _S_is<float, 8>) |
121 | return _mm256_permute2f128_ps(__iv, __iv, 0x81); |
122 | else if constexpr (__have_avx) |
123 | return reinterpret_cast<typename _TVT::type>( |
124 | _mm256_permute2f128_si256(__iv, __iv, 0x81)); |
125 | else*/ |
126 | return __zero_extend(__hi128(__v)); |
127 | } |
128 | else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16) |
129 | { |
130 | const auto __vll = __vector_bitcast<_LLong>(__v); |
131 | return reinterpret_cast<typename _TVT::type>( |
132 | _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81), |
133 | __vll, __shift)); |
134 | } |
135 | else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16) |
136 | { |
137 | const auto __vll = __vector_bitcast<_LLong>(__v); |
138 | return reinterpret_cast<typename _TVT::type>( |
139 | __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift), |
140 | _mm_srli_si128(__hi128(__vll), __shift))); |
141 | } |
142 | else if constexpr (sizeof(_Tp) == 32 && __shift > 16) |
143 | return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v))); |
144 | else if constexpr (sizeof(_Tp) == 64 && __shift == 32) |
145 | return __zero_extend(__hi256(__v)); |
146 | else if constexpr (__have_avx512f && sizeof(_Tp) == 64) |
147 | { |
148 | if constexpr (__shift >= 48) |
149 | return __zero_extend( |
150 | __shift_elements_right<__shift - 48>(__extract<3, 4>(__v))); |
151 | else if constexpr (__shift >= 32) |
152 | return __zero_extend( |
153 | __shift_elements_right<__shift - 32>(__hi256(__v))); |
154 | else if constexpr (__shift % 8 == 0) |
155 | return reinterpret_cast<typename _TVT::type>( |
156 | _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v), |
157 | __shift / 8)); |
158 | else if constexpr (__shift % 4 == 0) |
159 | return reinterpret_cast<typename _TVT::type>( |
160 | _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v), |
161 | __shift / 4)); |
162 | else if constexpr (__have_avx512bw && __shift < 16) |
163 | { |
164 | const auto __vll = __vector_bitcast<_LLong>(__v); |
165 | return reinterpret_cast<typename _TVT::type>( |
166 | _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9), |
167 | __vll, __shift)); |
168 | } |
169 | else if constexpr (__have_avx512bw && __shift < 32) |
170 | { |
171 | const auto __vll = __vector_bitcast<_LLong>(__v); |
172 | return reinterpret_cast<typename _TVT::type>( |
173 | _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee), |
174 | _mm512_shuffle_i32x4(__vll, __vll, 0xf9), |
175 | __shift - 16)); |
176 | } |
177 | else |
178 | __assert_unreachable<_Tp>(); |
179 | } |
180 | /* |
181 | } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64) |
182 | return __auto_bitcast(__extract<__shift / 16, 4>(__v)); |
183 | */ |
184 | #endif // _GLIBCXX_SIMD_X86INTRIN }}} |
185 | else |
186 | { |
187 | constexpr int __chunksize = __shift % 8 == 0 ? 8 |
188 | : __shift % 4 == 0 ? 4 |
189 | : __shift % 2 == 0 ? 2 |
190 | : 1; |
191 | auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v); |
192 | using _Up = decltype(__w); |
193 | return __intrin_bitcast<_Tp>( |
194 | __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>( |
195 | [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
196 | return _Up{__chunks...}; |
197 | }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
198 | return __w[__shift / __chunksize + __i]; |
199 | })); |
200 | } |
201 | } |
202 | |
203 | // }}} |
204 | // __extract_part(_SimdWrapper<_Tp, _Np>) {{{ |
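// Splits __x into _Total parts of equal size and returns _Combine contiguous
// parts starting at part _Index. E.g. for _SimdWrapper<int, 8> holding
// {0, 1, ..., 7}, __extract_part<1, 4, 2>(__x) yields {2, 3, 4, 5}.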
205 | template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np> |
206 | _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr |
207 | _SimdWrapper<_Tp, _Np / _Total * _Combine> |
  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
209 | { |
210 | if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0) |
211 | return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x); |
212 | else |
213 | { |
214 | constexpr size_t __values_per_part = _Np / _Total; |
215 | constexpr size_t __values_to_skip = _Index * __values_per_part; |
216 | constexpr size_t __return_size = __values_per_part * _Combine; |
217 | using _R = __vector_type_t<_Tp, __return_size>; |
218 | static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp) |
219 | <= sizeof(__x), |
220 | "out of bounds __extract_part" ); |
221 | // the following assertion would ensure no "padding" to be read |
222 | // static_assert(_Total >= _Index + _Combine, "_Total must be greater |
223 | // than _Index"); |
224 | |
225 | // static_assert(__return_size * _Total == _Np, "_Np must be divisible |
226 | // by _Total"); |
227 | if (__x._M_is_constprop()) |
228 | return __generate_from_n_evaluations<__return_size, _R>( |
229 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
230 | return __x[__values_to_skip + __i]; |
231 | }); |
232 | if constexpr (_Index == 0 && _Total == 1) |
233 | return __x; |
234 | else if constexpr (_Index == 0) |
235 | return __intrin_bitcast<_R>(__as_vector(__x)); |
236 | #if _GLIBCXX_SIMD_X86INTRIN // {{{ |
237 | else if constexpr (sizeof(__x) == 32 |
238 | && __return_size * sizeof(_Tp) <= 16) |
239 | { |
240 | constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp); |
241 | if constexpr (__bytes_to_skip == 16) |
242 | return __vector_bitcast<_Tp, __return_size>( |
243 | __hi128(__as_vector(__x))); |
244 | else |
245 | return __vector_bitcast<_Tp, __return_size>( |
246 | _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)), |
247 | __lo128(__vector_bitcast<_LLong>(__x)), |
248 | __bytes_to_skip)); |
249 | } |
250 | #endif // _GLIBCXX_SIMD_X86INTRIN }}} |
251 | else if constexpr (_Index > 0 |
252 | && (__values_to_skip % __return_size != 0 |
253 | || sizeof(_R) >= 8) |
254 | && (__values_to_skip + __return_size) * sizeof(_Tp) |
255 | <= 64 |
256 | && sizeof(__x) >= 16) |
257 | return __intrin_bitcast<_R>( |
258 | __shift_elements_right<__values_to_skip * sizeof(_Tp)>( |
259 | __as_vector(__x))); |
260 | else |
261 | { |
262 | _R __r = {}; |
263 | __builtin_memcpy(&__r, |
264 | reinterpret_cast<const char*>(&__x) |
265 | + sizeof(_Tp) * __values_to_skip, |
266 | __return_size * sizeof(_Tp)); |
267 | return __r; |
268 | } |
269 | } |
270 | } |
271 | |
272 | // }}} |
273 | // __extract_part(_SimdWrapper<bool, _Np>) {{{ |
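// Bitmask variant: returns the _Np / _Total bits starting at bit
// _Index * _Np / _Total. E.g. __extract_part<1, 2>(_SimdWrapper<bool, 8> with
// bits 0b1010'0110) yields 0b1010.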
274 | template <int _Index, int _Total, int _Combine = 1, size_t _Np> |
275 | _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine> |
  __extract_part(const _SimdWrapper<bool, _Np> __x)
277 | { |
278 | static_assert(_Combine == 1, "_Combine != 1 not implemented" ); |
    // `_Np == _Np` keeps the assertion dependent so it only fires on instantiation
    static_assert(__have_avx512f && _Np == _Np);
280 | static_assert(_Total >= 2 && _Index + _Combine <= _Total && _Index >= 0); |
281 | return __x._M_data >> (_Index * _Np / _Total); |
282 | } |
283 | |
284 | // }}} |
285 | |
286 | // __vector_convert {{{ |
287 | // implementation requires an index sequence |
288 | template <typename _To, typename _From, size_t... _I> |
289 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
290 | __vector_convert(_From __a, index_sequence<_I...>) |
291 | { |
292 | using _Tp = typename _VectorTraits<_To>::value_type; |
293 | return _To{static_cast<_Tp>(__a[_I])...}; |
294 | } |
295 | |
296 | template <typename _To, typename _From, size_t... _I> |
297 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
298 | __vector_convert(_From __a, _From __b, index_sequence<_I...>) |
299 | { |
300 | using _Tp = typename _VectorTraits<_To>::value_type; |
301 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...}; |
302 | } |
303 | |
304 | template <typename _To, typename _From, size_t... _I> |
305 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
306 | __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>) |
307 | { |
308 | using _Tp = typename _VectorTraits<_To>::value_type; |
309 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
310 | static_cast<_Tp>(__c[_I])...}; |
311 | } |
312 | |
313 | template <typename _To, typename _From, size_t... _I> |
314 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
315 | __vector_convert(_From __a, _From __b, _From __c, _From __d, |
316 | index_sequence<_I...>) |
317 | { |
318 | using _Tp = typename _VectorTraits<_To>::value_type; |
319 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
320 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...}; |
321 | } |
322 | |
323 | template <typename _To, typename _From, size_t... _I> |
324 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
325 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
326 | index_sequence<_I...>) |
327 | { |
328 | using _Tp = typename _VectorTraits<_To>::value_type; |
329 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
330 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
331 | static_cast<_Tp>(__e[_I])...}; |
332 | } |
333 | |
334 | template <typename _To, typename _From, size_t... _I> |
335 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
336 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
337 | _From __f, index_sequence<_I...>) |
338 | { |
339 | using _Tp = typename _VectorTraits<_To>::value_type; |
340 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
341 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
342 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...}; |
343 | } |
344 | |
345 | template <typename _To, typename _From, size_t... _I> |
346 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
347 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
348 | _From __f, _From __g, index_sequence<_I...>) |
349 | { |
350 | using _Tp = typename _VectorTraits<_To>::value_type; |
351 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
352 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
353 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
354 | static_cast<_Tp>(__g[_I])...}; |
355 | } |
356 | |
357 | template <typename _To, typename _From, size_t... _I> |
358 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
359 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
360 | _From __f, _From __g, _From __h, index_sequence<_I...>) |
361 | { |
362 | using _Tp = typename _VectorTraits<_To>::value_type; |
363 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
364 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
365 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
366 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...}; |
367 | } |
368 | |
369 | template <typename _To, typename _From, size_t... _I> |
370 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
371 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
372 | _From __f, _From __g, _From __h, _From __i, |
373 | index_sequence<_I...>) |
374 | { |
375 | using _Tp = typename _VectorTraits<_To>::value_type; |
376 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
377 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
378 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
379 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
380 | static_cast<_Tp>(__i[_I])...}; |
381 | } |
382 | |
383 | template <typename _To, typename _From, size_t... _I> |
384 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
385 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
386 | _From __f, _From __g, _From __h, _From __i, _From __j, |
387 | index_sequence<_I...>) |
388 | { |
389 | using _Tp = typename _VectorTraits<_To>::value_type; |
390 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
391 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
392 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
393 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
394 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...}; |
395 | } |
396 | |
397 | template <typename _To, typename _From, size_t... _I> |
398 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
399 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
400 | _From __f, _From __g, _From __h, _From __i, _From __j, |
401 | _From __k, index_sequence<_I...>) |
402 | { |
403 | using _Tp = typename _VectorTraits<_To>::value_type; |
404 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
405 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
406 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
407 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
408 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., |
409 | static_cast<_Tp>(__k[_I])...}; |
410 | } |
411 | |
412 | template <typename _To, typename _From, size_t... _I> |
413 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
414 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
415 | _From __f, _From __g, _From __h, _From __i, _From __j, |
416 | _From __k, _From __l, index_sequence<_I...>) |
417 | { |
418 | using _Tp = typename _VectorTraits<_To>::value_type; |
419 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
420 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
421 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
422 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
423 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., |
424 | static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...}; |
425 | } |
426 | |
427 | template <typename _To, typename _From, size_t... _I> |
428 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
429 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
430 | _From __f, _From __g, _From __h, _From __i, _From __j, |
431 | _From __k, _From __l, _From __m, index_sequence<_I...>) |
432 | { |
433 | using _Tp = typename _VectorTraits<_To>::value_type; |
434 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
435 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
436 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
437 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
438 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., |
439 | static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., |
440 | static_cast<_Tp>(__m[_I])...}; |
441 | } |
442 | |
443 | template <typename _To, typename _From, size_t... _I> |
444 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
445 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
446 | _From __f, _From __g, _From __h, _From __i, _From __j, |
447 | _From __k, _From __l, _From __m, _From __n, |
448 | index_sequence<_I...>) |
449 | { |
450 | using _Tp = typename _VectorTraits<_To>::value_type; |
451 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
452 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
453 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
454 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
455 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., |
456 | static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., |
457 | static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...}; |
458 | } |
459 | |
460 | template <typename _To, typename _From, size_t... _I> |
461 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
462 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
463 | _From __f, _From __g, _From __h, _From __i, _From __j, |
464 | _From __k, _From __l, _From __m, _From __n, _From __o, |
465 | index_sequence<_I...>) |
466 | { |
467 | using _Tp = typename _VectorTraits<_To>::value_type; |
468 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
469 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
470 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
471 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
472 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., |
473 | static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., |
474 | static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., |
475 | static_cast<_Tp>(__o[_I])...}; |
476 | } |
477 | |
478 | template <typename _To, typename _From, size_t... _I> |
479 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
480 | __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, |
481 | _From __f, _From __g, _From __h, _From __i, _From __j, |
482 | _From __k, _From __l, _From __m, _From __n, _From __o, |
483 | _From __p, index_sequence<_I...>) |
484 | { |
485 | using _Tp = typename _VectorTraits<_To>::value_type; |
486 | return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., |
487 | static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., |
488 | static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., |
489 | static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., |
490 | static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., |
491 | static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., |
492 | static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., |
493 | static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...}; |
494 | } |
495 | |
496 | // Defer actual conversion to the overload that takes an index sequence. Note |
497 | // that this function adds zeros or drops values off the end if you don't ensure |
498 | // matching width. |
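// E.g. __vector_convert<__vector_type_t<int, 8>>(__a, __b) with two
// _SimdWrapper<short, 4> arguments converts and concatenates into an 8-int
// vector; a single argument wider than the destination has its excess
// elements dropped.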
499 | template <typename _To, typename... _From, size_t _FromSize> |
500 | _GLIBCXX_SIMD_INTRINSIC constexpr _To |
501 | __vector_convert(_SimdWrapper<_From, _FromSize>... __xs) |
502 | { |
503 | #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 |
504 | using _From0 = __first_of_pack_t<_From...>; |
505 | using _FW = _SimdWrapper<_From0, _FromSize>; |
506 | if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop())) |
507 | { |
508 | if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1)) |
509 | == 0) // power-of-two number of arguments |
510 | return __convert_x86<_To>(__as_vector(__xs)...); |
511 | else // append zeros and recurse until the above branch is taken |
512 | return __vector_convert<_To>(__xs..., _FW{}); |
513 | } |
514 | else |
515 | #endif |
516 | return __vector_convert<_To>( |
517 | __as_vector(__xs)..., |
518 | make_index_sequence<(sizeof...(__xs) == 1 ? std::min( |
519 | _VectorTraits<_To>::_S_full_size, int(_FromSize)) |
520 | : _FromSize)>()); |
521 | } |
522 | |
523 | // }}} |
524 | // __convert function{{{ |
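// Converts and concatenates all arguments into one return object. E.g.
// __convert<__vector_type_t<float, 8>>(_SimdWrapper<int, 4>(__lo),
// _SimdWrapper<int, 4>(__hi)) yields one 8-float vector; with more than one
// input, the static_assert below requires the concatenation to fit into _To.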
525 | template <typename _To, typename _From, typename... _More> |
526 | _GLIBCXX_SIMD_INTRINSIC constexpr auto |
527 | __convert(_From __v0, _More... __vs) |
528 | { |
529 | static_assert((true && ... && is_same_v<_From, _More>) ); |
530 | if constexpr (__is_vectorizable_v<_From>) |
531 | { |
532 | using _V = typename _VectorTraits<_To>::type; |
533 | using _Tp = typename _VectorTraits<_To>::value_type; |
534 | return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...}; |
535 | } |
536 | else if constexpr (__is_vector_type_v<_From>) |
537 | return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...); |
538 | else // _SimdWrapper arguments |
539 | { |
540 | constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More)); |
541 | if constexpr (__is_vectorizable_v<_To>) |
542 | return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...); |
543 | else if constexpr (!__is_vector_type_v<_To>) |
544 | return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...)); |
545 | else |
546 | { |
547 | static_assert( |
548 | sizeof...(_More) == 0 |
549 | || _VectorTraits<_To>::_S_full_size >= __input_size, |
550 | "__convert(...) requires the input to fit into the output" ); |
551 | return __vector_convert<_To>(__v0, __vs...); |
552 | } |
553 | } |
554 | } |
555 | |
556 | // }}} |
557 | // __convert_all{{{ |
558 | // Converts __v into array<_To, N>, where N is _NParts if non-zero or |
559 | // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v). |
// Note: this function may return fewer than all converted elements
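// E.g. with __v an 8-element short vector and _To = __vector_type_t<int, 4>,
// __convert_all<_To>(__v) returns array<_To, 2>{low four elements converted,
// high four elements converted}.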
561 | template <typename _To, |
	  size_t _NParts = 0, // allows converting fewer or more parts than
			      // all (only the last _To may be partially
			      // filled)
564 | size_t _Offset = 0, // where to start, # of elements (not Bytes or |
565 | // Parts) |
566 | typename _From, typename _FromVT = _VectorTraits<_From>> |
567 | _GLIBCXX_SIMD_INTRINSIC auto |
568 | __convert_all(_From __v) |
569 | { |
570 | if constexpr (is_arithmetic_v<_To> && _NParts != 1) |
571 | { |
572 | static_assert(_Offset < _FromVT::_S_full_size); |
573 | constexpr auto _Np |
574 | = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts; |
575 | return __generate_from_n_evaluations<_Np, array<_To, _Np>>( |
576 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
577 | return static_cast<_To>(__v[__i + _Offset]); |
578 | }); |
579 | } |
580 | else |
581 | { |
582 | static_assert(__is_vector_type_v<_To>); |
583 | using _ToVT = _VectorTraits<_To>; |
584 | if constexpr (__is_vector_type_v<_From>) |
585 | return __convert_all<_To, _NParts>(__as_wrapper(__v)); |
586 | else if constexpr (_NParts == 1) |
587 | { |
588 | static_assert(_Offset % _ToVT::_S_full_size == 0); |
589 | return array<_To, 1>{__vector_convert<_To>( |
590 | __extract_part<_Offset / _ToVT::_S_full_size, |
591 | __div_roundup(_FromVT::_S_partial_width, |
592 | _ToVT::_S_full_size)>(__v))}; |
593 | } |
594 | #if _GLIBCXX_SIMD_X86INTRIN // {{{ |
595 | else if constexpr (!__have_sse4_1 && _Offset == 0 |
596 | && is_integral_v<typename _FromVT::value_type> |
597 | && sizeof(typename _FromVT::value_type) |
598 | < sizeof(typename _ToVT::value_type) |
599 | && !(sizeof(typename _FromVT::value_type) == 4 |
600 | && is_same_v<typename _ToVT::value_type, double>)) |
601 | { |
602 | using _ToT = typename _ToVT::value_type; |
603 | using _FromT = typename _FromVT::value_type; |
604 | constexpr size_t _Np |
605 | = _NParts != 0 |
606 | ? _NParts |
607 | : (_FromVT::_S_partial_width / _ToVT::_S_full_size); |
608 | using _R = array<_To, _Np>; |
609 | // __adjust modifies its input to have _Np (use _SizeConstant) |
610 | // entries so that no unnecessary intermediate conversions are |
611 | // requested and, more importantly, no intermediate conversions are |
612 | // missing |
613 | [[maybe_unused]] auto __adjust |
614 | = [](auto __n, |
615 | auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> { |
616 | return __vector_bitcast<_FromT, decltype(__n)::value>(__vv); |
617 | }; |
618 | [[maybe_unused]] const auto __vi = __to_intrin(__v); |
619 | auto&& __make_array |
620 | = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
621 | if constexpr (_Np == 1) |
622 | return _R{__intrin_bitcast<_To>(__x0)}; |
623 | else |
624 | return _R{__intrin_bitcast<_To>(__x0), |
625 | __intrin_bitcast<_To>(__x1)}; |
626 | }; |
627 | |
628 | if constexpr (_Np == 0) |
629 | return _R{}; |
630 | else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2) |
631 | { |
632 | static_assert(is_integral_v<_FromT>); |
633 | static_assert(is_integral_v<_ToT>); |
634 | if constexpr (is_unsigned_v<_FromT>) |
635 | return __make_array(_mm_unpacklo_epi8(__vi, __m128i()), |
636 | _mm_unpackhi_epi8(__vi, __m128i())); |
637 | else |
638 | return __make_array( |
639 | _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8), |
640 | _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8)); |
641 | } |
642 | else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4) |
643 | { |
644 | static_assert(is_integral_v<_FromT>); |
645 | if constexpr (is_floating_point_v<_ToT>) |
646 | { |
647 | const auto __ints |
648 | = __convert_all<__vector_type16_t<int>, _Np>( |
649 | __adjust(_SizeConstant<_Np * 4>(), __v)); |
650 | return __generate_from_n_evaluations<_Np, _R>( |
651 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
652 | return __vector_convert<_To>(__as_wrapper(__ints[__i])); |
653 | }); |
654 | } |
655 | else if constexpr (is_unsigned_v<_FromT>) |
656 | return __make_array(_mm_unpacklo_epi16(__vi, __m128i()), |
657 | _mm_unpackhi_epi16(__vi, __m128i())); |
658 | else |
659 | return __make_array( |
660 | _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16), |
661 | _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16)); |
662 | } |
663 | else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 |
664 | && is_integral_v<_FromT> && is_integral_v<_ToT>) |
665 | { |
666 | if constexpr (is_unsigned_v<_FromT>) |
667 | return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), |
668 | _mm_unpackhi_epi32(__vi, __m128i())); |
669 | else |
670 | return __make_array( |
671 | _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), |
672 | _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); |
673 | } |
685 | else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4 |
686 | && is_signed_v<_FromT>) |
687 | { |
688 | const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi), |
689 | _mm_unpackhi_epi8(__vi, __vi)}; |
690 | const __vector_type_t<int, 4> __vvvv[4] = { |
		__vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
		__vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
		__vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
		__vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
695 | if constexpr (sizeof(_ToT) == 4) |
696 | return __generate_from_n_evaluations<_Np, _R>( |
697 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
698 | return __vector_convert<_To>( |
699 | _SimdWrapper<int, 4>(__vvvv[__i] >> 24)); |
700 | }); |
701 | else if constexpr (is_integral_v<_ToT>) |
702 | return __generate_from_n_evaluations<_Np, _R>( |
703 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
704 | const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); |
705 | const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); |
706 | return __vector_bitcast<_ToT>( |
707 | __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) |
708 | : _mm_unpackhi_epi32(__sx32, __signbits)); |
709 | }); |
710 | else |
711 | return __generate_from_n_evaluations<_Np, _R>( |
712 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
713 | const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24; |
714 | return __vector_convert<_To>( |
715 | __i % 2 == 0 ? __int4 |
716 | : _SimdWrapper<int, 4>( |
			       _mm_unpackhi_epi64(__to_intrin(__int4),
						  __to_intrin(__int4))));
719 | }); |
720 | } |
721 | else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4) |
722 | { |
723 | const auto __shorts = __convert_all<__vector_type16_t< |
724 | conditional_t<is_signed_v<_FromT>, short, unsigned short>>>( |
725 | __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v)); |
726 | return __generate_from_n_evaluations<_Np, _R>( |
727 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
728 | return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; |
729 | }); |
730 | } |
731 | else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8 |
732 | && is_signed_v<_FromT> && is_integral_v<_ToT>) |
733 | { |
734 | const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi), |
735 | _mm_unpackhi_epi16(__vi, __vi)}; |
736 | const __vector_type16_t<int> __vvvv[4] |
		= {__vector_bitcast<int>(
		     _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
					_mm_srai_epi32(__vv[0], 31))),
		   __vector_bitcast<int>(
		     _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
					_mm_srai_epi32(__vv[0], 31))),
		   __vector_bitcast<int>(
		     _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
					_mm_srai_epi32(__vv[1], 31))),
		   __vector_bitcast<int>(
		     _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
					_mm_srai_epi32(__vv[1], 31)))};
749 | return __generate_from_n_evaluations<_Np, _R>( |
750 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
751 | return __vector_bitcast<_ToT>(__vvvv[__i]); |
752 | }); |
753 | } |
754 | else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8) |
755 | { |
756 | const auto __ints |
757 | = __convert_all<__vector_type16_t<conditional_t< |
758 | is_signed_v<_FromT> || is_floating_point_v<_ToT>, int, |
759 | unsigned int>>>( |
760 | __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v)); |
761 | return __generate_from_n_evaluations<_Np, _R>( |
762 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
763 | return __convert_all<_To>(__ints[__i / 2])[__i % 2]; |
764 | }); |
765 | } |
766 | else |
767 | __assert_unreachable<_To>(); |
768 | } |
769 | #endif // _GLIBCXX_SIMD_X86INTRIN }}} |
770 | else if constexpr ((_FromVT::_S_partial_width - _Offset) |
771 | > _ToVT::_S_full_size) |
772 | { |
773 | /* |
774 | static_assert( |
775 | (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == |
776 | 0, |
777 | "__convert_all only supports power-of-2 number of elements. |
778 | Otherwise " "the return type cannot be array<_To, N>."); |
779 | */ |
780 | constexpr size_t _NTotal |
781 | = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size; |
782 | constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts; |
783 | static_assert( |
784 | _Np <= _NTotal |
785 | || (_Np == _NTotal + 1 |
786 | && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size |
787 | > 0)); |
788 | using _R = array<_To, _Np>; |
789 | if constexpr (_Np == 1) |
790 | return _R{__vector_convert<_To>( |
791 | __extract_part<_Offset, _FromVT::_S_partial_width, |
792 | _ToVT::_S_full_size>(__v))}; |
793 | else |
794 | return __generate_from_n_evaluations<_Np, _R>( |
795 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
796 | auto __part |
797 | = __extract_part<__i * _ToVT::_S_full_size + _Offset, |
798 | _FromVT::_S_partial_width, |
799 | _ToVT::_S_full_size>(__v); |
800 | return __vector_convert<_To>(__part); |
801 | }); |
802 | } |
803 | else if constexpr (_Offset == 0) |
804 | return array<_To, 1>{__vector_convert<_To>(__v)}; |
805 | else |
806 | return array<_To, 1>{__vector_convert<_To>( |
807 | __extract_part<_Offset, _FromVT::_S_partial_width, |
808 | _FromVT::_S_partial_width - _Offset>(__v))}; |
809 | } |
810 | } |
811 | |
812 | // }}} |
813 | |
814 | // _GnuTraits {{{ |
815 | template <typename _Tp, typename _Mp, typename _Abi, size_t _Np> |
816 | struct _GnuTraits |
817 | { |
818 | using _IsValid = true_type; |
819 | using _SimdImpl = typename _Abi::_SimdImpl; |
820 | using _MaskImpl = typename _Abi::_MaskImpl; |
821 | |
822 | // simd and simd_mask member types {{{ |
823 | using _SimdMember = _SimdWrapper<_Tp, _Np>; |
824 | using _MaskMember = _SimdWrapper<_Mp, _Np>; |
825 | static constexpr size_t _S_simd_align = alignof(_SimdMember); |
826 | static constexpr size_t _S_mask_align = alignof(_MaskMember); |
827 | |
828 | // }}} |
829 | // size metadata {{{ |
830 | static constexpr size_t _S_full_size = _SimdMember::_S_full_size; |
831 | static constexpr bool _S_is_partial = _SimdMember::_S_is_partial; |
832 | |
833 | // }}} |
834 | // _SimdBase / base class for simd, providing extra conversions {{{ |
835 | struct _SimdBase2 |
836 | { |
837 | explicit |
838 | operator __intrinsic_type_t<_Tp, _Np>() const |
839 | { return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data); } |
840 | |
841 | explicit |
842 | operator __vector_type_t<_Tp, _Np>() const |
843 | { return static_cast<const simd<_Tp, _Abi>*>(this)->_M_data.__builtin(); } |
844 | }; |
845 | |
846 | struct _SimdBase1 |
847 | { |
848 | explicit |
849 | operator __intrinsic_type_t<_Tp, _Np>() const |
850 | { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); } |
851 | }; |
852 | |
853 | using _SimdBase = conditional_t< |
854 | is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, |
855 | _SimdBase1, _SimdBase2>; |
856 | |
857 | // }}} |
858 | // _MaskBase {{{ |
859 | struct _MaskBase2 |
860 | { |
861 | explicit |
862 | operator __intrinsic_type_t<_Tp, _Np>() const |
      { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data.__intrin(); }
864 | |
865 | explicit |
866 | operator __vector_type_t<_Tp, _Np>() const |
867 | { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data; } |
868 | }; |
869 | |
870 | struct _MaskBase1 |
871 | { |
872 | explicit |
873 | operator __intrinsic_type_t<_Tp, _Np>() const |
874 | { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); } |
875 | }; |
876 | |
877 | using _MaskBase = conditional_t< |
878 | is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, |
879 | _MaskBase1, _MaskBase2>; |
880 | |
881 | // }}} |
882 | // _MaskCastType {{{ |
883 | // parameter type of one explicit simd_mask constructor |
884 | class _MaskCastType |
885 | { |
886 | using _Up = __intrinsic_type_t<_Tp, _Np>; |
887 | _Up _M_data; |
888 | |
889 | public: |
890 | _MaskCastType(_Up __x) : _M_data(__x) {} |
891 | |
892 | operator _MaskMember() const { return _M_data; } |
893 | }; |
894 | |
895 | // }}} |
896 | // _SimdCastType {{{ |
897 | // parameter type of one explicit simd constructor |
898 | class _SimdCastType1 |
899 | { |
900 | using _Ap = __intrinsic_type_t<_Tp, _Np>; |
901 | _SimdMember _M_data; |
902 | |
903 | public: |
904 | constexpr |
905 | _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} |
906 | |
907 | constexpr |
908 | operator _SimdMember() const { return _M_data; } |
909 | }; |
910 | |
911 | class _SimdCastType2 |
912 | { |
913 | using _Ap = __intrinsic_type_t<_Tp, _Np>; |
914 | using _Bp = __vector_type_t<_Tp, _Np>; |
915 | _SimdMember _M_data; |
916 | |
917 | public: |
918 | constexpr |
919 | _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} |
920 | |
921 | constexpr |
922 | _SimdCastType2(_Bp __b) : _M_data(__b) {} |
923 | |
924 | constexpr |
925 | operator _SimdMember() const { return _M_data; } |
926 | }; |
927 | |
928 | using _SimdCastType = conditional_t< |
929 | is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, |
930 | _SimdCastType1, _SimdCastType2>; |
931 | //}}} |
932 | }; |
933 | |
934 | // }}} |
935 | struct _CommonImplX86; |
936 | struct _CommonImplNeon; |
937 | struct _CommonImplBuiltin; |
938 | template <typename _Abi> struct _SimdImplBuiltin; |
939 | template <typename _Abi> struct _MaskImplBuiltin; |
940 | template <typename _Abi> struct _SimdImplX86; |
941 | template <typename _Abi> struct _MaskImplX86; |
942 | template <typename _Abi> struct _SimdImplNeon; |
943 | template <typename _Abi> struct _MaskImplNeon; |
944 | template <typename _Abi> struct _SimdImplPpc; |
945 | template <typename _Abi> struct _MaskImplPpc; |
946 | |
947 | // simd_abi::_VecBuiltin {{{ |
948 | template <int _UsedBytes> |
949 | struct simd_abi::_VecBuiltin |
950 | { |
951 | template <typename _Tp> |
952 | static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); |
953 | |
954 | // validity traits {{{ |
955 | struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; |
956 | |
957 | template <typename _Tp> |
958 | struct _IsValidSizeFor |
959 | : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 |
960 | && _UsedBytes % sizeof(_Tp) == 0 |
961 | && _UsedBytes <= __vectorized_sizeof<_Tp>() |
962 | && (!__have_avx512f || _UsedBytes <= 32))> {}; |
963 | |
964 | template <typename _Tp> |
965 | struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>, |
966 | _IsValidSizeFor<_Tp>> {}; |
967 | |
968 | template <typename _Tp> |
969 | static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; |
970 | |
971 | // }}} |
972 | // _SimdImpl/_MaskImpl {{{ |
973 | #if _GLIBCXX_SIMD_X86INTRIN |
974 | using _CommonImpl = _CommonImplX86; |
975 | using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>; |
976 | using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>; |
977 | #elif _GLIBCXX_SIMD_HAVE_NEON |
978 | using _CommonImpl = _CommonImplNeon; |
979 | using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>; |
980 | using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>; |
981 | #else |
982 | using _CommonImpl = _CommonImplBuiltin; |
983 | #ifdef __ALTIVEC__ |
984 | using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>; |
985 | using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>; |
986 | #else |
987 | using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>; |
988 | using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>; |
989 | #endif |
990 | #endif |
991 | |
992 | // }}} |
993 | // __traits {{{ |
994 | template <typename _Tp> |
995 | using _MaskValueType = __int_for_sizeof_t<_Tp>; |
996 | |
997 | template <typename _Tp> |
998 | using __traits |
999 | = conditional_t<_S_is_valid_v<_Tp>, |
1000 | _GnuTraits<_Tp, _MaskValueType<_Tp>, |
1001 | _VecBuiltin<_UsedBytes>, _S_size<_Tp>>, |
1002 | _InvalidTraits>; |
1003 | |
1004 | //}}} |
1005 | // size metadata {{{ |
1006 | template <typename _Tp> |
1007 | static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; |
1008 | |
1009 | template <typename _Tp> |
1010 | static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; |
1011 | |
1012 | // }}} |
1013 | // implicit masks {{{ |
1014 | template <typename _Tp> |
1015 | using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>; |
1016 | |
1017 | template <typename _Tp> |
1018 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
1019 | _S_implicit_mask() |
1020 | { |
1021 | using _UV = typename _MaskMember<_Tp>::_BuiltinType; |
1022 | if constexpr (!_MaskMember<_Tp>::_S_is_partial) |
1023 | return ~_UV(); |
1024 | else |
1025 | { |
1026 | constexpr auto __size = _S_size<_Tp>; |
1027 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __r |
1028 | = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA |
1029 | { return __i < __size ? -1 : 0; }); |
1030 | return __r; |
1031 | } |
1032 | } |
1033 | |
1034 | template <typename _Tp> |
1035 | _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>> |
1036 | _S_implicit_mask_intrin() |
1037 | { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); } |
1038 | |
1039 | template <typename _TW, typename _TVT = _VectorTraits<_TW>> |
1040 | _GLIBCXX_SIMD_INTRINSIC static constexpr _TW |
1041 | _S_masked(_TW __x) |
1042 | { |
1043 | using _Tp = typename _TVT::value_type; |
1044 | if constexpr (!_MaskMember<_Tp>::_S_is_partial) |
1045 | return __x; |
1046 | else |
1047 | return __and(__as_vector(__x), |
1048 | __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>())); |
1049 | } |
1050 | |
1051 | template <typename _TW, typename _TVT = _VectorTraits<_TW>> |
1052 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
1053 | __make_padding_nonzero(_TW __x) |
1054 | { |
1055 | using _Tp = typename _TVT::value_type; |
1056 | if constexpr (!_S_is_partial<_Tp>) |
1057 | return __x; |
1058 | else |
1059 | { |
1060 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask |
1061 | = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()); |
1062 | if constexpr (is_integral_v<_Tp>) |
1063 | return __or(__x, ~__implicit_mask); |
1064 | else |
1065 | { |
1066 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __one |
1067 | = __andnot(__implicit_mask, |
1068 | __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))); |
1069 | // it's not enough to return `x | 1_in_padding` because the |
1070 | // padding in x might be inf or nan (independent of |
1071 | // __FINITE_MATH_ONLY__, because it's about padding bits) |
1072 | return __or(__and(__x, __implicit_mask), __one); |
1073 | } |
1074 | } |
1075 | } |
1076 | // }}} |
1077 | }; |
1078 | |
1079 | // }}} |
1080 | // simd_abi::_VecBltnBtmsk {{{ |
1081 | template <int _UsedBytes> |
1082 | struct simd_abi::_VecBltnBtmsk |
1083 | { |
1084 | template <typename _Tp> |
1085 | static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); |
1086 | |
1087 | // validity traits {{{ |
1088 | struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; |
1089 | |
1090 | template <typename _Tp> |
1091 | struct _IsValidSizeFor |
1092 | : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 |
1093 | && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64 |
1094 | && (_UsedBytes > 32 || __have_avx512vl))> {}; |
1095 | |
  // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is also
  // required.
1098 | template <typename _Tp> |
1099 | struct _IsValid |
1100 | : conjunction< |
1101 | _IsValidAbiTag, __bool_constant<__have_avx512f>, |
1102 | __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>, |
1103 | __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>, |
1104 | _IsValidSizeFor<_Tp>> {}; |
1105 | |
1106 | template <typename _Tp> |
1107 | static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; |
1108 | |
1109 | // }}} |
1110 | // simd/_MaskImpl {{{ |
1111 | #if _GLIBCXX_SIMD_X86INTRIN |
1112 | using _CommonImpl = _CommonImplX86; |
1113 | using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>; |
1114 | using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>; |
1115 | #else |
1116 | template <int> |
1117 | struct _MissingImpl; |
1118 | |
1119 | using _CommonImpl = _MissingImpl<_UsedBytes>; |
1120 | using _SimdImpl = _MissingImpl<_UsedBytes>; |
1121 | using _MaskImpl = _MissingImpl<_UsedBytes>; |
1122 | #endif |
1123 | |
1124 | // }}} |
1125 | // __traits {{{ |
1126 | template <typename _Tp> |
1127 | using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>; |
1128 | |
1129 | template <typename _Tp> |
1130 | using __traits = conditional_t< |
1131 | _S_is_valid_v<_Tp>, |
1132 | _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>, |
1133 | _InvalidTraits>; |
1134 | |
1135 | //}}} |
1136 | // size metadata {{{ |
1137 | template <typename _Tp> |
1138 | static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; |
1139 | template <typename _Tp> |
1140 | static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; |
1141 | |
1142 | // }}} |
1143 | // implicit mask {{{ |
1144 | private: |
1145 | template <typename _Tp> |
1146 | using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>; |
1147 | |
1148 | public: |
1149 | template <size_t _Np> |
1150 | _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np> |
1151 | __implicit_mask_n() |
1152 | { |
1153 | using _Tp = __bool_storage_member_type_t<_Np>; |
1154 | return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp(); |
1155 | } |
1156 | |
1157 | template <typename _Tp> |
1158 | _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp> |
1159 | _S_implicit_mask() |
1160 | { return __implicit_mask_n<_S_size<_Tp>>(); } |
1161 | |
1162 | template <typename _Tp> |
1163 | _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>> |
1164 | _S_implicit_mask_intrin() |
1165 | { return __implicit_mask_n<_S_size<_Tp>>(); } |
1166 | |
1167 | template <typename _Tp, size_t _Np> |
1168 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1169 | _S_masked(_SimdWrapper<_Tp, _Np> __x) |
1170 | { |
1171 | if constexpr (is_same_v<_Tp, bool>) |
1172 | if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0) |
1173 | return _MaskImpl::_S_bit_and( |
1174 | __x, _SimdWrapper<_Tp, _Np>( |
1175 | __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1))); |
1176 | else |
1177 | return __x; |
1178 | else |
1179 | return _S_masked(__x._M_data); |
1180 | } |
1181 | |
1182 | template <typename _TV> |
1183 | _GLIBCXX_SIMD_INTRINSIC static constexpr _TV |
1184 | _S_masked(_TV __x) |
1185 | { |
1186 | using _Tp = typename _VectorTraits<_TV>::value_type; |
1187 | static_assert( |
1188 | !__is_bitmask_v<_TV>, |
1189 | "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't " |
1190 | "know the number of elements. Use _SimdWrapper<bool, N> instead." ); |
1191 | if constexpr (_S_is_partial<_Tp>) |
1192 | { |
1193 | constexpr size_t _Np = _S_size<_Tp>; |
1194 | return __make_dependent_t<_TV, _CommonImpl>::_S_blend( |
1195 | _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(), |
1196 | _SimdWrapper<_Tp, _Np>(__x)); |
1197 | } |
1198 | else |
1199 | return __x; |
1200 | } |
1201 | |
1202 | template <typename _TV, typename _TVT = _VectorTraits<_TV>> |
1203 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
1204 | __make_padding_nonzero(_TV __x) |
1205 | { |
1206 | using _Tp = typename _TVT::value_type; |
1207 | if constexpr (!_S_is_partial<_Tp>) |
1208 | return __x; |
1209 | else |
1210 | { |
1211 | constexpr size_t _Np = _S_size<_Tp>; |
1212 | if constexpr (is_integral_v<typename _TVT::value_type>) |
1213 | return __x |
1214 | | __generate_vector<_Tp, _S_full_size<_Tp>>( |
1215 | [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp { |
1216 | if (__i < _Np) |
1217 | return 0; |
1218 | else |
1219 | return 1; |
1220 | }); |
1221 | else |
1222 | return __make_dependent_t<_TV, _CommonImpl>::_S_blend( |
1223 | _S_implicit_mask<_Tp>(), |
1224 | _SimdWrapper<_Tp, _Np>( |
1225 | __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))), |
1226 | _SimdWrapper<_Tp, _Np>(__x)) |
1227 | ._M_data; |
1228 | } |
1229 | } |
1230 | |
1231 | // }}} |
1232 | }; |
1233 | |
1234 | //}}} |
1235 | // _CommonImplBuiltin {{{ |
1236 | struct _CommonImplBuiltin |
1237 | { |
1238 | // _S_converts_via_decomposition{{{ |
1239 | // This lists all cases where a __vector_convert needs to fall back to |
1240 | // conversion of individual scalars (i.e. decompose the input vector into |
1241 | // scalars, convert, compose output vector). In those cases, _S_masked_load & |
1242 | // _S_masked_store prefer to use the _S_bit_iteration implementation. |
1243 | template <typename _From, typename _To, size_t _ToSize> |
1244 | static inline constexpr bool __converts_via_decomposition_v |
1245 | = sizeof(_From) != sizeof(_To); |
1246 | |
1247 | // }}} |
1248 | // _S_load{{{ |
1249 | template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)> |
1250 | _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np> |
1251 | _S_load(const void* __p) |
1252 | { |
1253 | static_assert(_Np > 1); |
1254 | static_assert(_Bytes % sizeof(_Tp) == 0); |
1255 | using _Rp = __vector_type_t<_Tp, _Np>; |
1256 | if constexpr (sizeof(_Rp) == _Bytes) |
1257 | { |
1258 | _Rp __r; |
1259 | __builtin_memcpy(&__r, __p, _Bytes); |
1260 | return __r; |
1261 | } |
1262 | else |
1263 | { |
1264 | #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 |
1265 | using _Up = conditional_t< |
1266 | is_integral_v<_Tp>, |
1267 | conditional_t<_Bytes % 4 == 0, |
1268 | conditional_t<_Bytes % 8 == 0, long long, int>, |
1269 | conditional_t<_Bytes % 2 == 0, short, signed char>>, |
1270 | conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp, |
1271 | double>>; |
1272 | using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>; |
1273 | if constexpr (sizeof(_V) != sizeof(_Rp)) |
1274 | { // on i386 with 4 < _Bytes <= 8 |
1275 | _Rp __r{}; |
1276 | __builtin_memcpy(&__r, __p, _Bytes); |
1277 | return __r; |
1278 | } |
1279 | else |
1280 | #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 |
1281 | using _V = _Rp; |
1282 | #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 |
1283 | { |
1284 | _V __r{}; |
1285 | static_assert(_Bytes <= sizeof(_V)); |
1286 | __builtin_memcpy(&__r, __p, _Bytes); |
1287 | return reinterpret_cast<_Rp>(__r); |
1288 | } |
1289 | } |
1290 | } |
1291 | |
1292 | // }}} |
1293 | // _S_store {{{ |
1294 | template <size_t _ReqBytes = 0, typename _TV> |
1295 | _GLIBCXX_SIMD_INTRINSIC static void |
1296 | _S_store(_TV __x, void* __addr) |
1297 | { |
1298 | constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes; |
1299 | static_assert(sizeof(__x) >= _Bytes); |
1300 | |
1301 | if constexpr (__is_vector_type_v<_TV>) |
1302 | { |
1303 | using _Tp = typename _VectorTraits<_TV>::value_type; |
1304 | constexpr size_t _Np = _Bytes / sizeof(_Tp); |
1305 | static_assert(_Np * sizeof(_Tp) == _Bytes); |
1306 | |
1307 | #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 |
1308 | using _Up = conditional_t< |
1309 | (is_integral_v<_Tp> || _Bytes < 4), |
1310 | conditional_t<(sizeof(__x) > sizeof(long long)), long long, _Tp>, |
1311 | float>; |
1312 | const auto __v = __vector_bitcast<_Up>(__x); |
1313 | #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 |
1314 | const __vector_type_t<_Tp, _Np> __v = __x; |
1315 | #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 |
1316 | |
1317 | if constexpr ((_Bytes & (_Bytes - 1)) != 0) |
1318 | { |
	      constexpr size_t _MoreBytes = std::__bit_ceil(_Bytes);
1320 | alignas(decltype(__v)) char __tmp[_MoreBytes]; |
1321 | __builtin_memcpy(__tmp, &__v, _MoreBytes); |
1322 | __builtin_memcpy(__addr, __tmp, _Bytes); |
1323 | } |
1324 | else |
1325 | __builtin_memcpy(__addr, &__v, _Bytes); |
1326 | } |
1327 | else |
1328 | __builtin_memcpy(__addr, &__x, _Bytes); |
1329 | } |
1330 | |
1331 | template <typename _Tp, size_t _Np> |
1332 | _GLIBCXX_SIMD_INTRINSIC static void |
1333 | _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) |
1334 | { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); } |
1335 | |
1336 | // }}} |
1337 | // _S_store_bool_array(_BitMask) {{{ |
1338 | template <size_t _Np, bool _Sanitized> |
1339 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
1340 | _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) |
1341 | { |
1342 | if constexpr (_Np == 1) |
1343 | __mem[0] = __x[0]; |
1344 | else if (__builtin_is_constant_evaluated()) |
1345 | { |
1346 | for (size_t __i = 0; __i < _Np; ++__i) |
1347 | __mem[__i] = __x[__i]; |
1348 | } |
1349 | else if constexpr (_Np == 2) |
1350 | { |
1351 | short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101; |
1352 | _S_store<_Np>(__bool2, __mem); |
1353 | } |
1354 | else if constexpr (_Np == 3) |
1355 | { |
1356 | int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101; |
1357 | _S_store<_Np>(__bool3, __mem); |
1358 | } |
1359 | else |
1360 | { |
	  __execute_n_times<__div_roundup(_Np, 4)>(
1362 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1363 | constexpr int __offset = __i * 4; |
1364 | constexpr int __remaining = _Np - __offset; |
1365 | if constexpr (__remaining > 4 && __remaining <= 7) |
1366 | { |
1367 | const _ULLong __bool7 |
1368 | = (__x.template _M_extract<__offset>()._M_to_bits() |
1369 | * 0x40810204081ULL) |
1370 | & 0x0101010101010101ULL; |
1371 | _S_store<__remaining>(__bool7, __mem + __offset); |
1372 | } |
1373 | else if constexpr (__remaining >= 4) |
1374 | { |
1375 | int __bits = __x.template _M_extract<__offset>()._M_to_bits(); |
1376 | if constexpr (__remaining > 7) |
1377 | __bits &= 0xf; |
1378 | const int __bool4 = (__bits * 0x204081) & 0x01010101; |
		    _S_store<4>(__bool4, __mem + __offset);
1380 | } |
1381 | }); |
1382 | } |
1383 | } |
1384 | |
1385 | // }}} |
1386 | // _S_blend{{{ |
1387 | template <typename _Tp, size_t _Np> |
1388 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
1389 | _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, |
1390 | _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) |
1391 | { return __k._M_data ? __at1._M_data : __at0._M_data; } |
1392 | |
1393 | // }}} |
1394 | }; |
1395 | |
1396 | // }}} |
1397 | // _SimdImplBuiltin {{{1 |
1398 | template <typename _Abi> |
1399 | struct _SimdImplBuiltin |
1400 | { |
1401 | // member types {{{2 |
1402 | template <typename _Tp> |
1403 | static constexpr size_t _S_max_store_size = 16; |
1404 | |
1405 | using abi_type = _Abi; |
1406 | |
1407 | template <typename _Tp> |
1408 | using _TypeTag = _Tp*; |
1409 | |
1410 | template <typename _Tp> |
1411 | using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; |
1412 | |
1413 | template <typename _Tp> |
1414 | using _MaskMember = typename _Abi::template _MaskMember<_Tp>; |
1415 | |
1416 | template <typename _Tp> |
1417 | static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; |
1418 | |
1419 | template <typename _Tp> |
1420 | static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; |
1421 | |
1422 | using _CommonImpl = typename _Abi::_CommonImpl; |
1423 | using _SuperImpl = typename _Abi::_SimdImpl; |
1424 | using _MaskImpl = typename _Abi::_MaskImpl; |
1425 | |
1426 | // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2 |
1427 | template <typename _Tp, size_t _Np> |
1428 | _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> |
1429 | _M_make_simd(_SimdWrapper<_Tp, _Np> __x) |
1430 | { return {__private_init, __x}; } |
1431 | |
1432 | template <typename _Tp, size_t _Np> |
1433 | _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> |
1434 | _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x) |
1435 | { return {__private_init, __vector_bitcast<_Tp>(__x)}; } |
1436 | |
1437 | // _S_broadcast {{{2 |
1438 | template <typename _Tp> |
1439 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> |
1440 | _S_broadcast(_Tp __x) noexcept |
1441 | { return __vector_broadcast<_S_full_size<_Tp>>(__x); } |
1442 | |
1443 | // _S_generator {{{2 |
1444 | template <typename _Fp, typename _Tp> |
1445 | inline static constexpr _SimdMember<_Tp> |
1446 | _S_generator(_Fp&& __gen, _TypeTag<_Tp>) |
1447 | { |
1448 | return __generate_vector<_Tp, _S_full_size<_Tp>>( |
1449 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1450 | if constexpr (__i < _S_size<_Tp>) |
1451 | return __gen(__i); |
1452 | else |
1453 | return 0; |
1454 | }); |
1455 | } |
1456 | |
1457 | // _S_load {{{2 |
1458 | template <typename _Tp, typename _Up> |
1459 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> |
1460 | _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept |
1461 | { |
1462 | constexpr size_t _Np = _S_size<_Tp>; |
1463 | constexpr size_t __max_load_size |
1464 | = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64 |
1465 | : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32 |
1466 | : 16; |
1467 | constexpr size_t __bytes_to_load = sizeof(_Up) * _Np; |
1468 | if (__builtin_is_constant_evaluated()) |
1469 | return __generate_vector<_Tp, _S_full_size<_Tp>>( |
1470 | [&](auto __i) constexpr { |
1471 | return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); |
1472 | }); |
1473 | else if constexpr (sizeof(_Up) > 8) |
1474 | return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>( |
1475 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1476 | return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); |
1477 | }); |
1478 | else if constexpr (is_same_v<_Up, _Tp>) |
1479 | return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, |
1480 | _Np * sizeof(_Tp)>(__mem); |
1481 | else if constexpr (__bytes_to_load <= __max_load_size) |
1482 | return __convert<_SimdMember<_Tp>>( |
1483 | _CommonImpl::template _S_load<_Up, _Np>(__mem)); |
1484 | else if constexpr (__bytes_to_load % __max_load_size == 0) |
1485 | { |
1486 | constexpr size_t __n_loads = __bytes_to_load / __max_load_size; |
1487 | constexpr size_t __elements_per_load = _Np / __n_loads; |
1488 | return __call_with_n_evaluations<__n_loads>( |
1489 | [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1490 | return __convert<_SimdMember<_Tp>>(__uncvted...); |
1491 | }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1492 | return _CommonImpl::template _S_load<_Up, __elements_per_load>( |
1493 | __mem + __i * __elements_per_load); |
1494 | }); |
1495 | } |
1496 | else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 |
1497 | && __max_load_size > 16) |
1498 | { // e.g. int[] -> <char, 12> with AVX2 |
1499 | constexpr size_t __n_loads |
1500 | = __bytes_to_load / (__max_load_size / 2); |
1501 | constexpr size_t __elements_per_load = _Np / __n_loads; |
1502 | return __call_with_n_evaluations<__n_loads>( |
1503 | [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1504 | return __convert<_SimdMember<_Tp>>(__uncvted...); |
1505 | }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1506 | return _CommonImpl::template _S_load<_Up, __elements_per_load>( |
1507 | __mem + __i * __elements_per_load); |
1508 | }); |
1509 | } |
1510 | else // e.g. int[] -> <char, 9> |
1511 | return __call_with_subscripts( |
1512 | __mem, make_index_sequence<_Np>(), |
1513 | [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1514 | return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; |
1515 | }); |
1516 | } |
1517 | |
1518 | // _S_masked_load {{{2 |
1519 | template <typename _Tp, size_t _Np, typename _Up> |
1520 | static constexpr inline _SimdWrapper<_Tp, _Np> |
1521 | _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, |
1522 | const _Up* __mem) noexcept |
1523 | { |
1524 | _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), |
1525 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1526 | __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); |
1527 | }); |
1528 | return __merge; |
1529 | } |
1530 | |
1531 | // _S_store {{{2 |
1532 | template <typename _Tp, typename _Up> |
1533 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
1534 | _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept |
1535 | { |
1536 | // TODO: converting int -> "smaller int" can be optimized with AVX512 |
1537 | constexpr size_t _Np = _S_size<_Tp>; |
1538 | constexpr size_t __max_store_size |
1539 | = _SuperImpl::template _S_max_store_size<_Up>; |
1540 | if (__builtin_is_constant_evaluated()) |
1541 | { |
1542 | for (size_t __i = 0; __i < _Np; ++__i) |
1543 | __mem[__i] = __v[__i]; |
1544 | } |
1545 | else if constexpr (sizeof(_Up) > 8) |
1546 | __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1547 | __mem[__i] = __v[__i]; |
1548 | }); |
1549 | else if constexpr (is_same_v<_Up, _Tp>) |
1550 | _CommonImpl::_S_store(__v, __mem); |
1551 | else if constexpr (sizeof(_Up) * _Np <= __max_store_size) |
1552 | _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)), |
1553 | __mem); |
1554 | else |
1555 | { |
1556 | constexpr size_t __vsize = __max_store_size / sizeof(_Up); |
1557 | // round up to convert the last partial vector as well: |
	  constexpr size_t __stores = __div_roundup(_Np, __vsize);
1559 | constexpr size_t __full_stores = _Np / __vsize; |
1560 | using _V = __vector_type_t<_Up, __vsize>; |
1561 | const array<_V, __stores> __converted |
1562 | = __convert_all<_V, __stores>(__v); |
1563 | __execute_n_times<__full_stores>( |
1564 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1565 | _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); |
1566 | }); |
1567 | if constexpr (__full_stores < __stores) |
1568 | _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) |
1569 | * sizeof(_Up)>( |
1570 | __converted[__full_stores], __mem + __full_stores * __vsize); |
1571 | } |
1572 | } |
1573 | |
1574 | // _S_masked_store_nocvt {{{2 |
1575 | template <typename _Tp, size_t _Np> |
1576 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
1577 | _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) |
1578 | { |
1579 | _BitOps::_S_bit_iteration( |
1580 | _MaskImpl::_S_to_bits(__k), |
1581 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1582 | __mem[__i] = __v[__i]; |
1583 | }); |
1584 | } |
1585 | |
1586 | // _S_masked_store {{{2 |
1587 | template <typename _TW, typename _TVT = _VectorTraits<_TW>, |
1588 | typename _Tp = typename _TVT::value_type, typename _Up> |
1589 | static constexpr inline void |
1590 | _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept |
1591 | { |
1592 | constexpr size_t _TV_size = _S_size<_Tp>; |
1593 | [[maybe_unused]] const auto __vi = __to_intrin(__v); |
1594 | constexpr size_t __max_store_size |
1595 | = _SuperImpl::template _S_max_store_size<_Up>; |
      if constexpr (is_same_v<_Tp, _Up>
		    || (is_integral_v<_Tp> && is_integral_v<_Up>
			&& sizeof(_Tp) == sizeof(_Up)))
1600 | { |
	  // bitwise-identical or no conversion needed, just reinterpret:
1602 | const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1603 | if constexpr (__is_bitmask_v<decltype(__k)>) |
1604 | return _MaskMember<_Up>(__k._M_data); |
1605 | else |
1606 | return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k); |
1607 | }(); |
1608 | _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v), |
1609 | __mem, __kk); |
1610 | } |
      else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
			 && !_CommonImpl::template __converts_via_decomposition_v<
			      _Tp, _Up, __max_store_size>)
	{ // conversion via decomposition is better handled via the
	  // bit_iteration fallback below
1618 | constexpr size_t _UW_size |
	    = std::min(_TV_size, __max_store_size / sizeof(_Up));
1620 | static_assert(_UW_size <= _TV_size); |
1621 | using _UW = _SimdWrapper<_Up, _UW_size>; |
1622 | using _UV = __vector_type_t<_Up, _UW_size>; |
1623 | using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; |
1624 | if constexpr (_UW_size == _TV_size) // one convert+store |
1625 | { |
1626 | const _UW __converted = __convert<_UW>(__v); |
1627 | _SuperImpl::_S_masked_store_nocvt( |
1628 | __converted, __mem, |
1629 | _UAbi::_MaskImpl::template _S_convert< |
1630 | __int_for_sizeof_t<_Up>>(__k)); |
1631 | } |
1632 | else |
1633 | { |
1634 | static_assert(_UW_size * sizeof(_Up) == __max_store_size); |
1635 | constexpr size_t _NFullStores = _TV_size / _UW_size; |
1636 | constexpr size_t _NAllStores |
		= __div_roundup(_TV_size, _UW_size);
1638 | constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; |
1639 | const array<_UV, _NAllStores> __converted |
1640 | = __convert_all<_UV, _NAllStores>(__v); |
1641 | __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1642 | _SuperImpl::_S_masked_store_nocvt( |
1643 | _UW(__converted[__i]), __mem + __i * _UW_size, |
1644 | _UAbi::_MaskImpl::template _S_convert< |
1645 | __int_for_sizeof_t<_Up>>( |
1646 | __extract_part<__i, _NParts>(__k.__as_full_vector()))); |
1647 | }); |
1648 | if constexpr (_NAllStores |
1649 | > _NFullStores) // one partial at the end |
1650 | _SuperImpl::_S_masked_store_nocvt( |
1651 | _UW(__converted[_NFullStores]), |
1652 | __mem + _NFullStores * _UW_size, |
1653 | _UAbi::_MaskImpl::template _S_convert< |
1654 | __int_for_sizeof_t<_Up>>( |
1655 | __extract_part<_NFullStores, _NParts>( |
1656 | __k.__as_full_vector()))); |
1657 | } |
1658 | } |
1659 | else |
1660 | _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), |
1661 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1662 | __mem[__i] = static_cast<_Up>(__v[__i]); |
1663 | }); |
1664 | } |
1665 | |
1666 | // _S_complement {{{2 |
1667 | template <typename _Tp, size_t _Np> |
1668 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1669 | _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept |
1670 | { return ~__x._M_data; } |
1671 | |
1672 | // _S_unary_minus {{{2 |
1673 | template <typename _Tp, size_t _Np> |
1674 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1675 | _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept |
1676 | { |
1677 | // GCC doesn't use the psign instructions, but pxor & psub seem to be |
1678 | // just as good a choice as pcmpeqd & psign. So meh. |
1679 | return -__x._M_data; |
1680 | } |
1681 | |
1682 | // arithmetic operators {{{2 |
1683 | template <typename _Tp, size_t _Np> |
1684 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1685 | _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1686 | { return __x._M_data + __y._M_data; } |
1687 | |
1688 | template <typename _Tp, size_t _Np> |
1689 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1690 | _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1691 | { return __x._M_data - __y._M_data; } |
1692 | |
1693 | template <typename _Tp, size_t _Np> |
1694 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1695 | _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1696 | { return __x._M_data * __y._M_data; } |
1697 | |
1698 | template <typename _Tp, size_t _Np> |
1699 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1700 | _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1701 | { |
      // Division by 0 is always UB, so we must ensure that the zero-initialized
      // padding elements of partial registers never end up as divisors
1704 | if constexpr (!_Abi::template _S_is_partial<_Tp>) |
1705 | return __x._M_data / __y._M_data; |
1706 | else |
1707 | return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data); |
1708 | } |
1709 | |
1710 | template <typename _Tp, size_t _Np> |
1711 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1712 | _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1713 | { |
1714 | if constexpr (!_Abi::template _S_is_partial<_Tp>) |
1715 | return __x._M_data % __y._M_data; |
1716 | else |
1717 | return __as_vector(__x) |
1718 | % _Abi::__make_padding_nonzero(__as_vector(__y)); |
1719 | } |
1720 | |
1721 | template <typename _Tp, size_t _Np> |
1722 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1723 | _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1724 | { return __and(__x, __y); } |
1725 | |
1726 | template <typename _Tp, size_t _Np> |
1727 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1728 | _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1729 | { return __or(__x, __y); } |
1730 | |
1731 | template <typename _Tp, size_t _Np> |
1732 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1733 | _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1734 | { return __xor(__x, __y); } |
1735 | |
1736 | template <typename _Tp, size_t _Np> |
1737 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
1738 | _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1739 | { return __x._M_data << __y._M_data; } |
1740 | |
1741 | template <typename _Tp, size_t _Np> |
1742 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
1743 | _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1744 | { return __x._M_data >> __y._M_data; } |
1745 | |
1746 | template <typename _Tp, size_t _Np> |
1747 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1748 | _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y) |
1749 | { return __x._M_data << __y; } |
1750 | |
1751 | template <typename _Tp, size_t _Np> |
1752 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1753 | _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y) |
1754 | { return __x._M_data >> __y; } |
1755 | |
1756 | // compares {{{2 |
1757 | // _S_equal_to {{{3 |
1758 | template <typename _Tp, size_t _Np> |
1759 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
1760 | _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1761 | { return __x._M_data == __y._M_data; } |
1762 | |
1763 | // _S_not_equal_to {{{3 |
1764 | template <typename _Tp, size_t _Np> |
1765 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
1766 | _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1767 | { return __x._M_data != __y._M_data; } |
1768 | |
1769 | // _S_less {{{3 |
1770 | template <typename _Tp, size_t _Np> |
1771 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
1772 | _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1773 | { return __x._M_data < __y._M_data; } |
1774 | |
1775 | // _S_less_equal {{{3 |
1776 | template <typename _Tp, size_t _Np> |
1777 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
1778 | _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1779 | { return __x._M_data <= __y._M_data; } |
1780 | |
1781 | // _S_negate {{{2 |
1782 | template <typename _Tp, size_t _Np> |
1783 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
1784 | _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept |
1785 | { return !__x._M_data; } |
1786 | |
1787 | // _S_min, _S_max, _S_minmax {{{2 |
1788 | template <typename _Tp, size_t _Np> |
1789 | _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr |
1790 | _SimdWrapper<_Tp, _Np> |
1791 | _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) |
1792 | { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; } |
1793 | |
1794 | template <typename _Tp, size_t _Np> |
1795 | _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr |
1796 | _SimdWrapper<_Tp, _Np> |
1797 | _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) |
1798 | { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; } |
1799 | |
1800 | template <typename _Tp, size_t _Np> |
1801 | _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr |
1802 | pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>> |
1803 | _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) |
1804 | { |
1805 | return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data, |
1806 | __a._M_data < __b._M_data ? __b._M_data : __a._M_data}; |
1807 | } |
1808 | |
1809 | // reductions {{{2 |
1810 | template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp, |
1811 | typename _BinaryOperation> |
1812 | _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp |
1813 | _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>, |
1814 | simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) |
1815 | { |
1816 | using _V = __vector_type_t<_Tp, _Np / 2>; |
1817 | static_assert(sizeof(_V) <= sizeof(__x)); |
1818 | // _S_full_size is the size of the smallest native SIMD register that |
1819 | // can store _Np/2 elements: |
1820 | using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>; |
1821 | using _HalfSimd = __deduced_simd<_Tp, _Np / 2>; |
1822 | const auto __xx = __as_vector(__x); |
1823 | return _HalfSimd::abi_type::_SimdImpl::_S_reduce( |
1824 | static_cast<_HalfSimd>(__as_vector(__binary_op( |
1825 | static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)), |
1826 | static_cast<_FullSimd>(__intrin_bitcast<_V>( |
1827 | __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>( |
1828 | __xx)))))), |
1829 | __binary_op); |
1830 | } |
1831 | |
1832 | template <typename _Tp, typename _BinaryOperation> |
1833 | _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp |
1834 | _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) |
1835 | { |
1836 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
1837 | if constexpr (_Np == 1) |
1838 | return __x[0]; |
1839 | else if constexpr (_Np == 2) |
1840 | return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), |
1841 | simd<_Tp, simd_abi::scalar>(__x[1]))[0]; |
1842 | else if (__builtin_is_constant_evaluated()) |
1843 | { |
1844 | simd<_Tp, simd_abi::scalar> __acc = __x[0]; |
1845 | for (size_t __i = 1; __i < _Np; ++__i) |
1846 | __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i])); |
1847 | return __acc[0]; |
1848 | } |
1849 | else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{ |
1850 | { |
1851 | [[maybe_unused]] constexpr auto __full_size |
1852 | = _Abi::template _S_full_size<_Tp>; |
1853 | if constexpr (_Np == 3) |
1854 | return __binary_op( |
1855 | __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), |
1856 | simd<_Tp, simd_abi::scalar>(__x[1])), |
1857 | simd<_Tp, simd_abi::scalar>(__x[2]))[0]; |
1858 | else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, |
1859 | plus<>>) |
1860 | { |
1861 | using _Ap = simd_abi::deduce_t<_Tp, __full_size>; |
1862 | return _Ap::_SimdImpl::_S_reduce( |
1863 | simd<_Tp, _Ap>(__private_init, |
1864 | _Abi::_S_masked(__as_vector(__x))), |
1865 | __binary_op); |
1866 | } |
1867 | else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, |
1868 | multiplies<>>) |
1869 | { |
1870 | using _Ap = simd_abi::deduce_t<_Tp, __full_size>; |
1871 | using _TW = _SimdWrapper<_Tp, __full_size>; |
1872 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full |
1873 | = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); |
1874 | _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one |
1875 | = __vector_broadcast<__full_size>(_Tp(1)); |
1876 | const _TW __x_full = __data(__x).__as_full_vector(); |
1877 | const _TW __x_padded_with_ones |
1878 | = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one, |
1879 | __x_full); |
1880 | return _Ap::_SimdImpl::_S_reduce( |
1881 | simd<_Tp, _Ap>(__private_init, __x_padded_with_ones), |
1882 | __binary_op); |
1883 | } |
1884 | else if constexpr (_Np & 1) |
1885 | { |
1886 | using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; |
1887 | return __binary_op( |
1888 | simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( |
1889 | simd<_Tp, _Ap>( |
1890 | __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>( |
1891 | __as_vector(__x))), |
1892 | __binary_op)), |
1893 | simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0]; |
1894 | } |
1895 | else |
1896 | return _S_reduce_partial<_Np>( |
1897 | make_index_sequence<_Np / 2>(), |
1898 | make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op); |
1899 | } //}}} |
1900 | else if constexpr (sizeof(__x) == 16) //{{{ |
1901 | { |
1902 | if constexpr (_Np == 16) |
1903 | { |
1904 | const auto __y = __data(__x); |
1905 | __x = __binary_op( |
1906 | _M_make_simd<_Tp, _Np>( |
1907 | __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, |
1908 | 7, 7>(__y)), |
1909 | _M_make_simd<_Tp, _Np>( |
1910 | __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, |
1911 | 14, 14, 15, 15>(__y))); |
1912 | } |
1913 | if constexpr (_Np >= 8) |
1914 | { |
1915 | const auto __y = __vector_bitcast<short>(__data(__x)); |
1916 | __x = __binary_op( |
1917 | _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( |
1918 | __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))), |
1919 | _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( |
1920 | __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y)))); |
1921 | } |
1922 | if constexpr (_Np >= 4) |
1923 | { |
1924 | using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>; |
1925 | const auto __y = __vector_bitcast<_Up>(__data(__x)); |
1926 | __x = __binary_op(__x, |
1927 | _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( |
1928 | __vector_permute<3, 2, 1, 0>(__y)))); |
1929 | } |
1930 | using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>; |
1931 | const auto __y = __vector_bitcast<_Up>(__data(__x)); |
1932 | __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( |
1933 | __vector_permute<1, 1>(__y)))); |
1934 | return __x[0]; |
1935 | } //}}} |
1936 | else |
1937 | { |
1938 | static_assert(sizeof(__x) > __min_vector_size<_Tp>); |
1939 | static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 |
1940 | using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; |
1941 | using _V = simd<_Tp, _Ap>; |
1942 | return _Ap::_SimdImpl::_S_reduce( |
1943 | __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), |
1944 | _V(__private_init, |
1945 | __extract<1, 2>(__as_vector(__x)))), |
1946 | static_cast<_BinaryOperation&&>(__binary_op)); |
1947 | } |
1948 | } |
1949 | |
1950 | // math {{{2 |
1951 | // frexp, modf and copysign implemented in simd_math.h |
1952 | #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ |
1953 | template <typename _Tp, typename... _More> \ |
1954 | static _Tp \ |
1955 | _S_##__name(const _Tp& __x, const _More&... __more) \ |
1956 | { \ |
1957 | return __generate_vector<_Tp>( \ |
1958 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ |
1959 | return __name(__x[__i], __more[__i]...); \ |
1960 | }); \ |
1961 | } |
1962 | |
1963 | #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \ |
1964 | template <typename _Tp, typename... _More> \ |
1965 | static typename _Tp::mask_type \ |
1966 | _S_##__name(const _Tp& __x, const _More&... __more) \ |
1967 | { \ |
1968 | return __generate_vector<_Tp>( \ |
1969 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ |
1970 | return __name(__x[__i], __more[__i]...); \ |
1971 | }); \ |
1972 | } |
1973 | |
1974 | #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ |
1975 | template <typename _Tp, typename... _More> \ |
1976 | static auto \ |
1977 | _S_##__name(const _Tp& __x, const _More&... __more) \ |
1978 | { \ |
1979 | return __fixed_size_storage_t<_RetTp, \ |
1980 | _VectorTraits<_Tp>::_S_partial_width>:: \ |
1981 | _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ |
1982 | return __meta._S_generator( \ |
1983 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ |
1984 | return __name(__x[__meta._S_offset + __i], \ |
1985 | __more[__meta._S_offset + __i]...); \ |
1986 | }, \ |
1987 | static_cast<_RetTp*>(nullptr)); \ |
1988 | }); \ |
1989 | } |
1990 | |
1991 | _GLIBCXX_SIMD_MATH_FALLBACK(acos) |
1992 | _GLIBCXX_SIMD_MATH_FALLBACK(asin) |
1993 | _GLIBCXX_SIMD_MATH_FALLBACK(atan) |
1994 | _GLIBCXX_SIMD_MATH_FALLBACK(atan2) |
1995 | _GLIBCXX_SIMD_MATH_FALLBACK(cos) |
1996 | _GLIBCXX_SIMD_MATH_FALLBACK(sin) |
1997 | _GLIBCXX_SIMD_MATH_FALLBACK(tan) |
1998 | _GLIBCXX_SIMD_MATH_FALLBACK(acosh) |
1999 | _GLIBCXX_SIMD_MATH_FALLBACK(asinh) |
2000 | _GLIBCXX_SIMD_MATH_FALLBACK(atanh) |
2001 | _GLIBCXX_SIMD_MATH_FALLBACK(cosh) |
2002 | _GLIBCXX_SIMD_MATH_FALLBACK(sinh) |
2003 | _GLIBCXX_SIMD_MATH_FALLBACK(tanh) |
2004 | _GLIBCXX_SIMD_MATH_FALLBACK(exp) |
2005 | _GLIBCXX_SIMD_MATH_FALLBACK(exp2) |
2006 | _GLIBCXX_SIMD_MATH_FALLBACK(expm1) |
2007 | _GLIBCXX_SIMD_MATH_FALLBACK(ldexp) |
2008 | _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) |
2009 | _GLIBCXX_SIMD_MATH_FALLBACK(log) |
2010 | _GLIBCXX_SIMD_MATH_FALLBACK(log10) |
2011 | _GLIBCXX_SIMD_MATH_FALLBACK(log1p) |
2012 | _GLIBCXX_SIMD_MATH_FALLBACK(log2) |
2013 | _GLIBCXX_SIMD_MATH_FALLBACK(logb) |
2014 | |
2015 | // modf implemented in simd_math.h |
2016 | _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) |
2017 | _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) |
2018 | _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) |
2019 | _GLIBCXX_SIMD_MATH_FALLBACK(fabs) |
2020 | _GLIBCXX_SIMD_MATH_FALLBACK(pow) |
2021 | _GLIBCXX_SIMD_MATH_FALLBACK(sqrt) |
2022 | _GLIBCXX_SIMD_MATH_FALLBACK(erf) |
2023 | _GLIBCXX_SIMD_MATH_FALLBACK(erfc) |
2024 | _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) |
2025 | _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) |
2026 | |
2027 | _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) |
2028 | _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) |
2029 | |
2030 | _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) |
2031 | _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) |
2032 | |
2033 | _GLIBCXX_SIMD_MATH_FALLBACK(fmod) |
2034 | _GLIBCXX_SIMD_MATH_FALLBACK(remainder) |
2035 | |
2036 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
2037 | static _Tp |
2038 | _S_remquo(const _Tp __x, const _Tp __y, |
2039 | __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z) |
2040 | { |
2041 | return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2042 | int __tmp; |
2043 | auto __r = remquo(__x[__i], __y[__i], &__tmp); |
2044 | __z->_M_set(__i, __tmp); |
2045 | return __r; |
2046 | }); |
2047 | } |
2048 | |
2049 | // copysign in simd_math.h |
2050 | _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) |
2051 | _GLIBCXX_SIMD_MATH_FALLBACK(fdim) |
2052 | _GLIBCXX_SIMD_MATH_FALLBACK(fmax) |
2053 | _GLIBCXX_SIMD_MATH_FALLBACK(fmin) |
2054 | _GLIBCXX_SIMD_MATH_FALLBACK(fma) |
2055 | |
2056 | template <typename _Tp, size_t _Np> |
2057 | static constexpr _MaskMember<_Tp> |
2058 | _S_isgreater(_SimdWrapper<_Tp, _Np> __x, |
2059 | _SimdWrapper<_Tp, _Np> __y) noexcept |
2060 | { |
2061 | using _Ip = __int_for_sizeof_t<_Tp>; |
2062 | const auto __xn = __vector_bitcast<_Ip>(__x); |
2063 | const auto __yn = __vector_bitcast<_Ip>(__y); |
2064 | const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; |
2065 | const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; |
2066 | return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, |
2067 | __xp > __yp); |
2068 | } |
2069 | |
2070 | template <typename _Tp, size_t _Np> |
2071 | static constexpr _MaskMember<_Tp> |
2072 | _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, |
2073 | _SimdWrapper<_Tp, _Np> __y) noexcept |
2074 | { |
2075 | using _Ip = __int_for_sizeof_t<_Tp>; |
2076 | const auto __xn = __vector_bitcast<_Ip>(__x); |
2077 | const auto __yn = __vector_bitcast<_Ip>(__y); |
2078 | const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; |
2079 | const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; |
2080 | return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, |
2081 | __xp >= __yp); |
2082 | } |
2083 | |
2084 | template <typename _Tp, size_t _Np> |
2085 | static constexpr _MaskMember<_Tp> |
2086 | _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept |
2087 | { |
2088 | using _Ip = __int_for_sizeof_t<_Tp>; |
2089 | const auto __xn = __vector_bitcast<_Ip>(__x); |
2090 | const auto __yn = __vector_bitcast<_Ip>(__y); |
2091 | const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; |
2092 | const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; |
2093 | return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, |
2094 | __xp < __yp); |
2095 | } |
2096 | |
2097 | template <typename _Tp, size_t _Np> |
2098 | static constexpr _MaskMember<_Tp> |
2099 | _S_islessequal(_SimdWrapper<_Tp, _Np> __x, |
2100 | _SimdWrapper<_Tp, _Np> __y) noexcept |
2101 | { |
2102 | using _Ip = __int_for_sizeof_t<_Tp>; |
2103 | const auto __xn = __vector_bitcast<_Ip>(__x); |
2104 | const auto __yn = __vector_bitcast<_Ip>(__y); |
2105 | const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; |
2106 | const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; |
2107 | return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, |
2108 | __xp <= __yp); |
2109 | } |
2110 | |
2111 | template <typename _Tp, size_t _Np> |
2112 | static constexpr _MaskMember<_Tp> |
2113 | _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, |
2114 | _SimdWrapper<_Tp, _Np> __y) noexcept |
2115 | { |
2116 | return __andnot(_SuperImpl::_S_isunordered(__x, __y), |
2117 | _SuperImpl::_S_not_equal_to(__x, __y)); |
2118 | } |
2119 | |
2120 | #undef _GLIBCXX_SIMD_MATH_FALLBACK |
2121 | #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET |
2122 | #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET |
2123 | // _S_abs {{{3 |
2124 | template <typename _Tp, size_t _Np> |
2125 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2126 | _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept |
2127 | { |
2128 | // if (__builtin_is_constant_evaluated()) |
2129 | // { |
2130 | // return __x._M_data < 0 ? -__x._M_data : __x._M_data; |
2131 | // } |
2132 | if constexpr (is_floating_point_v<_Tp>) |
	// `v < 0 ? -v : v` cannot compile to the efficient implementation of
	// masking the signbit off, because it would also have to handle
	// v == -0. ~(-0.) & v would be easy, but breaks with -fno-signed-zeros.
2137 | return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data); |
2138 | else |
2139 | return __x._M_data < 0 ? -__x._M_data : __x._M_data; |
2140 | } |
2141 | |
2142 | // }}}3 |
2143 | // _S_plus_minus {{{ |
2144 | // Returns __x + __y - __y without -fassociative-math optimizing to __x. |
2145 | // - _TV must be __vector_type_t<floating-point type, N>. |
2146 | // - _UV must be _TV or floating-point type. |
2147 | template <typename _TV, typename _UV> |
2148 | _GLIBCXX_SIMD_INTRINSIC static constexpr _TV |
2149 | _S_plus_minus(_TV __x, _UV __y) noexcept |
2150 | { |
2151 | #if defined __i386__ && !defined __SSE_MATH__ |
2152 | if constexpr (sizeof(__x) == 8) |
2153 | { // operations on __x would use the FPU |
2154 | static_assert(is_same_v<_TV, __vector_type_t<float, 2>>); |
2155 | const auto __x4 = __vector_bitcast<float, 4>(__x); |
2156 | if constexpr (is_same_v<_TV, _UV>) |
2157 | return __vector_bitcast<float, 2>( |
2158 | _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y))); |
2159 | else |
2160 | return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y)); |
2161 | } |
2162 | #endif |
2163 | #if !defined __clang__ && __GCC_IEC_559 == 0 |
2164 | if (__builtin_is_constant_evaluated() |
2165 | || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) |
2166 | return (__x + __y) - __y; |
2167 | else |
2168 | return [&] { |
2169 | __x += __y; |
2170 | if constexpr(__have_sse) |
2171 | { |
2172 | if constexpr (sizeof(__x) >= 16) |
2173 | asm("" : "+x" (__x)); |
2174 | else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>) |
2175 | asm("" : "+x" (__x[0]), "+x" (__x[1])); |
2176 | else |
2177 | __assert_unreachable<_TV>(); |
2178 | } |
2179 | else if constexpr(__have_neon) |
2180 | asm("" : "+w" (__x)); |
2181 | else if constexpr (__have_power_vmx) |
2182 | { |
2183 | if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>) |
2184 | asm("" : "+fgr" (__x[0]), "+fgr" (__x[1])); |
2185 | else |
2186 | asm("" : "+v" (__x)); |
2187 | } |
2188 | else |
2189 | asm("" : "+g" (__x)); |
2190 | return __x - __y; |
2191 | }(); |
2192 | #else |
2193 | return (__x + __y) - __y; |
2194 | #endif |
2195 | } |
2196 | |
2197 | // }}} |
2198 | // _S_nearbyint {{{3 |
2199 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
2200 | _GLIBCXX_SIMD_INTRINSIC static _Tp |
2201 | _S_nearbyint(_Tp __x_) noexcept |
2202 | { |
2203 | using value_type = typename _TVT::value_type; |
2204 | using _V = typename _TVT::type; |
2205 | const _V __x = __x_; |
2206 | const _V __absx = __and(__x, _S_absmask<_V>); |
2207 | static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>); |
2208 | _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs |
2209 | = _V() + (1ull << (__digits_v<value_type> - 1)); |
2210 | const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs); |
2211 | const _V __shifted = _S_plus_minus(__x, __shifter); |
2212 | return __absx < __shifter_abs ? __shifted : __x; |
2213 | } |
2214 | |
2215 | // _S_rint {{{3 |
2216 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
2217 | _GLIBCXX_SIMD_INTRINSIC static _Tp |
2218 | _S_rint(_Tp __x) noexcept |
2219 | { return _SuperImpl::_S_nearbyint(__x); } |
2220 | |
2221 | // _S_trunc {{{3 |
2222 | template <typename _Tp, size_t _Np> |
2223 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2224 | _S_trunc(_SimdWrapper<_Tp, _Np> __x) |
2225 | { |
2226 | using _V = __vector_type_t<_Tp, _Np>; |
2227 | const _V __absx = __and(__x._M_data, _S_absmask<_V>); |
2228 | static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>); |
2229 | constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1); |
2230 | _V __truncated = _S_plus_minus(__absx, __shifter); |
2231 | __truncated -= __truncated > __absx ? _V() + 1 : _V(); |
2232 | return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated) |
2233 | : __x._M_data; |
2234 | } |
2235 | |
2236 | // _S_round {{{3 |
2237 | template <typename _Tp, size_t _Np> |
2238 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2239 | _S_round(_SimdWrapper<_Tp, _Np> __x) |
2240 | { |
2241 | const auto __abs_x = _SuperImpl::_S_abs(__x); |
2242 | const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data; |
2243 | const auto __r_abs // round(abs(x)) = |
2244 | = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0); |
2245 | return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs); |
2246 | } |
2247 | |
2248 | // _S_floor {{{3 |
2249 | template <typename _Tp, size_t _Np> |
2250 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2251 | _S_floor(_SimdWrapper<_Tp, _Np> __x) |
2252 | { |
2253 | const auto __y = _SuperImpl::_S_trunc(__x)._M_data; |
2254 | const auto __negative_input |
2255 | = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); |
2256 | const auto __mask |
2257 | = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); |
2258 | return __or(__andnot(__mask, __y), |
2259 | __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1))); |
2260 | } |
2261 | |
2262 | // _S_ceil {{{3 |
2263 | template <typename _Tp, size_t _Np> |
2264 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2265 | _S_ceil(_SimdWrapper<_Tp, _Np> __x) |
2266 | { |
2267 | const auto __y = _SuperImpl::_S_trunc(__x)._M_data; |
2268 | const auto __negative_input |
2269 | = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); |
2270 | const auto __inv_mask |
2271 | = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); |
2272 | return __or(__and(__inv_mask, __y), |
2273 | __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1))); |
2274 | } |
2275 | |
2276 | // _S_isnan {{{3 |
2277 | template <typename _Tp, size_t _Np> |
2278 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2279 | _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) |
2280 | { |
2281 | #if __FINITE_MATH_ONLY__ |
2282 | return {}; // false |
2283 | #elif !defined __SUPPORT_SNAN__ |
2284 | return ~(__x._M_data == __x._M_data); |
2285 | #elif defined __STDC_IEC_559__ |
2286 | using _Ip = __int_for_sizeof_t<_Tp>; |
2287 | const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); |
2288 | const auto __infn |
2289 | = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); |
2290 | return __infn < __absn; |
2291 | #else |
2292 | #error "Not implemented: how to support SNaN but non-IEC559 floating-point?" |
2293 | #endif |
2294 | } |
2295 | |
2296 | // _S_isfinite {{{3 |
2297 | template <typename _Tp, size_t _Np> |
2298 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2299 | _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) |
2300 | { |
2301 | #if __FINITE_MATH_ONLY__ |
2302 | using _UV = typename _MaskMember<_Tp>::_BuiltinType; |
2303 | _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV(); |
2304 | return __alltrue; |
2305 | #else |
2306 | // if all exponent bits are set, __x is either inf or NaN |
2307 | using _Ip = __int_for_sizeof_t<_Tp>; |
2308 | const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); |
2309 | const auto __maxn |
2310 | = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); |
2311 | return __absn <= __maxn; |
2312 | #endif |
2313 | } |
2314 | |
2315 | // _S_isunordered {{{3 |
2316 | template <typename _Tp, size_t _Np> |
2317 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2318 | _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
2319 | { return __or(_S_isnan(__x), _S_isnan(__y)); } |
2320 | |
2321 | // _S_signbit {{{3 |
2322 | template <typename _Tp, size_t _Np> |
2323 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2324 | _S_signbit(_SimdWrapper<_Tp, _Np> __x) |
2325 | { |
2326 | using _Ip = __int_for_sizeof_t<_Tp>; |
2327 | return __vector_bitcast<_Ip>(__x) < 0; |
2328 | // Arithmetic right shift (SRA) would also work (instead of compare), but |
2329 | // 64-bit SRA isn't available on x86 before AVX512. And in general, |
2330 | // compares are more likely to be efficient than SRA. |
2331 | } |
2332 | |
2333 | // _S_isinf {{{3 |
2334 | template <typename _Tp, size_t _Np> |
2335 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2336 | _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) |
2337 | { |
2338 | #if __FINITE_MATH_ONLY__ |
2339 | return {}; // false |
2340 | #else |
2341 | return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x), |
2342 | __vector_broadcast<_Np>( |
2343 | __infinity_v<_Tp>)); |
2344 | // alternative: |
2345 | // compare to inf using the corresponding integer type |
2346 | /* |
2347 | return |
2348 | __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>( |
2349 | _S_abs(__x)._M_data) |
2350 | == |
2351 | __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>( |
2352 | __infinity_v<_Tp>))); |
2353 | */ |
2354 | #endif |
2355 | } |
2356 | |
2357 | // _S_isnormal {{{3 |
2358 | template <typename _Tp, size_t _Np> |
2359 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2360 | _S_isnormal(_SimdWrapper<_Tp, _Np> __x) |
2361 | { |
2362 | using _Ip = __int_for_sizeof_t<_Tp>; |
2363 | const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); |
2364 | const auto __minn |
2365 | = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>)); |
2366 | #if __FINITE_MATH_ONLY__ |
2367 | return __absn >= __minn; |
2368 | #else |
2369 | const auto __maxn |
2370 | = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); |
2371 | return __minn <= __absn && __absn <= __maxn; |
2372 | #endif |
2373 | } |
2374 | |
2375 | // _S_fpclassify {{{3 |
2376 | template <typename _Tp, size_t _Np> |
2377 | _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np> |
2378 | _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) |
2379 | { |
2380 | using _I = __int_for_sizeof_t<_Tp>; |
2381 | const auto __xn |
2382 | = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); |
2383 | constexpr size_t _NI = sizeof(__xn) / sizeof(_I); |
2384 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn |
2385 | = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>)); |
2386 | |
2387 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal |
2388 | = __vector_broadcast<_NI, _I>(FP_NORMAL); |
2389 | #if !__FINITE_MATH_ONLY__ |
2390 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn |
2391 | = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>)); |
2392 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan |
2393 | = __vector_broadcast<_NI, _I>(FP_NAN); |
2394 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite |
2395 | = __vector_broadcast<_NI, _I>(FP_INFINITE); |
2396 | #endif |
2397 | #ifndef __FAST_MATH__ |
2398 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal |
2399 | = __vector_broadcast<_NI, _I>(FP_SUBNORMAL); |
2400 | #endif |
2401 | _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero |
2402 | = __vector_broadcast<_NI, _I>(FP_ZERO); |
2403 | |
2404 | __vector_type_t<_I, _NI> |
2405 | __tmp = __xn < __minn |
2406 | #ifdef __FAST_MATH__ |
2407 | ? __fp_zero |
2408 | #else |
2409 | ? (__xn == 0 ? __fp_zero : __fp_subnormal) |
2410 | #endif |
2411 | #if __FINITE_MATH_ONLY__ |
2412 | : __fp_normal; |
2413 | #else |
2414 | : (__xn < __infn ? __fp_normal |
2415 | : (__xn == __infn ? __fp_infinite : __fp_nan)); |
2416 | #endif |
2417 | |
2418 | if constexpr (sizeof(_I) == sizeof(int)) |
2419 | { |
2420 | using _FixedInt = __fixed_size_storage_t<int, _Np>; |
2421 | const auto __as_int = __vector_bitcast<int, _Np>(__tmp); |
2422 | if constexpr (_FixedInt::_S_tuple_size == 1) |
2423 | return {__as_int}; |
2424 | else if constexpr (_FixedInt::_S_tuple_size == 2 |
2425 | && is_same_v< |
2426 | typename _FixedInt::_SecondType::_FirstAbi, |
2427 | simd_abi::scalar>) |
2428 | return {__extract<0, 2>(__as_int), __as_int[_Np - 1]}; |
2429 | else if constexpr (_FixedInt::_S_tuple_size == 2) |
2430 | return {__extract<0, 2>(__as_int), |
2431 | __auto_bitcast(__extract<1, 2>(__as_int))}; |
2432 | else |
2433 | __assert_unreachable<_Tp>(); |
2434 | } |
2435 | else if constexpr (_Np == 2 && sizeof(_I) == 8 |
2436 | && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2) |
2437 | { |
2438 | const auto __aslong = __vector_bitcast<_LLong>(__tmp); |
2439 | return {int(__aslong[0]), {int(__aslong[1])}}; |
2440 | } |
2441 | #if _GLIBCXX_SIMD_X86INTRIN |
2442 | else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32 |
2443 | && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1) |
2444 | return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)), |
2445 | __to_intrin(__hi128(__tmp)))}; |
2446 | else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64 |
2447 | && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1) |
2448 | return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))}; |
2449 | #endif // _GLIBCXX_SIMD_X86INTRIN |
2450 | else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1) |
2451 | return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp), |
2452 | [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2453 | return __make_wrapper<int>(__l...); |
2454 | })}; |
2455 | else |
2456 | __assert_unreachable<_Tp>(); |
2457 | } |
2458 | |
2459 | // _S_increment & _S_decrement{{{2 |
2460 | template <typename _Tp, size_t _Np> |
2461 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2462 | _S_increment(_SimdWrapper<_Tp, _Np>& __x) |
2463 | { __x = __x._M_data + 1; } |
2464 | |
2465 | template <typename _Tp, size_t _Np> |
2466 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2467 | _S_decrement(_SimdWrapper<_Tp, _Np>& __x) |
2468 | { __x = __x._M_data - 1; } |
2469 | |
2470 | // smart_reference access {{{2 |
2471 | template <typename _Tp, size_t _Np, typename _Up> |
2472 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2473 | _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept |
2474 | { __v._M_set(__i, static_cast<_Up&&>(__x)); } |
2475 | |
2476 | // _S_masked_assign{{{2 |
2477 | template <typename _Tp, typename _K, size_t _Np> |
2478 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2479 | _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, |
2480 | __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) |
2481 | { |
2482 | if (__k._M_is_constprop_none_of()) |
2483 | return; |
2484 | else if (__k._M_is_constprop_all_of()) |
2485 | __lhs = __rhs; |
2486 | else |
2487 | __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); |
2488 | } |
2489 | |
2490 | template <typename _Tp, typename _K, size_t _Np> |
2491 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2492 | _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, |
2493 | __type_identity_t<_Tp> __rhs) |
2494 | { |
2495 | if (__k._M_is_constprop_none_of()) |
2496 | return; |
2497 | else if (__k._M_is_constprop_all_of()) |
2498 | __lhs = __vector_broadcast<_Np>(__rhs); |
2499 | else if (__builtin_constant_p(__rhs) && __rhs == 0) |
2500 | { |
2501 | if constexpr (!is_same_v<bool, _K>) |
2502 | // the __andnot optimization only makes sense if __k._M_data is a |
2503 | // vector register |
2504 | __lhs._M_data |
2505 | = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data); |
2506 | else |
2507 | // for AVX512/__mmask, a _mm512_maskz_mov is best |
2508 | __lhs |
2509 | = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>()); |
2510 | } |
2511 | else |
2512 | __lhs = _CommonImpl::_S_blend(__k, __lhs, |
2513 | _SimdWrapper<_Tp, _Np>( |
2514 | __vector_broadcast<_Np>(__rhs))); |
2515 | } |
2516 | |
2517 | // _S_masked_cassign {{{2 |
2518 | template <typename _Op, typename _Tp, typename _K, size_t _Np> |
2519 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2520 | _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, |
2521 | _SimdWrapper<_Tp, _Np>& __lhs, |
2522 | const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs, |
2523 | _Op __op) |
2524 | { |
2525 | if (__k._M_is_constprop_none_of()) |
2526 | return; |
2527 | else if (__k._M_is_constprop_all_of()) |
2528 | __lhs = __op(_SuperImpl{}, __lhs, __rhs); |
2529 | else |
2530 | __lhs = _CommonImpl::_S_blend(__k, __lhs, |
2531 | __op(_SuperImpl{}, __lhs, __rhs)); |
2532 | } |
2533 | |
2534 | template <typename _Op, typename _Tp, typename _K, size_t _Np> |
2535 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2536 | _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, |
2537 | _SimdWrapper<_Tp, _Np>& __lhs, |
2538 | const __type_identity_t<_Tp> __rhs, _Op __op) |
2539 | { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); } |
2540 | |
2541 | // _S_masked_unary {{{2 |
2542 | template <template <typename> class _Op, typename _Tp, typename _K, |
2543 | size_t _Np> |
2544 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2545 | _S_masked_unary(const _SimdWrapper<_K, _Np> __k, |
2546 | const _SimdWrapper<_Tp, _Np> __v) |
2547 | { |
2548 | if (__k._M_is_constprop_none_of()) |
2549 | return __v; |
2550 | auto __vv = _M_make_simd(__v); |
2551 | _Op<decltype(__vv)> __op; |
2552 | if (__k._M_is_constprop_all_of()) |
2553 | return __data(__op(__vv)); |
2554 | else if constexpr (is_same_v<_Op<void>, __increment<void>>) |
2555 | { |
2556 | static_assert(not std::is_same_v<_K, bool>); |
2557 | if constexpr (is_integral_v<_Tp>) |
2558 | // Take a shortcut knowing that __k is an integer vector with values -1 or 0. |
2559 | return __v._M_data - __vector_bitcast<_Tp>(__k._M_data); |
2560 | else if constexpr (not __have_avx2) |
2561 | return __v._M_data |
2562 | + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( |
2563 | _K, _Tp(1))); |
2564 | // starting with AVX2 it is more efficient to blend after add |
2565 | } |
2566 | else if constexpr (is_same_v<_Op<void>, __decrement<void>>) |
2567 | { |
2568 | static_assert(not std::is_same_v<_K, bool>); |
2569 | if constexpr (is_integral_v<_Tp>) |
2570 | // Take a shortcut knowing that __k is an integer vector with values -1 or 0. |
2571 | return __v._M_data + __vector_bitcast<_Tp>(__k._M_data); |
2572 | else if constexpr (not __have_avx2) |
2573 | return __v._M_data |
2574 | - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( |
2575 | _K, _Tp(1))); |
2576 | // starting with AVX2 it is more efficient to blend after sub |
2577 | } |
2578 | return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv))); |
2579 | } |
2580 | |
2581 | //}}}2 |
2582 | }; |
2583 | |
2584 | // _MaskImplBuiltinMixin {{{1 |
2585 | struct _MaskImplBuiltinMixin |
2586 | { |
2587 | template <typename _Tp> |
2588 | using _TypeTag = _Tp*; |
2589 | |
2590 | // _S_to_maskvector {{{ |
2591 | template <typename _Up, size_t _ToN = 1> |
2592 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> |
2593 | _S_to_maskvector(bool __x) |
2594 | { |
2595 | static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); |
2596 | return __x ? __vector_type_t<_Up, _ToN>{~_Up()} |
2597 | : __vector_type_t<_Up, _ToN>{}; |
2598 | } |
2599 | |
2600 | template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized, |
2601 | size_t _ToN = _UpN == 0 ? _Np : _UpN> |
2602 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> |
2603 | _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) |
2604 | { |
2605 | static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); |
2606 | return __generate_vector<__vector_type_t<_Up, _ToN>>( |
2607 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2608 | if constexpr (__i < _Np) |
2609 | return __x[__i] ? ~_Up() : _Up(); |
2610 | else |
2611 | return _Up(); |
2612 | }); |
2613 | } |
2614 | |
2615 | template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, |
2616 | size_t _ToN = _UpN == 0 ? _Np : _UpN> |
2617 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> |
2618 | _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) |
2619 | { |
2620 | static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); |
2621 | using _TW = _SimdWrapper<_Tp, _Np>; |
2622 | using _UW = _SimdWrapper<_Up, _ToN>; |
2623 | if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW)) |
2624 | return __wrapper_bitcast<_Up, _ToN>(__x); |
2625 | else if constexpr (is_same_v<_Tp, bool>) // bits -> vector |
2626 | return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data)); |
2627 | else |
2628 | { // vector -> vector |
2629 | /* |
2630 | [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data); |
2631 | if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == |
2632 | 16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr |
2633 | (sizeof(_Tp) == 4 && sizeof(_Up) == 2 |
2634 | && sizeof(__y) == 16) |
2635 | return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y); |
2636 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 |
2637 | && sizeof(__y) == 16) |
2638 | return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y); |
2639 | else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 |
2640 | && sizeof(__y) == 16) |
2641 | return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, |
2642 | -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 && |
2643 | sizeof(_Up) == 1 |
2644 | && sizeof(__y) == 16) |
2645 | return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, |
2646 | -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 && |
2647 | sizeof(_Up) == 1 |
2648 | && sizeof(__y) == 16) |
2649 | return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
2650 | -1, -1, -1, -1, -1>(__y); else |
2651 | */ |
2652 | { |
2653 | return __generate_vector<__vector_type_t<_Up, _ToN>>( |
2654 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2655 | if constexpr (__i < _Np) |
2656 | return _Up(__x[__i.value]); |
2657 | else |
2658 | return _Up(); |
2659 | }); |
2660 | } |
2661 | } |
2662 | } |
2663 | |
2664 | // }}} |
2665 | // _S_to_bits {{{ |
2666 | template <typename _Tp, size_t _Np> |
2667 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> |
2668 | _S_to_bits(_SimdWrapper<_Tp, _Np> __x) |
2669 | { |
2670 | static_assert(!is_same_v<_Tp, bool>); |
2671 | static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong)); |
2672 | using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; |
2673 | const auto __bools |
2674 | = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); |
2675 | _ULLong __r = 0; |
2676 | __execute_n_times<_Np>( |
2677 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2678 | __r |= _ULLong(__bools[__i.value]) << __i; |
2679 | }); |
2680 | return __r; |
2681 | } |
2682 | |
2683 | // }}} |
2684 | }; |
2685 | |
2686 | // _MaskImplBuiltin {{{1 |
2687 | template <typename _Abi> |
2688 | struct _MaskImplBuiltin : _MaskImplBuiltinMixin |
2689 | { |
2690 | using _MaskImplBuiltinMixin::_S_to_bits; |
2691 | using _MaskImplBuiltinMixin::_S_to_maskvector; |
2692 | |
2693 | // member types {{{ |
2694 | template <typename _Tp> |
2695 | using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; |
2696 | |
2697 | template <typename _Tp> |
2698 | using _MaskMember = typename _Abi::template _MaskMember<_Tp>; |
2699 | |
2700 | using _SuperImpl = typename _Abi::_MaskImpl; |
2701 | using _CommonImpl = typename _Abi::_CommonImpl; |
2702 | |
2703 | template <typename _Tp> |
2704 | static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; |
2705 | |
2706 | // }}} |
2707 | // _S_broadcast {{{ |
2708 | template <typename _Tp> |
2709 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2710 | _S_broadcast(bool __x) |
2711 | { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); } |
2712 | |
2713 | // }}} |
2714 | // _S_load {{{ |
2715 | template <typename _Tp> |
2716 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2717 | _S_load(const bool* __mem) |
2718 | { |
2719 | using _I = __int_for_sizeof_t<_Tp>; |
2720 | if (not __builtin_is_constant_evaluated()) |
2721 | if constexpr (sizeof(_Tp) == sizeof(bool)) |
2722 | { |
2723 | const auto __bools |
2724 | = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem); |
2725 | // bool is {0, 1}, everything else is UB |
2726 | return __bools > 0; |
2727 | } |
2728 | return __generate_vector<_I, _S_size<_Tp>>( |
2729 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2730 | return __mem[__i] ? ~_I() : _I(); |
2731 | }); |
2732 | } |
2733 | |
2734 | // }}} |
2735 | // _S_convert {{{ |
2736 | template <typename _Tp, size_t _Np, bool _Sanitized> |
2737 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
2738 | _S_convert(_BitMask<_Np, _Sanitized> __x) |
2739 | { |
2740 | if constexpr (__is_builtin_bitmask_abi<_Abi>()) |
2741 | return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits()); |
2742 | else |
2743 | return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, |
2744 | _S_size<_Tp>>( |
2745 | __x._M_sanitized()); |
2746 | } |
2747 | |
2748 | template <typename _Tp, size_t _Np> |
2749 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
2750 | _S_convert(_SimdWrapper<bool, _Np> __x) |
2751 | { |
2752 | if constexpr (__is_builtin_bitmask_abi<_Abi>()) |
2753 | return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data); |
2754 | else |
2755 | return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, |
2756 | _S_size<_Tp>>( |
2757 | _BitMask<_Np>(__x._M_data)._M_sanitized()); |
2758 | } |
2759 | |
2760 | template <typename _Tp, typename _Up, size_t _Np> |
2761 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
2762 | _S_convert(_SimdWrapper<_Up, _Np> __x) |
2763 | { |
2764 | if constexpr (__is_builtin_bitmask_abi<_Abi>()) |
2765 | return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>( |
2766 | _SuperImpl::_S_to_bits(__x)); |
2767 | else |
2768 | return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, |
2769 | _S_size<_Tp>>(__x); |
2770 | } |
2771 | |
2772 | template <typename _Tp, typename _Up, typename _UAbi> |
2773 | _GLIBCXX_SIMD_INTRINSIC static constexpr auto |
2774 | _S_convert(simd_mask<_Up, _UAbi> __x) |
2775 | { |
2776 | if constexpr (__is_builtin_bitmask_abi<_Abi>()) |
2777 | { |
2778 | using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>; |
2779 | if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits |
2780 | return _R(__data(__x)); |
2781 | else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits |
2782 | return _R(__data(__x)); |
2783 | else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits |
2784 | return _R(__data(__x)._M_to_bits()); |
2785 | else // vector -> bits |
2786 | return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); |
2787 | } |
2788 | else |
2789 | return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, |
2790 | _S_size<_Tp>>( |
2791 | __data(__x)); |
2792 | } |
2793 | |
2794 | // }}} |
2795 | // _S_masked_load {{{2 |
2796 | template <typename _Tp, size_t _Np> |
2797 | static inline _SimdWrapper<_Tp, _Np> |
2798 | _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, |
2799 | _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept |
2800 | { |
2801 | // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity |
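      // fall back to bit iteration: for every set mask bit overwrite the
      // matching element of __merge with the loaded bool widened to 0/-1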
2802 | auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); |
2803 | _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), |
2804 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2805 | __tmp._M_set(__i, -__mem[__i]); |
2806 | }); |
2807 | __merge = __wrapper_bitcast<_Tp>(__tmp); |
2808 | return __merge; |
2809 | } |
2810 | |
2811 | // _S_store {{{2 |
2812 | template <typename _Tp, size_t _Np> |
2813 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
2814 | _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept |
2815 | { |
2816 | __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2817 | __mem[__i] = __v[__i]; |
2818 | }); |
2819 | } |
2820 | |
2821 | // _S_masked_store {{{2 |
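    // store only the elements whose corresponding mask bit in __k is set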
2822 | template <typename _Tp, size_t _Np> |
2823 | static inline void |
2824 | _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, |
2825 | const _SimdWrapper<_Tp, _Np> __k) noexcept |
2826 | { |
2827 | _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k), |
2828 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2829 | __mem[__i] = __v[__i]; |
2830 | }); |
2831 | } |
2832 | |
2833 | // _S_from_bitmask{{{2 |
2834 | template <size_t _Np, typename _Tp> |
2835 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2836 | _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) |
2837 | { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); } |
2838 | |
2839 | // logical and bitwise operators {{{2 |
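    // both operands use the 0/~0 vector-mask representation, so logical and
    // bitwise operations coincide and plain __and/__or is sufficient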
2840 | template <typename _Tp, size_t _Np> |
2841 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2842 | _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
2843 | { return __and(__x._M_data, __y._M_data); } |
2844 | |
2845 | template <typename _Tp, size_t _Np> |
2846 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2847 | _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
2848 | { return __or(__x._M_data, __y._M_data); } |
2849 | |
2850 | template <typename _Tp, size_t _Np> |
2851 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2852 | _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) |
2853 | { |
2854 | if constexpr (_Abi::template _S_is_partial<_Tp>) |
2855 | return __andnot(__x, __wrapper_bitcast<_Tp>( |
2856 | _Abi::template _S_implicit_mask<_Tp>())); |
2857 | else |
2858 | return __not(__x._M_data); |
2859 | } |
2860 | |
2861 | template <typename _Tp, size_t _Np> |
2862 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2863 | _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
2864 | { return __and(__x._M_data, __y._M_data); } |
2865 | |
2866 | template <typename _Tp, size_t _Np> |
2867 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2868 | _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
2869 | { return __or(__x._M_data, __y._M_data); } |
2870 | |
2871 | template <typename _Tp, size_t _Np> |
2872 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
2873 | _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
2874 | { return __xor(__x._M_data, __y._M_data); } |
2875 | |
2876 | // smart_reference access {{{2 |
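    // Set element __i of the mask to __x. For vector masks the stored value is
    // -__x (0 or all bits set); during constant evaluation the whole vector is
    // regenerated instead of storing into a single element.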
2877 | template <typename _Tp, size_t _Np> |
2878 | static constexpr void |
2879 | _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept |
2880 | { |
2881 | if constexpr (is_same_v<_Tp, bool>) |
2882 | __k._M_set(__i, __x); |
2883 | else |
2884 | { |
2885 | static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); |
2886 | if (__builtin_is_constant_evaluated()) |
2887 | { |
2888 | __k = __generate_from_n_evaluations<_Np, |
2889 | __vector_type_t<_Tp, _Np>>( |
2890 | [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2891 | if (__i == static_cast<int>(__j)) |
2892 | return _Tp(-__x); |
2893 | else |
2894 | return __k[+__j]; |
2895 | }); |
2896 | } |
2897 | else |
2898 | __k._M_data[__i] = -__x; |
2899 | } |
2900 | } |
2901 | |
2902 | // _S_masked_assign{{{2 |
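    // replace the elements of __lhs selected by __k with the corresponding
    // elements of __rhs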
2903 | template <typename _Tp, size_t _Np> |
2904 | _GLIBCXX_SIMD_INTRINSIC static void |
2905 | _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, |
2906 | __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) |
2907 | { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); } |
2908 | |
2909 | template <typename _Tp, size_t _Np> |
2910 | _GLIBCXX_SIMD_INTRINSIC static void |
2911 | _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs) |
2912 | { |
2913 | if (__builtin_constant_p(__rhs)) |
2914 | { |
2915 | if (__rhs == false) |
2916 | __lhs = __andnot(__k, __lhs); |
2917 | else |
2918 | __lhs = __or(__k, __lhs); |
2919 | return; |
2920 | } |
2921 | __lhs = _CommonImpl::_S_blend(__k, __lhs, |
2922 | __data(simd_mask<_Tp, _Abi>(__rhs))); |
2923 | } |
2924 | |
2925 | //}}}2 |
2926 | // _S_all_of {{{ |
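    // an element is true iff it is non-zero; _S_all_of folds the per-element
    // comparisons with && (_S_any_of and _S_none_of below are analogous)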
2927 | template <typename _Tp> |
2928 | _GLIBCXX_SIMD_INTRINSIC static bool |
2929 | _S_all_of(simd_mask<_Tp, _Abi> __k) |
2930 | { |
2931 | return __call_with_subscripts( |
2932 | __data(__k), make_index_sequence<_S_size<_Tp>>(), |
2933 | [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA |
2934 | { return (... && !(__ent == 0)); }); |
2935 | } |
2936 | |
2937 | // }}} |
2938 | // _S_any_of {{{ |
2939 | template <typename _Tp> |
2940 | _GLIBCXX_SIMD_INTRINSIC static bool |
2941 | _S_any_of(simd_mask<_Tp, _Abi> __k) |
2942 | { |
2943 | return __call_with_subscripts( |
2944 | __data(__k), make_index_sequence<_S_size<_Tp>>(), |
2945 | [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA |
2946 | { return (... || !(__ent == 0)); }); |
2947 | } |
2948 | |
2949 | // }}} |
2950 | // _S_none_of {{{ |
2951 | template <typename _Tp> |
2952 | _GLIBCXX_SIMD_INTRINSIC static bool |
2953 | _S_none_of(simd_mask<_Tp, _Abi> __k) |
2954 | { |
2955 | return __call_with_subscripts( |
2956 | __data(__k), make_index_sequence<_S_size<_Tp>>(), |
2957 | [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA |
2958 | { return (... && (__ent == 0)); }); |
2959 | } |
2960 | |
2961 | // }}} |
2962 | // _S_some_of {{{ |
2963 | template <typename _Tp> |
2964 | _GLIBCXX_SIMD_INTRINSIC static bool |
2965 | _S_some_of(simd_mask<_Tp, _Abi> __k) |
2966 | { |
2967 | const int __n_true = _SuperImpl::_S_popcount(__k); |
2968 | return __n_true > 0 && __n_true < int(_S_size<_Tp>); |
2969 | } |
2970 | |
2971 | // }}} |
2972 | // _S_popcount {{{ |
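    // true elements hold the value -1 (all bits set), so summing the mask
    // elements as integers yields minus the number of true elements; negating
    // the reduction gives the count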
2973 | template <typename _Tp> |
2974 | _GLIBCXX_SIMD_INTRINSIC static int |
2975 | _S_popcount(simd_mask<_Tp, _Abi> __k) |
2976 | { |
2977 | using _I = __int_for_sizeof_t<_Tp>; |
2978 | if constexpr (is_default_constructible_v<simd<_I, _Abi>>) |
2979 | return -reduce( |
2980 | simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k)))); |
2981 | else |
2982 | return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>( |
2983 | simd<_Tp, _Abi>(__private_init, __data(__k)))); |
2984 | } |
2985 | |
2986 | // }}} |
2987 | // _S_find_first_set {{{ |
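    // the index of the lowest true element equals the number of trailing zero
    // bits in the corresponding bit mask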
2988 | template <typename _Tp> |
2989 | _GLIBCXX_SIMD_INTRINSIC static int |
2990 | _S_find_first_set(simd_mask<_Tp, _Abi> __k) |
2991 | { return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); } |
2992 | |
2993 | // }}} |
2994 | // _S_find_last_set {{{ |
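    // the index of the highest true element is the bit width of the
    // corresponding bit mask minus one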
2995 | template <typename _Tp> |
2996 | _GLIBCXX_SIMD_INTRINSIC static int |
2997 | _S_find_last_set(simd_mask<_Tp, _Abi> __k) |
2998 | { return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1; } |
2999 | |
3000 | // }}} |
3001 | }; |
3002 | |
3003 | //}}}1 |
3004 | _GLIBCXX_SIMD_END_NAMESPACE |
3005 | #endif // __cplusplus >= 201703L |
3006 | #endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_ |
3007 | |
3008 | // vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=80 |
3009 | |