simd_builtin.h source code [include/c++/13/experimental/bits/simd_builtin.h]

1	// Simd Abi specific implementations -- C++ --
2
3	// Copyright (C) 2020-2023 Free Software Foundation, Inc.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
26	#define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
27
28	#if __cplusplus >= 201703L
29
30	#include <array>
31	#include <cmath>
32	#include <cstdlib>
33
34	_GLIBCXX_SIMD_BEGIN_NAMESPACE
35	// _S_allbits{{{
36	template <typename _V>
37	static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
38	= reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());
39
40	// }}}
41	// _S_signmask, _S_absmask{{{
42	template <typename _V, typename = _VectorTraits<_V>>
43	static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
44	= __xor(_V() + `1`, _V() - `1`);
45
46	template <typename _V, typename = _VectorTraits<_V>>
47	static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
48	= __andnot(_S_signmask<_V>, _S_allbits<_V>);
49
50	//}}}
51	// __vector_permute<Indices...>{{{
52	// Index == -1 requests zeroing of the output element
53	template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
54	typename = __detail::__odr_helper>
55	constexpr _Tp
56	__vector_permute(_Tp __x)
57	{
58	static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
59	return __make_vector<typename _TVT::value_type>(
60	(_Indices == -`1` ? `0` : __x[_Indices == -`1` ? `0` : _Indices])...);
61	}
62
63	// }}}
64	// __vector_shuffle<Indices...>{{{
65	// Index == -1 requests zeroing of the output element
66	template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
67	typename = __detail::__odr_helper>
68	constexpr _Tp
69	__vector_shuffle(_Tp __x, _Tp __y)
70	{
71	return _Tp{(_Indices == -`1` ? `0`
72	: _Indices < _TVT::_S_full_size
73	? __x[_Indices]
74	: __y[_Indices - _TVT::_S_full_size])...};
75	}
76
77	// }}}
78	// __make_wrapper{{{
79	template <typename _Tp, typename... _Args>
80	_GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
81	__make_wrapper(const _Args&... __args)
82	{ return __make_vector<_Tp>(__args...); }
83
84	// }}}
85	// __wrapper_bitcast{{{
86	template <typename _Tp, size_t _ToN = `0`, typename _Up, size_t _M,
87	size_t _Np = _ToN != `0` ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
88	_GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
89	__wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
90	{
91	static_assert(_Np > `1`);
92	return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
93	}
94
95	// }}}
96	// __extract_part(_SimdWrapper<_Tp, _Np>) {{{
97	template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
98	_GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
99	conditional_t<_Np == _Total and _Combine == `1`, _Tp, _SimdWrapper<_Tp, _Np / _Total * _Combine>>
100	__extract_part(const _SimdWrapper<_Tp, _Np> __x)
101	{
102	if constexpr (_Np == _Total and _Combine == `1`)
103	return __x[_Index];
104	else if constexpr (_Index % `2` == `0` && _Total % `2` == `0` && _Combine % `2` == `0`)
105	return __extract_part<_Index / `2`, _Total / `2`, _Combine / `2`>(__x);
106	else
107	{
108	constexpr size_t __values_per_part = _Np / _Total;
109	constexpr size_t __values_to_skip = _Index * __values_per_part;
110	constexpr size_t __return_size = __values_per_part * _Combine;
111	using _R = __vector_type_t<_Tp, __return_size>;
112	static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
113	<= sizeof(__x),
114	"out of bounds __extract_part");
115	// the following assertion would ensure no "padding" to be read
116	// static_assert(_Total >= _Index + _Combine, "_Total must be greater
117	// than _Index");
118
119	// static_assert(__return_size _Total == _Np, "_Np must be divisible*
120	// by _Total");
121	if (__x._M_is_constprop())
122	return __generate_from_n_evaluations<__return_size, _R>(
123	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
124	return __x[__values_to_skip + __i];
125	});
126	if constexpr (_Index == `0` && _Total == `1`)
127	return __x;
128	else if constexpr (_Index == `0`)
129	return __intrin_bitcast<_R>(__as_vector(__x));
130	else
131	return __vec_shuffle(__as_vector(__x), make_index_sequence<__bit_ceil(x: __return_size)>(),
132	[](size_t __i) {
133	return __i + __values_to_skip;
134	});
135	}
136	}
137
138	// }}}
139	// __extract_part(_SimdWrapper<bool, _Np>) {{{
140	template <int _Index, int _Total, int _Combine = `1`, size_t _Np>
141	_GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
142	__extract_part(const _SimdWrapper<bool, _Np> __x)
143	{
144	static_assert(_Combine == `1`, "_Combine != 1 not implemented");
145	static_assert(__have_avx512f && _Total >= `2` && _Index + _Combine <= _Total && _Index >= `0`);
146	return __x._M_data >> (_Index * _Np / _Total);
147	}
148
149	// }}}
150
151	// __vector_convert {{{
152	// implementation requires an index sequence
153	template <typename _To, typename _From, size_t... _I>
154	_GLIBCXX_SIMD_INTRINSIC constexpr _To
155	__vector_convert(_From __a, index_sequence<_I...>)
156	{
157	using _Tp = typename _VectorTraits<_To>::value_type;
158	return _To{static_cast<_Tp>(__a[_I])...};
159	}
160
161	template <typename _To, typename _From, size_t... _I>
162	_GLIBCXX_SIMD_INTRINSIC constexpr _To
163	__vector_convert(_From __a, _From __b, index_sequence<_I...>)
164	{
165	using _Tp = typename _VectorTraits<_To>::value_type;
166	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
167	}
168
169	template <typename _To, typename _From, size_t... _I>
170	_GLIBCXX_SIMD_INTRINSIC constexpr _To
171	__vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
172	{
173	using _Tp = typename _VectorTraits<_To>::value_type;
174	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
175	static_cast<_Tp>(__c[_I])...};
176	}
177
178	template <typename _To, typename _From, size_t... _I>
179	_GLIBCXX_SIMD_INTRINSIC constexpr _To
180	__vector_convert(_From __a, _From __b, _From __c, _From __d,
181	index_sequence<_I...>)
182	{
183	using _Tp = typename _VectorTraits<_To>::value_type;
184	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
185	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
186	}
187
188	template <typename _To, typename _From, size_t... _I>
189	_GLIBCXX_SIMD_INTRINSIC constexpr _To
190	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
191	index_sequence<_I...>)
192	{
193	using _Tp = typename _VectorTraits<_To>::value_type;
194	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
195	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
196	static_cast<_Tp>(__e[_I])...};
197	}
198
199	template <typename _To, typename _From, size_t... _I>
200	_GLIBCXX_SIMD_INTRINSIC constexpr _To
201	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
202	_From __f, index_sequence<_I...>)
203	{
204	using _Tp = typename _VectorTraits<_To>::value_type;
205	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
206	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
207	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
208	}
209
210	template <typename _To, typename _From, size_t... _I>
211	_GLIBCXX_SIMD_INTRINSIC constexpr _To
212	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
213	_From __f, _From __g, index_sequence<_I...>)
214	{
215	using _Tp = typename _VectorTraits<_To>::value_type;
216	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
217	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
218	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
219	static_cast<_Tp>(__g[_I])...};
220	}
221
222	template <typename _To, typename _From, size_t... _I>
223	_GLIBCXX_SIMD_INTRINSIC constexpr _To
224	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
225	_From __f, _From __g, _From __h, index_sequence<_I...>)
226	{
227	using _Tp = typename _VectorTraits<_To>::value_type;
228	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
229	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
230	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
231	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
232	}
233
234	template <typename _To, typename _From, size_t... _I>
235	_GLIBCXX_SIMD_INTRINSIC constexpr _To
236	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
237	_From __f, _From __g, _From __h, _From __i,
238	index_sequence<_I...>)
239	{
240	using _Tp = typename _VectorTraits<_To>::value_type;
241	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
242	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
243	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
244	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
245	static_cast<_Tp>(__i[_I])...};
246	}
247
248	template <typename _To, typename _From, size_t... _I>
249	_GLIBCXX_SIMD_INTRINSIC constexpr _To
250	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
251	_From __f, _From __g, _From __h, _From __i, _From __j,
252	index_sequence<_I...>)
253	{
254	using _Tp = typename _VectorTraits<_To>::value_type;
255	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
256	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
257	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
258	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
259	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
260	}
261
262	template <typename _To, typename _From, size_t... _I>
263	_GLIBCXX_SIMD_INTRINSIC constexpr _To
264	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
265	_From __f, _From __g, _From __h, _From __i, _From __j,
266	_From __k, index_sequence<_I...>)
267	{
268	using _Tp = typename _VectorTraits<_To>::value_type;
269	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
270	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
271	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
272	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
273	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
274	static_cast<_Tp>(__k[_I])...};
275	}
276
277	template <typename _To, typename _From, size_t... _I>
278	_GLIBCXX_SIMD_INTRINSIC constexpr _To
279	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
280	_From __f, _From __g, _From __h, _From __i, _From __j,
281	_From __k, _From __l, index_sequence<_I...>)
282	{
283	using _Tp = typename _VectorTraits<_To>::value_type;
284	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
285	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
286	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
287	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
288	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
289	static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
290	}
291
292	template <typename _To, typename _From, size_t... _I>
293	_GLIBCXX_SIMD_INTRINSIC constexpr _To
294	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
295	_From __f, _From __g, _From __h, _From __i, _From __j,
296	_From __k, _From __l, _From __m, index_sequence<_I...>)
297	{
298	using _Tp = typename _VectorTraits<_To>::value_type;
299	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
300	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
301	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
302	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
303	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
304	static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
305	static_cast<_Tp>(__m[_I])...};
306	}
307
308	template <typename _To, typename _From, size_t... _I>
309	_GLIBCXX_SIMD_INTRINSIC constexpr _To
310	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
311	_From __f, _From __g, _From __h, _From __i, _From __j,
312	_From __k, _From __l, _From __m, _From __n,
313	index_sequence<_I...>)
314	{
315	using _Tp = typename _VectorTraits<_To>::value_type;
316	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
317	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
318	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
319	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
320	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
321	static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
322	static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
323	}
324
325	template <typename _To, typename _From, size_t... _I>
326	_GLIBCXX_SIMD_INTRINSIC constexpr _To
327	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
328	_From __f, _From __g, _From __h, _From __i, _From __j,
329	_From __k, _From __l, _From __m, _From __n, _From __o,
330	index_sequence<_I...>)
331	{
332	using _Tp = typename _VectorTraits<_To>::value_type;
333	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
334	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
335	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
336	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
337	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
338	static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
339	static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
340	static_cast<_Tp>(__o[_I])...};
341	}
342
343	template <typename _To, typename _From, size_t... _I>
344	_GLIBCXX_SIMD_INTRINSIC constexpr _To
345	__vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
346	_From __f, _From __g, _From __h, _From __i, _From __j,
347	_From __k, _From __l, _From __m, _From __n, _From __o,
348	_From __p, index_sequence<_I...>)
349	{
350	using _Tp = typename _VectorTraits<_To>::value_type;
351	return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
352	static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
353	static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
354	static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
355	static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
356	static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
357	static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
358	static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
359	}
360
361	// Defer actual conversion to the overload that takes an index sequence. Note
362	// that this function adds zeros or drops values off the end if you don't ensure
363	// matching width.
364	template <typename _To, typename... _From, size_t _FromSize>
365	_GLIBCXX_SIMD_INTRINSIC constexpr _To
366	__vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
367	{
368	#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
369	using _From0 = __first_of_pack_t<_From...>;
370	using _FW = _SimdWrapper<_From0, _FromSize>;
371	if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
372	{
373	if constexpr ((sizeof...(_From) & (sizeof...(_From) - `1`))
374	== `0`) // power-of-two number of arguments
375	return __convert_x86<_To>(__as_vector(__xs)...);
376	else // append zeros and recurse until the above branch is taken
377	return __vector_convert<_To>(__xs..., _FW{});
378	}
379	else
380	#endif
381	return __vector_convert<_To>(
382	__as_vector(__xs)...,
383	make_index_sequence<(sizeof...(__xs) == `1` ? std::min(
384	_VectorTraits<_To>::_S_full_size, int(_FromSize))
385	: _FromSize)>());
386	}
387
388	// }}}
389	// __convert function{{{
390	template <typename _To, typename _From, typename... _More>
391	_GLIBCXX_SIMD_INTRINSIC constexpr auto
392	__convert(_From __v0, _More... __vs)
393	{
394	static_assert((true && ... && is_same_v<_From, _More>) );
395	if constexpr (__is_vectorizable_v<_From>)
396	{
397	using _V = typename _VectorTraits<_To>::type;
398	using _Tp = typename _VectorTraits<_To>::value_type;
399	return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
400	}
401	else if constexpr (__is_vector_type_v<_From>)
402	return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
403	else // _SimdWrapper arguments
404	{
405	constexpr size_t __input_size = _From::_S_size * (`1` + sizeof...(_More));
406	if constexpr (__is_vectorizable_v<_To>)
407	return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
408	else if constexpr (!__is_vector_type_v<_To>)
409	return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
410	else
411	{
412	static_assert(
413	sizeof...(_More) == `0`
414	\|\| _VectorTraits<_To>::_S_full_size >= __input_size,
415	"__convert(...) requires the input to fit into the output");
416	return __vector_convert<_To>(__v0, __vs...);
417	}
418	}
419	}
420
421	// }}}
422	// __convert_all{{{
423	// Converts __v into array<_To, N>, where N is _NParts if non-zero or
424	// otherwise deduced from _To such that N #elements(_To) <= #elements(__v).*
425	// Note: this function may return less than all converted elements
426	template <typename _To,
427	size_t _NParts = `0`, // allows to convert fewer or more (only last
428	// _To, to be partially filled) than all
429	size_t _Offset = `0`, // where to start, # of elements (not Bytes or
430	// Parts)
431	typename _From, typename _FromVT = _VectorTraits<_From>>
432	_GLIBCXX_SIMD_INTRINSIC auto
433	__convert_all(_From __v)
434	{
435	if constexpr (is_arithmetic_v<_To> && _NParts != `1`)
436	{
437	static_assert(_Offset < _FromVT::_S_full_size);
438	constexpr auto _Np
439	= _NParts == `0` ? _FromVT::_S_partial_width - _Offset : _NParts;
440	return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
441	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
442	return static_cast<_To>(__v[__i + _Offset]);
443	});
444	}
445	else
446	{
447	static_assert(__is_vector_type_v<_To>);
448	using _ToVT = _VectorTraits<_To>;
449	if constexpr (__is_vector_type_v<_From>)
450	return __convert_all<_To, _NParts>(__as_wrapper(__v));
451	else if constexpr (_NParts == `1`)
452	{
453	static_assert(_Offset % _ToVT::_S_full_size == `0`);
454	return array<_To, `1`>{__vector_convert<_To>(
455	__extract_part<_Offset / _ToVT::_S_full_size,
456	__div_roundup(_FromVT::_S_partial_width,
457	_ToVT::_S_full_size)>(__v))};
458	}
459	#if _GLIBCXX_SIMD_X86INTRIN // {{{
460	else if constexpr (!__have_sse4_1 && _Offset == `0`
461	&& is_integral_v<typename _FromVT::value_type>
462	&& sizeof(typename _FromVT::value_type)
463	< sizeof(typename _ToVT::value_type)
464	&& !(sizeof(typename _FromVT::value_type) == `4`
465	&& is_same_v<typename _ToVT::value_type, double>))
466	{
467	using _ToT = typename _ToVT::value_type;
468	using _FromT = typename _FromVT::value_type;
469	constexpr size_t _Np
470	= _NParts != `0`
471	? _NParts
472	: (_FromVT::_S_partial_width / _ToVT::_S_full_size);
473	using _R = array<_To, _Np>;
474	// __adjust modifies its input to have _Np (use _SizeConstant)
475	// entries so that no unnecessary intermediate conversions are
476	// requested and, more importantly, no intermediate conversions are
477	// missing
478	[[maybe_unused]] auto __adjust
479	= [](auto __n,
480	auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
481	return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
482	};
483	[[maybe_unused]] const auto __vi = __to_intrin(__v);
484	auto&& __make_array
485	= [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
486	if constexpr (_Np == `1`)
487	return _R{__intrin_bitcast<_To>(__x0)};
488	else
489	return _R{__intrin_bitcast<_To>(__x0),
490	__intrin_bitcast<_To>(__x1)};
491	};
492
493	if constexpr (_Np == `0`)
494	return _R{};
495	else if constexpr (sizeof(_FromT) == `1` && sizeof(_ToT) == `2`)
496	{
497	static_assert(is_integral_v<_FromT>);
498	static_assert(is_integral_v<_ToT>);
499	if constexpr (is_unsigned_v<_FromT>)
500	return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
501	_mm_unpackhi_epi8(__vi, __m128i()));
502	else
503	return __make_array(
504	_mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), `8`),
505	_mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), `8`));
506	}
507	else if constexpr (sizeof(_FromT) == `2` && sizeof(_ToT) == `4`)
508	{
509	static_assert(is_integral_v<_FromT>);
510	if constexpr (is_floating_point_v<_ToT>)
511	{
512	const auto __ints
513	= __convert_all<__vector_type16_t<int>, _Np>(
514	__adjust(_SizeConstant<_Np * `4`>(), __v));
515	return __generate_from_n_evaluations<_Np, _R>(
516	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
517	return __vector_convert<_To>(__as_wrapper(__ints[__i]));
518	});
519	}
520	else if constexpr (is_unsigned_v<_FromT>)
521	return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
522	_mm_unpackhi_epi16(__vi, __m128i()));
523	else
524	return __make_array(
525	_mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), `16`),
526	_mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), `16`));
527	}
528	else if constexpr (sizeof(_FromT) == `4` && sizeof(_ToT) == `8`
529	&& is_integral_v<_FromT> && is_integral_v<_ToT>)
530	{
531	if constexpr (is_unsigned_v<_FromT>)
532	return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
533	_mm_unpackhi_epi32(__vi, __m128i()));
534	else
535	return __make_array(
536	_mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, `31`)),
537	_mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, `31`)));
538	}
539	else if constexpr (sizeof(_FromT) == `4` && sizeof(_ToT) == `8`
540	&& is_integral_v<_FromT> && is_integral_v<_ToT>)
541	{
542	if constexpr (is_unsigned_v<_FromT>)
543	return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
544	_mm_unpackhi_epi32(__vi, __m128i()));
545	else
546	return __make_array(
547	_mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, `31`)),
548	_mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, `31`)));
549	}
550	else if constexpr (sizeof(_FromT) == `1` && sizeof(_ToT) >= `4`
551	&& is_signed_v<_FromT>)
552	{
553	const __m128i __vv[`2`] = {_mm_unpacklo_epi8(__vi, __vi),
554	_mm_unpackhi_epi8(__vi, __vi)};
555	const __vector_type_t<int, `4`> __vvvv[`4`] = {
556	__vector_bitcast<int>(x: _mm_unpacklo_epi16(a: __vv[`0`], b: __vv[`0`])),
557	__vector_bitcast<int>(x: _mm_unpackhi_epi16(a: __vv[`0`], b: __vv[`0`])),
558	__vector_bitcast<int>(x: _mm_unpacklo_epi16(a: __vv[`1`], b: __vv[`1`])),
559	__vector_bitcast<int>(x: _mm_unpackhi_epi16(a: __vv[`1`], b: __vv[`1`]))};
560	if constexpr (sizeof(_ToT) == `4`)
561	return __generate_from_n_evaluations<_Np, _R>(
562	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
563	return __vector_convert<_To>(
564	_SimdWrapper<int, `4`>(__vvvv[__i] >> `24`));
565	});
566	else if constexpr (is_integral_v<_ToT>)
567	return __generate_from_n_evaluations<_Np, _R>(
568	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
569	const auto __signbits = __to_intrin(__vvvv[__i / `2`] >> `31`);
570	const auto __sx32 = __to_intrin(__vvvv[__i / `2`] >> `24`);
571	return __vector_bitcast<_ToT>(
572	__i % `2` == `0` ? _mm_unpacklo_epi32(__sx32, __signbits)
573	: _mm_unpackhi_epi32(__sx32, __signbits));
574	});
575	else
576	return __generate_from_n_evaluations<_Np, _R>(
577	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
578	const _SimdWrapper<int, `4`> __int4 = __vvvv[__i / `2`] >> `24`;
579	return __vector_convert<_To>(
580	__i % `2` == `0` ? __int4
581	: _SimdWrapper<int, `4`>(
582	_mm_unpackhi_epi64(a: __to_intrin(x: __int4),
583	b: __to_intrin(x: __int4))));
584	});
585	}
586	else if constexpr (sizeof(_FromT) == `1` && sizeof(_ToT) == `4`)
587	{
588	const auto __shorts = __convert_all<__vector_type16_t<
589	conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
590	__adjust(_SizeConstant<(_Np + `1`) / `2` * `8`>(), __v));
591	return __generate_from_n_evaluations<_Np, _R>(
592	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
593	return __convert_all<_To>(__shorts[__i / `2`])[__i % `2`];
594	});
595	}
596	else if constexpr (sizeof(_FromT) == `2` && sizeof(_ToT) == `8`
597	&& is_signed_v<_FromT> && is_integral_v<_ToT>)
598	{
599	const __m128i __vv[`2`] = {_mm_unpacklo_epi16(__vi, __vi),
600	_mm_unpackhi_epi16(__vi, __vi)};
601	const __vector_type16_t<int> __vvvv[`4`]
602	= {__vector_bitcast<int>(
603	x: _mm_unpacklo_epi32(a: _mm_srai_epi32(a: __vv[`0`], count: `16`),
604	b: _mm_srai_epi32(a: __vv[`0`], count: `31`))),
605	__vector_bitcast<int>(
606	x: _mm_unpackhi_epi32(a: _mm_srai_epi32(a: __vv[`0`], count: `16`),
607	b: _mm_srai_epi32(a: __vv[`0`], count: `31`))),
608	__vector_bitcast<int>(
609	x: _mm_unpacklo_epi32(a: _mm_srai_epi32(a: __vv[`1`], count: `16`),
610	b: _mm_srai_epi32(a: __vv[`1`], count: `31`))),
611	__vector_bitcast<int>(
612	x: _mm_unpackhi_epi32(a: _mm_srai_epi32(a: __vv[`1`], count: `16`),
613	b: _mm_srai_epi32(a: __vv[`1`], count: `31`)))};
614	return __generate_from_n_evaluations<_Np, _R>(
615	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
616	return __vector_bitcast<_ToT>(__vvvv[__i]);
617	});
618	}
619	else if constexpr (sizeof(_FromT) <= `2` && sizeof(_ToT) == `8`)
620	{
621	const auto __ints
622	= __convert_all<__vector_type16_t<conditional_t<
623	is_signed_v<_FromT> \|\| is_floating_point_v<_ToT>, int,
624	unsigned int>>>(
625	__adjust(_SizeConstant<(_Np + `1`) / `2` * `4`>(), __v));
626	return __generate_from_n_evaluations<_Np, _R>(
627	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
628	return __convert_all<_To>(__ints[__i / `2`])[__i % `2`];
629	});
630	}
631	else
632	__assert_unreachable<_To>();
633	}
634	#endif // _GLIBCXX_SIMD_X86INTRIN }}}
635	else if constexpr ((_FromVT::_S_partial_width - _Offset)
636	> _ToVT::_S_full_size)
637	{
638	/*
639	static_assert(
640	(_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) ==
641	0,
642	"__convert_all only supports power-of-2 number of elements.
643	Otherwise " "the return type cannot be array<_To, N>.");
644	*/
645	constexpr size_t _NTotal
646	= (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
647	constexpr size_t _Np = _NParts == `0` ? _NTotal : _NParts;
648	static_assert(
649	_Np <= _NTotal
650	\|\| (_Np == _NTotal + `1`
651	&& (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
652	> `0`));
653	using _R = array<_To, _Np>;
654	if constexpr (_Np == `1`)
655	return _R{__vector_convert<_To>(
656	__extract_part<_Offset, _FromVT::_S_partial_width,
657	_ToVT::_S_full_size>(__v))};
658	else
659	return __generate_from_n_evaluations<_Np, _R>(
660	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
661	auto __part
662	= __extract_part<__i * _ToVT::_S_full_size + _Offset,
663	_FromVT::_S_partial_width,
664	_ToVT::_S_full_size>(__v);
665	return __vector_convert<_To>(__part);
666	});
667	}
668	else if constexpr (_Offset == `0`)
669	return array<_To, `1`>{__vector_convert<_To>(__v)};
670	else
671	return array<_To, `1`>{__vector_convert<_To>(
672	__extract_part<_Offset, _FromVT::_S_partial_width,
673	_FromVT::_S_partial_width - _Offset>(__v))};
674	}
675	}
676
677	// }}}
678
679	// _GnuTraits {{{
680	template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
681	struct _GnuTraits
682	{
683	using _IsValid = true_type;
684	using _SimdImpl = typename _Abi::_SimdImpl;
685	using _MaskImpl = typename _Abi::_MaskImpl;
686
687	// simd and simd_mask member types {{{
688	using _SimdMember = _SimdWrapper<_Tp, _Np>;
689	using _MaskMember = _SimdWrapper<_Mp, _Np>;
690	static constexpr size_t _S_simd_align = alignof(_SimdMember);
691	static constexpr size_t _S_mask_align = alignof(_MaskMember);
692
693	// }}}
694	// size metadata {{{
695	static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
696	static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;
697
698	// }}}
699	// _SimdBase / base class for simd, providing extra conversions {{{
700	struct _SimdBase2
701	{
702	_GLIBCXX_SIMD_ALWAYS_INLINE explicit
703	operator __intrinsic_type_t<_Tp, _Np>() const
704	{ return __to_intrin(static_cast<const simd<_Tp, _Abi>>(this*)->_M_data); }
705
706	_GLIBCXX_SIMD_ALWAYS_INLINE explicit
707	operator __vector_type_t<_Tp, _Np>() const
708	{ return __data(*static_cast<const simd<_Tp, _Abi>>(this*)); }
709	};
710
711	struct _SimdBase1
712	{
713	_GLIBCXX_SIMD_ALWAYS_INLINE explicit
714	operator __intrinsic_type_t<_Tp, _Np>() const
715	{ return __data(*static_cast<const simd<_Tp, _Abi>>(this*)); }
716	};
717
718	using _SimdBase = conditional_t<
719	is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
720	_SimdBase1, _SimdBase2>;
721
722	// }}}
723	// _MaskBase {{{
724	struct _MaskBase2
725	{
726	_GLIBCXX_SIMD_ALWAYS_INLINE explicit
727	operator __intrinsic_type_t<_Tp, _Np>() const
728	{ return static_cast<const simd_mask<_Tp, _Abi>>(this*) ->_M_data.__intrin(); }
729
730	_GLIBCXX_SIMD_ALWAYS_INLINE explicit
731	operator __vector_type_t<_Tp, _Np>() const
732	{ return static_cast<const simd_mask<_Tp, _Abi>>(this*)->_M_data._M_data; }
733	};
734
735	struct _MaskBase1
736	{
737	_GLIBCXX_SIMD_ALWAYS_INLINE explicit
738	operator __intrinsic_type_t<_Tp, _Np>() const
739	{ return __data(*static_cast<const simd_mask<_Tp, _Abi>>(this*)); }
740	};
741
742	using _MaskBase = conditional_t<
743	is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
744	_MaskBase1, _MaskBase2>;
745
746	// }}}
747	// _MaskCastType {{{
748	// parameter type of one explicit simd_mask constructor
749	class _MaskCastType
750	{
751	using _Up = __intrinsic_type_t<_Tp, _Np>;
752	_Up _M_data;
753
754	public:
755	_GLIBCXX_SIMD_ALWAYS_INLINE
756	_MaskCastType(_Up __x) : _M_data(__x) {}
757
758	_GLIBCXX_SIMD_ALWAYS_INLINE
759	operator _MaskMember() const { return _M_data; }
760	};
761
762	// }}}
763	// _SimdCastType {{{
764	// parameter type of one explicit simd constructor
765	class _SimdCastType1
766	{
767	using _Ap = __intrinsic_type_t<_Tp, _Np>;
768	_SimdMember _M_data;
769
770	public:
771	_GLIBCXX_SIMD_ALWAYS_INLINE constexpr
772	_SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
773
774	_GLIBCXX_SIMD_ALWAYS_INLINE constexpr
775	operator _SimdMember() const { return _M_data; }
776	};
777
778	class _SimdCastType2
779	{
780	using _Ap = __intrinsic_type_t<_Tp, _Np>;
781	using _Bp = __vector_type_t<_Tp, _Np>;
782	_SimdMember _M_data;
783
784	public:
785	_GLIBCXX_SIMD_ALWAYS_INLINE constexpr
786	_SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
787
788	_GLIBCXX_SIMD_ALWAYS_INLINE constexpr
789	_SimdCastType2(_Bp __b) : _M_data(__b) {}
790
791	_GLIBCXX_SIMD_ALWAYS_INLINE constexpr
792	operator _SimdMember() const { return _M_data; }
793	};
794
795	using _SimdCastType = conditional_t<
796	is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
797	_SimdCastType1, _SimdCastType2>;
798	//}}}
799	};
800
801	// }}}
802	struct _CommonImplX86;
803	struct _CommonImplNeon;
804	struct _CommonImplBuiltin;
805	template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplBuiltin;
806	template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplBuiltin;
807	template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplX86;
808	template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplX86;
809	template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplNeon;
810	template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplNeon;
811	template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplPpc;
812	template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplPpc;
813
814	// simd_abi::_VecBuiltin {{{
815	template <int _UsedBytes>
816	struct simd_abi::_VecBuiltin
817	{
818	template <typename _Tp>
819	static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
820
821	// validity traits {{{
822	struct _IsValidAbiTag : __bool_constant<(_UsedBytes > `1`)> {};
823
824	template <typename _Tp>
825	struct _IsValidSizeFor
826	: __bool_constant<(_UsedBytes / sizeof(_Tp) > `1`
827	&& _UsedBytes % sizeof(_Tp) == `0`
828	&& _UsedBytes <= __vectorized_sizeof<_Tp>()
829	&& (!__have_avx512f \|\| _UsedBytes <= `32`))> {};
830
831	template <typename _Tp>
832	struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
833	_IsValidSizeFor<_Tp>> {};
834
835	template <typename _Tp>
836	static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
837
838	// }}}
839	// _SimdImpl/_MaskImpl {{{
840	#if _GLIBCXX_SIMD_X86INTRIN
841	using _CommonImpl = _CommonImplX86;
842	using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
843	using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
844	#elif _GLIBCXX_SIMD_HAVE_NEON
845	using _CommonImpl = _CommonImplNeon;
846	using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
847	using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
848	#else
849	using _CommonImpl = _CommonImplBuiltin;
850	#ifdef __ALTIVEC__
851	using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
852	using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
853	#else
854	using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
855	using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
856	#endif
857	#endif
858
859	// }}}
860	// __traits {{{
861	template <typename _Tp>
862	using _MaskValueType = __int_for_sizeof_t<_Tp>;
863
864	template <typename _Tp>
865	using __traits
866	= conditional_t<_S_is_valid_v<_Tp>,
867	_GnuTraits<_Tp, _MaskValueType<_Tp>,
868	_VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
869	_InvalidTraits>;
870
871	//}}}
872	// size metadata {{{
873	template <typename _Tp>
874	static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
875
876	template <typename _Tp>
877	static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
878
879	// }}}
880	// implicit masks {{{
881	template <typename _Tp>
882	using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;
883
884	template <typename _Tp>
885	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
886	_S_implicit_mask()
887	{
888	using _UV = typename _MaskMember<_Tp>::_BuiltinType;
889	if constexpr (!_MaskMember<_Tp>::_S_is_partial)
890	return ~_UV();
891	else
892	{
893	constexpr auto __size = _S_size<_Tp>;
894	_GLIBCXX_SIMD_USE_CONSTEXPR auto __r
895	= __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
896	{ return __i < __size ? -`1` : `0`; });
897	return __r;
898	}
899	}
900
901	template <typename _Tp>
902	_GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>>
903	_S_implicit_mask_intrin()
904	{ return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); }
905
906	template <typename _TW, typename _TVT = _VectorTraits<_TW>>
907	_GLIBCXX_SIMD_INTRINSIC static constexpr _TW
908	_S_masked(_TW __x)
909	{
910	using _Tp = typename _TVT::value_type;
911	if constexpr (!_MaskMember<_Tp>::_S_is_partial)
912	return __x;
913	else
914	return __and(__as_vector(__x),
915	__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
916	}
917
918	template <typename _TW, typename _TVT = _VectorTraits<_TW>>
919	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
920	__make_padding_nonzero(_TW __x)
921	{
922	using _Tp = typename _TVT::value_type;
923	if constexpr (!_S_is_partial<_Tp>)
924	return __x;
925	else
926	{
927	_GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
928	= __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
929	if constexpr (is_integral_v<_Tp>)
930	return __or(__x, ~__implicit_mask);
931	else
932	{
933	_GLIBCXX_SIMD_USE_CONSTEXPR auto __one
934	= __andnot(__implicit_mask,
935	__vector_broadcast<_S_full_size<_Tp>>(_Tp(`1`)));
936	// it's not enough to return `x \| 1_in_padding` because the
937	// padding in x might be inf or nan (independent of
938	// __FINITE_MATH_ONLY__, because it's about padding bits)
939	return __or(__and(__x, __implicit_mask), __one);
940	}
941	}
942	}
943	// }}}
944	};
945
946	// }}}
947	// simd_abi::_VecBltnBtmsk {{{
948	template <int _UsedBytes>
949	struct simd_abi::_VecBltnBtmsk
950	{
951	template <typename _Tp>
952	static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
953
954	// validity traits {{{
955	struct _IsValidAbiTag : __bool_constant<(_UsedBytes > `1`)> {};
956
957	template <typename _Tp>
958	struct _IsValidSizeFor
959	: __bool_constant<(_UsedBytes / sizeof(_Tp) > `1`
960	&& _UsedBytes % sizeof(_Tp) == `0` && _UsedBytes <= `64`
961	&& (_UsedBytes > `32` \|\| __have_avx512vl))> {};
962
963	// Bitmasks require at least AVX512F. If sizeof(_Tp) < 4 the AVX512BW is also
964	// required.
965	template <typename _Tp>
966	struct _IsValid
967	: conjunction<
968	_IsValidAbiTag, __bool_constant<__have_avx512f>,
969	__bool_constant<__have_avx512bw \|\| (sizeof(_Tp) >= `4`)>,
970	__bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
971	_IsValidSizeFor<_Tp>> {};
972
973	template <typename _Tp>
974	static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
975
976	// }}}
977	// simd/_MaskImpl {{{
978	#if _GLIBCXX_SIMD_X86INTRIN
979	using _CommonImpl = _CommonImplX86;
980	using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
981	using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
982	#else
983	template <int>
984	struct _MissingImpl;
985
986	using _CommonImpl = _MissingImpl<_UsedBytes>;
987	using _SimdImpl = _MissingImpl<_UsedBytes>;
988	using _MaskImpl = _MissingImpl<_UsedBytes>;
989	#endif
990
991	// }}}
992	// __traits {{{
993	template <typename _Tp>
994	using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;
995
996	template <typename _Tp>
997	using __traits = conditional_t<
998	_S_is_valid_v<_Tp>,
999	_GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
1000	_InvalidTraits>;
1001
1002	//}}}
1003	// size metadata {{{
1004	template <typename _Tp>
1005	static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1006	template <typename _Tp>
1007	static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1008
1009	// }}}
1010	// implicit mask {{{
1011	private:
1012	template <typename _Tp>
1013	using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;
1014
1015	public:
1016	template <size_t _Np>
1017	_GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
1018	__implicit_mask_n()
1019	{
1020	using _Tp = __bool_storage_member_type_t<_Np>;
1021	return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((`1ULL` << _Np) - `1`) : ~_Tp();
1022	}
1023
1024	template <typename _Tp>
1025	_GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
1026	_S_implicit_mask()
1027	{ return __implicit_mask_n<_S_size<_Tp>>(); }
1028
1029	template <typename _Tp>
1030	_GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>>
1031	_S_implicit_mask_intrin()
1032	{ return __implicit_mask_n<_S_size<_Tp>>(); }
1033
1034	template <typename _Tp, size_t _Np>
1035	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1036	_S_masked(_SimdWrapper<_Tp, _Np> __x)
1037	{
1038	if constexpr (is_same_v<_Tp, bool>)
1039	if constexpr (_Np < `8` \|\| (_Np & (_Np - `1`)) != `0`)
1040	return _MaskImpl::_S_bit_and(
1041	__x, _SimdWrapper<_Tp, _Np>(
1042	__bool_storage_member_type_t<_Np>((`1ULL` << _Np) - `1`)));
1043	else
1044	return __x;
1045	else
1046	return _S_masked(__x._M_data);
1047	}
1048
1049	template <typename _TV>
1050	_GLIBCXX_SIMD_INTRINSIC static constexpr _TV
1051	_S_masked(_TV __x)
1052	{
1053	using _Tp = typename _VectorTraits<_TV>::value_type;
1054	static_assert(
1055	!__is_bitmask_v<_TV>,
1056	"_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
1057	"know the number of elements. Use _SimdWrapper<bool, N> instead.");
1058	if constexpr (_S_is_partial<_Tp>)
1059	{
1060	constexpr size_t _Np = _S_size<_Tp>;
1061	return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1062	_S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
1063	_SimdWrapper<_Tp, _Np>(__x));
1064	}
1065	else
1066	return __x;
1067	}
1068
1069	template <typename _TV, typename _TVT = _VectorTraits<_TV>>
1070	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
1071	__make_padding_nonzero(_TV __x)
1072	{
1073	using _Tp = typename _TVT::value_type;
1074	if constexpr (!_S_is_partial<_Tp>)
1075	return __x;
1076	else
1077	{
1078	constexpr size_t _Np = _S_size<_Tp>;
1079	if constexpr (is_integral_v<typename _TVT::value_type>)
1080	return __x
1081	\| __generate_vector<_Tp, _S_full_size<_Tp>>(
1082	[](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
1083	if (__i < _Np)
1084	return `0`;
1085	else
1086	return `1`;
1087	});
1088	else
1089	return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1090	_S_implicit_mask<_Tp>(),
1091	_SimdWrapper<_Tp, _Np>(
1092	__vector_broadcast<_S_full_size<_Tp>>(_Tp(`1`))),
1093	_SimdWrapper<_Tp, _Np>(__x))
1094	._M_data;
1095	}
1096	}
1097
1098	// }}}
1099	};
1100
1101	//}}}
1102	// _CommonImplBuiltin {{{
1103	struct _CommonImplBuiltin
1104	{
1105	// _S_converts_via_decomposition{{{
1106	// This lists all cases where a __vector_convert needs to fall back to
1107	// conversion of individual scalars (i.e. decompose the input vector into
1108	// scalars, convert, compose output vector). In those cases, _S_masked_load &
1109	// _S_masked_store prefer to use the _S_bit_iteration implementation.
1110	template <typename _From, typename _To, size_t _ToSize>
1111	static inline constexpr bool __converts_via_decomposition_v
1112	= sizeof(_From) != sizeof(_To);
1113
1114	// }}}
1115	// _S_load{{{
1116	template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
1117	_GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
1118	_S_load(const void* __p)
1119	{
1120	static_assert(_Np > `1`);
1121	static_assert(_Bytes % sizeof(_Tp) == `0`);
1122	using _Rp = __vector_type_t<_Tp, _Np>;
1123	if constexpr (sizeof(_Rp) == _Bytes)
1124	{
1125	_Rp __r;
1126	__builtin_memcpy(&__r, __p, _Bytes);
1127	return __r;
1128	}
1129	else
1130	{
1131	#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
1132	using _Up = conditional_t<
1133	is_integral_v<_Tp>,
1134	conditional_t<_Bytes % `4` == `0`,
1135	conditional_t<_Bytes % `8` == `0`, long long, int>,
1136	conditional_t<_Bytes % `2` == `0`, short, signed char>>,
1137	conditional_t<(_Bytes < `8` \|\| _Np % `2` == `1` \|\| _Np == `2`), _Tp,
1138	double>>;
1139	using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
1140	if constexpr (sizeof(_V) != sizeof(_Rp))
1141	{ // on i386 with 4 < _Bytes <= 8
1142	_Rp __r{};
1143	__builtin_memcpy(&__r, __p, _Bytes);
1144	return __r;
1145	}
1146	else
1147	#else // _GLIBCXX_SIMD_WORKAROUND_PR90424
1148	using _V = _Rp;
1149	#endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1150	{
1151	_V __r{};
1152	static_assert(_Bytes <= sizeof(_V));
1153	__builtin_memcpy(&__r, __p, _Bytes);
1154	return reinterpret_cast<_Rp>(__r);
1155	}
1156	}
1157	}
1158
1159	// }}}
1160	// _S_store {{{
1161	template <size_t _Bytes>
1162	_GLIBCXX_SIMD_INTRINSIC static void
1163	_S_memcpy(char* __dst, const char* __src)
1164	{
1165	if constexpr (_Bytes > `0`)
1166	{
1167	constexpr size_t _Ns = std::__bit_floor(x: _Bytes);
1168	__builtin_memcpy(__dst, __src, _Ns);
1169	_S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns);
1170	}
1171	}
1172
1173	template <size_t _ReqBytes = `0`, typename _TV>
1174	_GLIBCXX_SIMD_INTRINSIC static void
1175	_S_store(_TV __x, void* __addr)
1176	{
1177	constexpr size_t _Bytes = _ReqBytes == `0` ? sizeof(__x) : _ReqBytes;
1178	static_assert(sizeof(__x) >= _Bytes);
1179
1180	#if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424
1181	if constexpr (__is_vector_type_v<_TV>)
1182	_S_memcpy<_Bytes>(reinterpret_cast<char>(__addr), reinterpret_cast<const* char*>(&__x));
1183	else
1184	#endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1185	__builtin_memcpy(__addr, &__x, _Bytes);
1186	}
1187
1188	template <typename _Tp, size_t _Np>
1189	_GLIBCXX_SIMD_INTRINSIC static void
1190	_S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
1191	{ _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }
1192
1193	// }}}
1194	// _S_store_bool_array(_BitMask) {{{
1195	template <size_t _Np, bool _Sanitized>
1196	_GLIBCXX_SIMD_INTRINSIC static constexpr void
1197	_S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
1198	{
1199	if constexpr (_Np == `1`)
1200	__mem[`0`] = __x[`0`];
1201	else if (__builtin_is_constant_evaluated())
1202	{
1203	for (size_t __i = `0`; __i < _Np; ++__i)
1204	__mem[__i] = __x[__i];
1205	}
1206	else if constexpr (_Np == `2`)
1207	{
1208	short __bool2 = (__x._M_to_bits() * `0x81`) & `0x0101`;
1209	_S_store<_Np>(__bool2, __mem);
1210	}
1211	else if constexpr (_Np == `3`)
1212	{
1213	int __bool3 = (__x._M_to_bits() * `0x4081`) & `0x010101`;
1214	_S_store<_Np>(__bool3, __mem);
1215	}
1216	else
1217	{
1218	__execute_n_times<__div_roundup(a: _Np, b: `4`)>(
1219	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1220	constexpr int __offset = __i * `4`;
1221	constexpr int __remaining = _Np - __offset;
1222	if constexpr (__remaining > `4` && __remaining <= `7`)
1223	{
1224	const _ULLong __bool7
1225	= (__x.template _M_extract<__offset>()._M_to_bits()
1226	* `0x40810204081ULL`)
1227	& `0x0101010101010101ULL`;
1228	_S_store<__remaining>(__bool7, __mem + __offset);
1229	}
1230	else if constexpr (__remaining >= `4`)
1231	{
1232	int __bits = __x.template _M_extract<__offset>()._M_to_bits();
1233	if constexpr (__remaining > `7`)
1234	__bits &= `0xf`;
1235	const int __bool4 = (__bits * `0x204081`) & `0x01010101`;
1236	_S_store<`4`>(x: __bool4, addr: __mem + __offset);
1237	}
1238	});
1239	}
1240	}
1241
1242	// }}}
1243	// _S_blend{{{
1244	template <typename _Tp, size_t _Np>
1245	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
1246	_S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
1247	_SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
1248	{ return __k._M_data ? __at1._M_data : __at0._M_data; }
1249
1250	// }}}
1251	};
1252
1253	// }}}
1254	// _SimdImplBuiltin {{{1
1255	template <typename _Abi, typename>
1256	struct _SimdImplBuiltin
1257	{
1258	// member types {{{2
1259	template <typename _Tp>
1260	static constexpr size_t _S_max_store_size = `16`;
1261
1262	using abi_type = _Abi;
1263
1264	template <typename _Tp>
1265	using _TypeTag = _Tp*;
1266
1267	template <typename _Tp>
1268	using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
1269
1270	template <typename _Tp>
1271	using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
1272
1273	template <typename _Tp>
1274	static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
1275
1276	template <typename _Tp>
1277	static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
1278
1279	using _CommonImpl = typename _Abi::_CommonImpl;
1280	using _SuperImpl = typename _Abi::_SimdImpl;
1281	using _MaskImpl = typename _Abi::_MaskImpl;
1282
1283	// _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
1284	template <typename _Tp, size_t _Np>
1285	_GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1286	_M_make_simd(_SimdWrapper<_Tp, _Np> __x)
1287	{ return {__private_init, __x}; }
1288
1289	template <typename _Tp, size_t _Np>
1290	_GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1291	_M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
1292	{ return {__private_init, __vector_bitcast<_Tp>(__x)}; }
1293
1294	// _S_broadcast {{{2
1295	template <typename _Tp>
1296	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1297	_S_broadcast(_Tp __x) noexcept
1298	{ return __vector_broadcast<_S_full_size<_Tp>>(__x); }
1299
1300	// _S_generator {{{2
1301	template <typename _Fp, typename _Tp>
1302	inline static constexpr _SimdMember<_Tp>
1303	_S_generator(_Fp&& __gen, _TypeTag<_Tp>)
1304	{
1305	return __generate_vector<_Tp, _S_full_size<_Tp>>(
1306	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1307	if constexpr (__i < _S_size<_Tp>)
1308	return __gen(__i);
1309	else
1310	return `0`;
1311	});
1312	}
1313
1314	// _S_load {{{2
1315	template <typename _Tp, typename _Up>
1316	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1317	_S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
1318	{
1319	constexpr size_t _Np = _S_size<_Tp>;
1320	constexpr size_t __max_load_size
1321	= (sizeof(_Up) >= `4` && __have_avx512f) \|\| __have_avx512bw ? `64`
1322	: (is_floating_point_v<_Up> && __have_avx) \|\| __have_avx2 ? `32`
1323	: `16`;
1324	constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
1325	if (__builtin_is_constant_evaluated())
1326	return __generate_vector<_Tp, _S_full_size<_Tp>>(
1327	[&](auto __i) constexpr {
1328	return static_cast<_Tp>(__i < _Np ? __mem[__i] : `0`);
1329	});
1330	else if constexpr (sizeof(_Up) > `8` or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1331	return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
1332	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1333	return static_cast<_Tp>(__i < _Np ? __mem[__i] : `0`);
1334	});
1335	else if constexpr (is_same_v<_Up, _Tp>)
1336	return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
1337	_Np * sizeof(_Tp)>(__mem);
1338	else if constexpr (__bytes_to_load <= __max_load_size)
1339	return __convert<_SimdMember<_Tp>>(
1340	_CommonImpl::template _S_load<_Up, _Np>(__mem));
1341	else if constexpr (__bytes_to_load % __max_load_size == `0`)
1342	{
1343	constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
1344	constexpr size_t __elements_per_load = _Np / __n_loads;
1345	return __call_with_n_evaluations<__n_loads>(
1346	[](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1347	return __convert<_SimdMember<_Tp>>(__uncvted...);
1348	}, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1349	return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1350	__mem + __i * __elements_per_load);
1351	});
1352	}
1353	else if constexpr (__bytes_to_load % (__max_load_size / `2`) == `0`
1354	&& __max_load_size > `16`)
1355	{ // e.g. int[] -> <char, 12> with AVX2
1356	constexpr size_t __n_loads
1357	= __bytes_to_load / (__max_load_size / `2`);
1358	constexpr size_t __elements_per_load = _Np / __n_loads;
1359	return __call_with_n_evaluations<__n_loads>(
1360	[](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1361	return __convert<_SimdMember<_Tp>>(__uncvted...);
1362	}, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1363	return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1364	__mem + __i * __elements_per_load);
1365	});
1366	}
1367	else // e.g. int[] -> <char, 9>
1368	return __call_with_subscripts(
1369	__mem, make_index_sequence<_Np>(),
1370	[](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1371	return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
1372	});
1373	}
1374
1375	// _S_masked_load {{{2
1376	template <typename _Tp, size_t _Np, typename _Up>
1377	static constexpr inline _SimdWrapper<_Tp, _Np>
1378	_S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
1379	const _Up* __mem) noexcept
1380	{
1381	_BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1382	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1383	__merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1384	});
1385	return __merge;
1386	}
1387
1388	// _S_store {{{2
1389	template <typename _Tp, typename _Up>
1390	_GLIBCXX_SIMD_INTRINSIC static constexpr void
1391	_S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
1392	{
1393	// TODO: converting int -> "smaller int" can be optimized with AVX512
1394	constexpr size_t _Np = _S_size<_Tp>;
1395	constexpr size_t __max_store_size
1396	= _SuperImpl::template _S_max_store_size<_Up>;
1397	if (__builtin_is_constant_evaluated())
1398	{
1399	for (size_t __i = `0`; __i < _Np; ++__i)
1400	__mem[__i] = __v[__i];
1401	}
1402	else if constexpr (sizeof(_Up) > `8` or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1403	__execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1404	__mem[__i] = __v[__i];
1405	});
1406	else if constexpr (is_same_v<_Up, _Tp>)
1407	_CommonImpl::_S_store(__v, __mem);
1408	else if constexpr (sizeof(_Up) * _Np <= __max_store_size)
1409	_CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)),
1410	__mem);
1411	else
1412	{
1413	constexpr size_t __vsize = __max_store_size / sizeof(_Up);
1414	// round up to convert the last partial vector as well:
1415	constexpr size_t __stores = __div_roundup(a: _Np, b: __vsize);
1416	constexpr size_t __full_stores = _Np / __vsize;
1417	using _V = __vector_type_t<_Up, __vsize>;
1418	const array<_V, __stores> __converted
1419	= __convert_all<_V, __stores>(__v);
1420	__execute_n_times<__full_stores>(
1421	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1422	_CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
1423	});
1424	if constexpr (__full_stores < __stores)
1425	_CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
1426	* sizeof(_Up)>(
1427	__converted[__full_stores], __mem + __full_stores * __vsize);
1428	}
1429	}
1430
1431	// _S_masked_store_nocvt {{{2
1432	template <typename _Tp, size_t _Np>
1433	_GLIBCXX_SIMD_INTRINSIC static constexpr void
1434	_S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k)
1435	{
1436	_BitOps::_S_bit_iteration(
1437	_MaskImpl::_S_to_bits(__k),
1438	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1439	__mem[__i] = __v[__i];
1440	});
1441	}
1442
1443	// _S_masked_store {{{2
1444	template <typename _TW, typename _TVT = _VectorTraits<_TW>,
1445	typename _Tp = typename _TVT::value_type, typename _Up>
1446	static constexpr inline void
1447	_S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept
1448	{
1449	constexpr size_t _TV_size = _S_size<_Tp>;
1450	[[maybe_unused]] const auto __vi = __to_intrin(__v);
1451	constexpr size_t __max_store_size
1452	= _SuperImpl::template _S_max_store_size<_Up>;
1453	if constexpr (
1454	is_same_v<
1455	_Tp,
1456	_Up> \|\| (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up)))
1457	{
1458	// bitwise or no conversion, reinterpret:
1459	const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1460	if constexpr (__is_bitmask_v<decltype(__k)>)
1461	return _MaskMember<_Up>(__k._M_data);
1462	else
1463	return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k);
1464	}();
1465	_SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v),
1466	__mem, __kk);
1467	}
1468	else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
1469	&& !_CommonImpl::
1470	template __converts_via_decomposition_v<
1471	_Tp, _Up, __max_store_size>)
1472	{ // conversion via decomposition is better handled via the
1473	// bit_iteration
1474	// fallback below
1475	constexpr size_t _UW_size
1476	= std::min(a: _TV_size, b: __max_store_size / sizeof(_Up));
1477	static_assert(_UW_size <= _TV_size);
1478	using _UW = _SimdWrapper<_Up, _UW_size>;
1479	using _UV = __vector_type_t<_Up, _UW_size>;
1480	using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
1481	if constexpr (_UW_size == _TV_size) // one convert+store
1482	{
1483	const _UW __converted = __convert<_UW>(__v);
1484	_UAbi::_SimdImpl::_S_masked_store_nocvt(
1485	__converted, __mem,
1486	_UAbi::_MaskImpl::template _S_convert<
1487	__int_for_sizeof_t<_Up>>(__k));
1488	}
1489	else
1490	{
1491	static_assert(_UW_size * sizeof(_Up) == __max_store_size);
1492	constexpr size_t _NFullStores = _TV_size / _UW_size;
1493	constexpr size_t _NAllStores
1494	= __div_roundup(a: _TV_size, b: _UW_size);
1495	constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
1496	const array<_UV, _NAllStores> __converted
1497	= __convert_all<_UV, _NAllStores>(__v);
1498	__execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1499	_UAbi::_SimdImpl::_S_masked_store_nocvt(
1500	_UW(__converted[__i]), __mem + __i * _UW_size,
1501	_UAbi::_MaskImpl::template _S_convert<
1502	__int_for_sizeof_t<_Up>>(
1503	__extract_part<__i, _NParts>(__k.__as_full_vector())));
1504	});
1505	if constexpr (_NAllStores
1506	> _NFullStores) // one partial at the end
1507	_UAbi::_SimdImpl::_S_masked_store_nocvt(
1508	_UW(__converted[_NFullStores]),
1509	__mem + _NFullStores * _UW_size,
1510	_UAbi::_MaskImpl::template _S_convert<
1511	__int_for_sizeof_t<_Up>>(
1512	__extract_part<_NFullStores, _NParts>(
1513	__k.__as_full_vector())));
1514	}
1515	}
1516	else
1517	_BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1518	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1519	__mem[__i] = static_cast<_Up>(__v[__i]);
1520	});
1521	}
1522
1523	// _S_complement {{{2
1524	template <typename _Tp, size_t _Np>
1525	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1526	_S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1527	{
1528	if constexpr (is_floating_point_v<_Tp>)
1529	return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x));
1530	else
1531	return ~__x._M_data;
1532	}
1533
1534	// _S_unary_minus {{{2
1535	template <typename _Tp, size_t _Np>
1536	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1537	_S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1538	{
1539	// GCC doesn't use the psign instructions, but pxor & psub seem to be
1540	// just as good a choice as pcmpeqd & psign. So meh.
1541	return -__x._M_data;
1542	}
1543
1544	// arithmetic operators {{{2
1545	template <typename _Tp, size_t _Np>
1546	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1547	_S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1548	{ return __x._M_data + __y._M_data; }
1549
1550	template <typename _Tp, size_t _Np>
1551	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1552	_S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1553	{ return __x._M_data - __y._M_data; }
1554
1555	template <typename _Tp, size_t _Np>
1556	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1557	_S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1558	{ return __x._M_data * __y._M_data; }
1559
1560	template <typename _Tp, size_t _Np>
1561	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1562	_S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1563	{
1564	// Note that division by 0 is always UB, so we must ensure we avoid the
1565	// case for partial registers
1566	if constexpr (!_Abi::template _S_is_partial<_Tp>)
1567	return __x._M_data / __y._M_data;
1568	else
1569	return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1570	}
1571
1572	template <typename _Tp, size_t _Np>
1573	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1574	_S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1575	{
1576	if constexpr (!_Abi::template _S_is_partial<_Tp>)
1577	return __x._M_data % __y._M_data;
1578	else
1579	return __as_vector(__x)
1580	% _Abi::__make_padding_nonzero(__as_vector(__y));
1581	}
1582
1583	template <typename _Tp, size_t _Np>
1584	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1585	_S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1586	{ return __and(__x, __y); }
1587
1588	template <typename _Tp, size_t _Np>
1589	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1590	_S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1591	{ return __or(__x, __y); }
1592
1593	template <typename _Tp, size_t _Np>
1594	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1595	_S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1596	{ return __xor(__x, __y); }
1597
1598	template <typename _Tp, size_t _Np>
1599	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1600	_S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1601	{ return __x._M_data << __y._M_data; }
1602
1603	template <typename _Tp, size_t _Np>
1604	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1605	_S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1606	{ return __x._M_data >> __y._M_data; }
1607
1608	template <typename _Tp, size_t _Np>
1609	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1610	_S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
1611	{ return __x._M_data << __y; }
1612
1613	template <typename _Tp, size_t _Np>
1614	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1615	_S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1616	{ return __x._M_data >> __y; }
1617
1618	// compares {{{2
1619	// _S_equal_to {{{3
1620	template <typename _Tp, size_t _Np>
1621	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1622	_S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1623	{ return __x._M_data == __y._M_data; }
1624
1625	// _S_not_equal_to {{{3
1626	template <typename _Tp, size_t _Np>
1627	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1628	_S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1629	{ return __x._M_data != __y._M_data; }
1630
1631	// _S_less {{{3
1632	template <typename _Tp, size_t _Np>
1633	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1634	_S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1635	{ return __x._M_data < __y._M_data; }
1636
1637	// _S_less_equal {{{3
1638	template <typename _Tp, size_t _Np>
1639	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1640	_S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1641	{ return __x._M_data <= __y._M_data; }
1642
1643	// _S_negate {{{2
1644	template <typename _Tp, size_t _Np>
1645	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1646	_S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1647	{ return !__x._M_data; }
1648
1649	// _S_min, _S_max, _S_minmax {{{2
1650	template <typename _Tp, size_t _Np>
1651	_GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1652	_SimdWrapper<_Tp, _Np>
1653	_S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1654	{ return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1655
1656	template <typename _Tp, size_t _Np>
1657	_GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1658	_SimdWrapper<_Tp, _Np>
1659	_S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1660	{ return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1661
1662	template <typename _Tp, size_t _Np>
1663	_GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1664	pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1665	_S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1666	{
1667	return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1668	__a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1669	}
1670
1671	// reductions {{{2
// Helper used by _S_reduce for partial ABIs with an even element count _Np.
// The lower _Np/2 elements of __x are combined via __binary_op with the upper
// _Np/2 elements (moved to the front by __vector_permute); the combined value
// is then reduced with the implementation deduced for _Np/2 elements.
//  - _Is...:    added to _Np/2 to form the permute indices of the upper half.
//  - _Zeros...: only the pack length matters; each entry expands to index -1
//               (presumably a "don't care" lane for __vector_permute -- confirm).
1672	template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1673	typename _BinaryOperation>
1674	_GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1675	_S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
1676	simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1677	{
1678	using _V = __vector_type_t<_Tp, _Np / `2`>;
1679	static_assert(sizeof(_V) <= sizeof(__x));
1680	// _S_full_size is the size of the smallest native SIMD register that
1681	// can store _Np/2 elements:
1682	using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
1683	using _HalfSimd = __deduced_simd<_Tp, _Np / `2`>;
1684	const auto __xx = __as_vector(__x);
// __binary_op is applied on _FullSimd operands; its result is converted to
// _HalfSimd and handed to that ABI's _S_reduce together with __binary_op.
1685	return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
1686	static_cast<_HalfSimd>(__as_vector(__binary_op(
1687	static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
1688	static_cast<_FullSimd>(__intrin_bitcast<_V>(
1689	__vector_permute<(_Np / `2` + _Is)..., (int(_Zeros * `0`) - `1`)...>(
1690	__xx)))))),
1691	__binary_op);
1692	}
1693
// Reduces all elements of __x to a single _Tp by combining them with
// __binary_op. The strategy is selected from the element count _Np, from
// whether the call is constant-evaluated, from whether the ABI is partial for
// _Tp, and from sizeof(__x).
1694	template <typename _Tp, typename _BinaryOperation>
1695	_GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1696	_S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1697	{
1698	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
// One or two elements: nothing to shuffle, operate on scalar simd objects.
1699	if constexpr (_Np == `1`)
1700	return __x[`0`];
1701	else if constexpr (_Np == `2`)
1702	return __binary_op(simd<_Tp, simd_abi::scalar>(__x[`0`]),
1703	simd<_Tp, simd_abi::scalar>(__x[`1`]))[`0`];
// Constant evaluation: plain left-to-right fold over the elements.
1704	else if (__builtin_is_constant_evaluated())
1705	{
1706	simd<_Tp, simd_abi::scalar> __acc = __x[`0`];
1707	for (size_t __i = `1`; __i < _Np; ++__i)
1708	__acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i]));
1709	return __acc[`0`];
1710	}
// Partial ABI: the vector provides __full_size elements but only _Np are used.
1711	else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{
1712	{
1713	[[maybe_unused]] constexpr auto __full_size
1714	= _Abi::template _S_full_size<_Tp>;
1715	if constexpr (_Np == `3`)
1716	return __binary_op(
1717	__binary_op(simd<_Tp, simd_abi::scalar>(__x[`0`]),
1718	simd<_Tp, simd_abi::scalar>(__x[`1`])),
1719	simd<_Tp, simd_abi::scalar>(__x[`2`]))[`0`];
// plus<>: reduce the _S_masked vector with the full-size ABI (presumably
// _S_masked clears the unused elements so they add nothing -- confirm).
1720	else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1721	plus<>>)
1722	{
1723	using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1724	return _Ap::_SimdImpl::_S_reduce(
1725	simd<_Tp, _Ap>(__private_init,
1726	_Abi::_S_masked(__as_vector(__x))),
1727	__binary_op);
1728	}
// multiplies<>: blend the full vector with a vector of ones (see
// __x_padded_with_ones) and reduce the result with the full-size ABI.
1729	else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1730	multiplies<>>)
1731	{
1732	using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1733	using _TW = _SimdWrapper<_Tp, __full_size>;
1734	_GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
1735	= _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
1736	_GLIBCXX_SIMD_USE_CONSTEXPR _TW __one
1737	= __vector_broadcast<__full_size>(_Tp(`1`));
1738	const _TW __x_full = __data(__x).__as_full_vector();
1739	const _TW __x_padded_with_ones
1740	= _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one,
1741	__x_full);
1742	return _Ap::_SimdImpl::_S_reduce(
1743	simd<_Tp, _Ap>(__private_init, __x_padded_with_ones),
1744	__binary_op);
1745	}
// Odd _Np: reduce the first _Np - 1 elements, then combine that result with
// the last element.
1746	else if constexpr (_Np & `1`)
1747	{
1748	using _Ap = simd_abi::deduce_t<_Tp, _Np - `1`>;
1749	return __binary_op(
1750	simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
1751	simd<_Tp, _Ap>(
1752	__intrin_bitcast<__vector_type_t<_Tp, _Np - `1`>>(
1753	__as_vector(__x))),
1754	__binary_op)),
1755	simd<_Tp, simd_abi::scalar>(__x[_Np - `1`]))[`0`];
1756	}
// Even _Np: combine the lower and upper half in _S_reduce_partial.
1757	else
1758	return _S_reduce_partial<_Np>(
1759	make_index_sequence<_Np / `2`>(),
1760	make_index_sequence<__full_size - _Np / `2`>(), __x, __binary_op);
1761	} //}}}
// 16-byte vector: combine permuted copies of the vector step by step, viewing
// the data through progressively wider element types; the final result is
// read from element 0.
1762	else if constexpr (sizeof(__x) == `16`) //{{{
1763	{
1764	if constexpr (_Np == `16`)
1765	{
1766	const auto __y = __data(__x);
1767	__x = __binary_op(
1768	_M_make_simd<_Tp, _Np>(
1769	__vector_permute<`0`, `0`, `1`, `1`, `2`, `2`, `3`, `3`, `4`, `4`, `5`, `5`, `6`, `6`,
1770	`7`, `7`>(__y)),
1771	_M_make_simd<_Tp, _Np>(
1772	__vector_permute<`8`, `8`, `9`, `9`, `10`, `10`, `11`, `11`, `12`, `12`, `13`, `13`,
1773	`14`, `14`, `15`, `15`>(__y)));
1774	}
1775	if constexpr (_Np >= `8`)
1776	{
1777	const auto __y = __vector_bitcast<short>(__data(__x));
1778	__x = __binary_op(
1779	_M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1780	__vector_permute<`0`, `0`, `1`, `1`, `2`, `2`, `3`, `3`>(__y))),
1781	_M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1782	__vector_permute<`4`, `4`, `5`, `5`, `6`, `6`, `7`, `7`>(__y))));
1783	}
1784	if constexpr (_Np >= `4`)
1785	{
1786	using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1787	const auto __y = __vector_bitcast<_Up>(__data(__x));
1788	__x = __binary_op(__x,
1789	_M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1790	__vector_permute<`3`, `2`, `1`, `0`>(__y))));
1791	}
// Last step: combine with the upper 8 bytes broadcast to both halves.
1792	using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
1793	const auto __y = __vector_bitcast<_Up>(__data(__x));
1794	__x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1795	__vector_permute<`1`, `1`>(__y))));
1796	return __x[`0`];
1797	} //}}}
// Larger vectors: combine the two halves with __binary_op and reduce the
// half-sized result with the ABI deduced for _Np / 2 elements.
1798	else
1799	{
1800	static_assert(sizeof(__x) > __min_vector_size<_Tp>);
1801	static_assert((_Np & (_Np - `1`)) == `0`); // _Np must be a power of 2
1802	using _Ap = simd_abi::deduce_t<_Tp, _Np / `2`>;
1803	using _V = simd<_Tp, _Ap>;
1804	return _Ap::_SimdImpl::_S_reduce(
1805	__binary_op(_V(__private_init, __extract<`0`, `2`>(__as_vector(__x))),
1806	_V(__private_init,
1807	__extract<`1`, `2`>(__as_vector(__x)))),
1808	static_cast<_BinaryOperation&&>(__binary_op));
1809	}
1810	}
1811
1812	// math {{{2
1813	// frexp, modf and copysign implemented in simd_math.h
// Defines a static member _S_<name>(x, more...) returning _Tp. The result is
// built with __generate_vector<_Tp>: for every index __i the scalar function
// <name> is called with element __i of x and of each additional argument.
1814	#define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \
1815	template <typename _Tp, typename... _More> \
1816	static _Tp \
1817	_S_##__name(const _Tp& __x, const _More&... __more) \
1818	{ \
1819	return __generate_vector<_Tp>( \
1820	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1821	return __name(__x[__i], __more[__i]...); \
1822	}); \
1823	}
1824
// Same element-wise generator as _GLIBCXX_SIMD_MATH_FALLBACK, but the
// generated function is declared to return typename _Tp::mask_type.
// NOTE(review): no invocation of this macro appears between this definition
// and its #undef further down.
1825	#define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1826	template <typename _Tp, typename... _More> \
1827	static typename _Tp::mask_type \
1828	_S_##__name(const _Tp& __x, const _More&... __more) \
1829	{ \
1830	return __generate_vector<_Tp>( \
1831	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1832	return __name(__x[__i], __more[__i]...); \
1833	}); \
1834	}
1835
// Defines a static member _S_<name>(x, more...) whose result elements have
// the fixed type _RetTp (e.g. int or long) instead of the argument's element
// type. The return value is a __fixed_size_storage_t<_RetTp, partial width of
// _Tp> filled through _S_generate: for each chunk described by __meta, the
// scalar function <name> is called on the elements at __meta._S_offset + __i.
1836	#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1837	template <typename _Tp, typename... _More> \
1838	static auto \
1839	_S_##__name(const _Tp& __x, const _More&... __more) \
1840	{ \
1841	return __fixed_size_storage_t<_RetTp, \
1842	_VectorTraits<_Tp>::_S_partial_width>:: \
1843	_S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1844	return __meta._S_generator( \
1845	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1846	return __name(__x[__meta._S_offset + __i], \
1847	__more[__meta._S_offset + __i]...); \
1848	}, \
1849	static_cast<_RetTp*>(nullptr)); \
1850	}); \
1851	}
1852
// Element-wise fallbacks. Each line below expands to a static member function
// _S_<name> that applies the scalar function <name> to every element.
// The FIXEDRET variants produce int / long / long long results.
1853	_GLIBCXX_SIMD_MATH_FALLBACK(acos)
1854	_GLIBCXX_SIMD_MATH_FALLBACK(asin)
1855	_GLIBCXX_SIMD_MATH_FALLBACK(atan)
1856	_GLIBCXX_SIMD_MATH_FALLBACK(atan2)
1857	_GLIBCXX_SIMD_MATH_FALLBACK(cos)
1858	_GLIBCXX_SIMD_MATH_FALLBACK(sin)
1859	_GLIBCXX_SIMD_MATH_FALLBACK(tan)
1860	_GLIBCXX_SIMD_MATH_FALLBACK(acosh)
1861	_GLIBCXX_SIMD_MATH_FALLBACK(asinh)
1862	_GLIBCXX_SIMD_MATH_FALLBACK(atanh)
1863	_GLIBCXX_SIMD_MATH_FALLBACK(cosh)
1864	_GLIBCXX_SIMD_MATH_FALLBACK(sinh)
1865	_GLIBCXX_SIMD_MATH_FALLBACK(tanh)
1866	_GLIBCXX_SIMD_MATH_FALLBACK(exp)
1867	_GLIBCXX_SIMD_MATH_FALLBACK(exp2)
1868	_GLIBCXX_SIMD_MATH_FALLBACK(expm1)
1869	_GLIBCXX_SIMD_MATH_FALLBACK(ldexp)
1870	_GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
1871	_GLIBCXX_SIMD_MATH_FALLBACK(log)
1872	_GLIBCXX_SIMD_MATH_FALLBACK(log10)
1873	_GLIBCXX_SIMD_MATH_FALLBACK(log1p)
1874	_GLIBCXX_SIMD_MATH_FALLBACK(log2)
1875	_GLIBCXX_SIMD_MATH_FALLBACK(logb)
1876
1877	// modf implemented in simd_math.h
1878	_GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
1879	_GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
1880	_GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
1881	_GLIBCXX_SIMD_MATH_FALLBACK(fabs)
1882	_GLIBCXX_SIMD_MATH_FALLBACK(pow)
1883	_GLIBCXX_SIMD_MATH_FALLBACK(sqrt)
1884	_GLIBCXX_SIMD_MATH_FALLBACK(erf)
1885	_GLIBCXX_SIMD_MATH_FALLBACK(erfc)
1886	_GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
1887	_GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
1888
1889	_GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
1890	_GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
1891
1892	_GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
1893	_GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
1894
1895	_GLIBCXX_SIMD_MATH_FALLBACK(fmod)
1896	_GLIBCXX_SIMD_MATH_FALLBACK(remainder)
1897
1898	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1899	static _Tp
1900	_S_remquo(const _Tp __x, const _Tp __y,
1901	__fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
1902	{
1903	return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1904	int __tmp;
1905	auto __r = remquo(__x[__i], __y[__i], &__tmp);
1906	__z->_M_set(__i, __tmp);
1907	return __r;
1908	});
1909	}
1910
1911	// copysign in simd_math.h
// Further element-wise fallbacks; these take more than one argument, all of
// which are indexed with the same element index by the macro.
1912	_GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
1913	_GLIBCXX_SIMD_MATH_FALLBACK(fdim)
1914	_GLIBCXX_SIMD_MATH_FALLBACK(fmax)
1915	_GLIBCXX_SIMD_MATH_FALLBACK(fmin)
1916	_GLIBCXX_SIMD_MATH_FALLBACK(fma)
1917
1918	template <typename _Tp, size_t _Np>
1919	static constexpr _MaskMember<_Tp>
1920	_S_isgreater(_SimdWrapper<_Tp, _Np> __x,
1921	_SimdWrapper<_Tp, _Np> __y) noexcept
1922	{
1923	using _Ip = __int_for_sizeof_t<_Tp>;
1924	const auto __xn = __vector_bitcast<_Ip>(__x);
1925	const auto __yn = __vector_bitcast<_Ip>(__y);
1926	const auto __xp = __xn < `0` ? -(__xn & __finite_max_v<_Ip>) : __xn;
1927	const auto __yp = __yn < `0` ? -(__yn & __finite_max_v<_Ip>) : __yn;
1928	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1929	__xp > __yp);
1930	}
1931
1932	template <typename _Tp, size_t _Np>
1933	static constexpr _MaskMember<_Tp>
1934	_S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x,
1935	_SimdWrapper<_Tp, _Np> __y) noexcept
1936	{
1937	using _Ip = __int_for_sizeof_t<_Tp>;
1938	const auto __xn = __vector_bitcast<_Ip>(__x);
1939	const auto __yn = __vector_bitcast<_Ip>(__y);
1940	const auto __xp = __xn < `0` ? -(__xn & __finite_max_v<_Ip>) : __xn;
1941	const auto __yp = __yn < `0` ? -(__yn & __finite_max_v<_Ip>) : __yn;
1942	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1943	__xp >= __yp);
1944	}
1945
1946	template <typename _Tp, size_t _Np>
1947	static constexpr _MaskMember<_Tp>
1948	_S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept
1949	{
1950	using _Ip = __int_for_sizeof_t<_Tp>;
1951	const auto __xn = __vector_bitcast<_Ip>(__x);
1952	const auto __yn = __vector_bitcast<_Ip>(__y);
1953	const auto __xp = __xn < `0` ? -(__xn & __finite_max_v<_Ip>) : __xn;
1954	const auto __yp = __yn < `0` ? -(__yn & __finite_max_v<_Ip>) : __yn;
1955	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1956	__xp < __yp);
1957	}
1958
1959	template <typename _Tp, size_t _Np>
1960	static constexpr _MaskMember<_Tp>
1961	_S_islessequal(_SimdWrapper<_Tp, _Np> __x,
1962	_SimdWrapper<_Tp, _Np> __y) noexcept
1963	{
1964	using _Ip = __int_for_sizeof_t<_Tp>;
1965	const auto __xn = __vector_bitcast<_Ip>(__x);
1966	const auto __yn = __vector_bitcast<_Ip>(__y);
1967	const auto __xp = __xn < `0` ? -(__xn & __finite_max_v<_Ip>) : __xn;
1968	const auto __yp = __yn < `0` ? -(__yn & __finite_max_v<_Ip>) : __yn;
1969	return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1970	__xp <= __yp);
1971	}
1972
1973	template <typename _Tp, size_t _Np>
1974	static constexpr _MaskMember<_Tp>
1975	_S_islessgreater(_SimdWrapper<_Tp, _Np> __x,
1976	_SimdWrapper<_Tp, _Np> __y) noexcept
1977	{
1978	return __andnot(_SuperImpl::_S_isunordered(__x, __y),
1979	_SuperImpl::_S_not_equal_to(__x, __y));
1980	}
1981
// The generator macros are only needed for the definitions above.
1982	#undef _GLIBCXX_SIMD_MATH_FALLBACK
1983	#undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET
1984	#undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
1985	// _S_abs {{{3
1986	template <typename _Tp, size_t _Np>
1987	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1988	_S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
1989	{
1990	// if (__builtin_is_constant_evaluated())
1991	// {
1992	// return __x._M_data < 0 ? -__x._M_data : __x._M_data;
1993	// }
1994	if constexpr (is_floating_point_v<_Tp>)
1995	// `v < 0 ? -v : v` cannot compile to the efficient implementation of
1996	// masking the signbit off because it must consider v == -0
1997
1998	// ~(-0.) & v would be easy, but breaks with fno-signed-zeros
1999	return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
2000	else
2001	return __x._M_data < `0` ? -__x._M_data : __x._M_data;
2002	}
2003
2004	// }}}3
2005	// _S_plus_minus {{{
2006	// Returns __x + __y - __y without -fassociative-math optimizing to __x.
2007	// - _TV must be __vector_type_t<floating-point type, N>.
2008	// - _UV must be _TV or floating-point type.
// Computes (__x + __y) - __y. In the branch guarded by
// `!defined __clang__ && __GCC_IEC_559 == 0` an empty asm statement with __x
// as a read-write operand sits between the addition and the subtraction.
2009	template <typename _TV, typename _UV>
2010	_GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2011	_S_plus_minus(_TV __x, _UV __y) noexcept
2012	{
// i386 without SSE math: an 8-byte vector (asserted to be 2 floats) is
// reinterpreted as 4 floats and handled by a recursive call.
2013	#if defined __i386__ && !defined __SSE_MATH__
2014	if constexpr (sizeof(__x) == `8`)
2015	{ // operations on __x would use the FPU
2016	static_assert(is_same_v<_TV, __vector_type_t<float, `2`>>);
2017	const auto __x4 = __vector_bitcast<float, `4`>(__x);
2018	if constexpr (is_same_v<_TV, _UV>)
2019	return __vector_bitcast<float, `2`>(
2020	_S_plus_minus(__x4, __vector_bitcast<float, `4`>(__y)));
2021	else
2022	return __vector_bitcast<float, `2`>(_S_plus_minus(__x4, __y));
2023	}
2024	#endif
// NOTE(review): __GCC_IEC_559 == 0 presumably means GCC is allowed to deviate
// from IEEE 754 arithmetic (e.g. -ffast-math) -- confirm against the GCC
// predefined-macro documentation.
2025	#if !defined __clang__ && __GCC_IEC_559 == 0
// With compile-time known operands the plain expression is used.
2026	if (__builtin_is_constant_evaluated()
2027	\|\| (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2028	return (__x + __y) - __y;
2029	else
2030	return [&] {
2031	__x += __y;
// The empty asm takes the sum as an in/out operand, so the subtraction below
// operates on a value the compiler cannot see through. The register
// constraint is selected per target (SSE, NEON, POWER VMX, generic).
2032	if constexpr(__have_sse)
2033	{
2034	if constexpr (sizeof(__x) >= `16`)
2035	asm("" : "+x"(__x));
2036	else if constexpr (is_same_v<__vector_type_t<float, `2`>, _TV>)
2037	asm("" : "+x"(__x[`0`]), "+x"(__x[`1`]));
2038	else
2039	__assert_unreachable<_TV>();
2040	}
2041	else if constexpr(__have_neon)
2042	asm("" : "+w"(__x));
2043	else if constexpr (__have_power_vmx)
2044	{
2045	if constexpr (is_same_v<__vector_type_t<float, `2`>, _TV>)
2046	asm("" : "+fgr"(__x[`0`]), "+fgr"(__x[`1`]));
2047	else
2048	asm("" : "+v"(__x));
2049	}
2050	else
2051	asm("" : "+g"(__x));
2052	return __x - __y;
2053	}();
2054	#else
2055	return (__x + __y) - __y;
2056	#endif
2057	}
2058
2059	// }}}
2060	// _S_nearbyint {{{3
2061	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2062	_GLIBCXX_SIMD_INTRINSIC static _Tp
2063	_S_nearbyint(_Tp __x_) noexcept
2064	{
2065	using value_type = typename _TVT::value_type;
2066	using _V = typename _TVT::type;
2067	const _V __x = __x_;
2068	const _V __absx = __and(__x, _S_absmask<_V>);
2069	static_assert(__CHAR_BIT__ * sizeof(`1ull`) >= __digits_v<value_type>);
2070	_GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2071	= _V() + (`1ull` << (__digits_v<value_type> - `1`));
2072	const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2073	const _V __shifted = _S_plus_minus(__x, __shifter);
2074	return __absx < __shifter_abs ? __shifted : __x;
2075	}
2076
2077	// _S_rint {{{3
2078	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2079	_GLIBCXX_SIMD_INTRINSIC static _Tp
2080	_S_rint(_Tp __x) noexcept
2081	{ return _SuperImpl::_S_nearbyint(__x); }
2082
2083	// _S_trunc {{{3
2084	template <typename _Tp, size_t _Np>
2085	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2086	_S_trunc(_SimdWrapper<_Tp, _Np> __x)
2087	{
2088	using _V = __vector_type_t<_Tp, _Np>;
2089	const _V __absx = __and(__x._M_data, _S_absmask<_V>);
2090	static_assert(__CHAR_BIT__ * sizeof(`1ull`) >= __digits_v<_Tp>);
2091	constexpr _Tp __shifter = `1ull` << (__digits_v<_Tp> - `1`);
2092	_V __truncated = _S_plus_minus(__absx, __shifter);
2093	__truncated -= __truncated > __absx ? _V() + `1` : _V();
2094	return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
2095	: __x._M_data;
2096	}
2097
2098	// _S_round {{{3
2099	template <typename _Tp, size_t _Np>
2100	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2101	_S_round(_SimdWrapper<_Tp, _Np> __x)
2102	{
2103	const auto __abs_x = _SuperImpl::_S_abs(__x);
2104	const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
2105	const auto __r_abs // round(abs(x)) =
2106	= __t_abs + (__abs_x._M_data - __t_abs >= _Tp(`.5`) ? _Tp(`1`) : `0`);
2107	return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
2108	}
2109
2110	// _S_floor {{{3
2111	template <typename _Tp, size_t _Np>
2112	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2113	_S_floor(_SimdWrapper<_Tp, _Np> __x)
2114	{
2115	const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2116	const auto __negative_input
2117	= __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(`0`));
2118	const auto __mask
2119	= __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2120	return __or(__andnot(__mask, __y),
2121	__and(__mask, __y - __vector_broadcast<_Np, _Tp>(`1`)));
2122	}
2123
2124	// _S_ceil {{{3
2125	template <typename _Tp, size_t _Np>
2126	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2127	_S_ceil(_SimdWrapper<_Tp, _Np> __x)
2128	{
2129	const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2130	const auto __negative_input
2131	= __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(`0`));
2132	const auto __inv_mask
2133	= __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2134	return __or(__and(__inv_mask, __y),
2135	__andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(`1`)));
2136	}
2137
2138	// _S_isnan {{{3
2139	template <typename _Tp, size_t _Np>
2140	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2141	_S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2142	{
2143	#if __FINITE_MATH_ONLY__
2144	return {}; // false
2145	#elif !defined __SUPPORT_SNAN__
2146	return ~(__x._M_data == __x._M_data);
2147	#elif defined __STDC_IEC_559__
2148	using _Ip = __int_for_sizeof_t<_Tp>;
2149	const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2150	const auto __infn
2151	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
2152	return __infn < __absn;
2153	#else
2154	#error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2155	#endif
2156	}
2157
2158	// _S_isfinite {{{3
2159	template <typename _Tp, size_t _Np>
2160	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2161	_S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2162	{
2163	#if __FINITE_MATH_ONLY__
2164	using _UV = typename _MaskMember<_Tp>::_BuiltinType;
2165	_GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV();
2166	return __alltrue;
2167	#else
2168	// if all exponent bits are set, __x is either inf or NaN
2169	using _Ip = __int_for_sizeof_t<_Tp>;
2170	const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2171	const auto __maxn
2172	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2173	return __absn <= __maxn;
2174	#endif
2175	}
2176
2177	// _S_isunordered {{{3
2178	template <typename _Tp, size_t _Np>
2179	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2180	_S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2181	{ return __or(_S_isnan(__x), _S_isnan(__y)); }
2182
2183	// _S_signbit {{{3
2184	template <typename _Tp, size_t _Np>
2185	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2186	_S_signbit(_SimdWrapper<_Tp, _Np> __x)
2187	{
2188	using _Ip = __int_for_sizeof_t<_Tp>;
2189	return __vector_bitcast<_Ip>(__x) < `0`;
2190	// Arithmetic right shift (SRA) would also work (instead of compare), but
2191	// 64-bit SRA isn't available on x86 before AVX512. And in general,
2192	// compares are more likely to be efficient than SRA.
2193	}
2194
2195	// _S_isinf {{{3
2196	template <typename _Tp, size_t _Np>
2197	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2198	_S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2199	{
2200	#if __FINITE_MATH_ONLY__
2201	return {}; // false
2202	#else
2203	return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x),
2204	__vector_broadcast<_Np>(
2205	__infinity_v<_Tp>));
2206	// alternative:
2207	// compare to inf using the corresponding integer type
2208	/*
2209	return
2210	__vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>(
2211	_S_abs(__x)._M_data)
2212	==
2213	__vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>(
2214	__infinity_v<_Tp>)));
2215	*/
2216	#endif
2217	}
2218
2219	// _S_isnormal {{{3
2220	template <typename _Tp, size_t _Np>
2221	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2222	_S_isnormal(_SimdWrapper<_Tp, _Np> __x)
2223	{
2224	using _Ip = __int_for_sizeof_t<_Tp>;
2225	const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2226	const auto __minn
2227	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
2228	#if __FINITE_MATH_ONLY__
2229	return __absn >= __minn;
2230	#else
2231	const auto __maxn
2232	= __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2233	return __minn <= __absn && __absn <= __maxn;
2234	#endif
2235	}
2236
2237	// _S_fpclassify {{{3
// Classifies every element of __x and returns the matching FP_* constant
// (FP_NORMAL, FP_SUBNORMAL, FP_ZERO, FP_INFINITE, FP_NAN) per element as int
// in a fixed-size storage. The classification itself is a chain of integer
// comparisons on the bit pattern of |x|.
2238	template <typename _Tp, size_t _Np>
2239	_GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2240	_S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
2241	{
2242	using _I = __int_for_sizeof_t<_Tp>;
2243	const auto __xn
2244	= __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
// _NI is the element count of the intrinsic-sized vector __xn.
2245	constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
2246	_GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
2247	= __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
2248
// Broadcast result constants; which ones exist depends on
// __FINITE_MATH_ONLY__ and __FAST_MATH__, matching the select chain below.
2249	_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
2250	= __vector_broadcast<_NI, _I>(FP_NORMAL);
2251	#if !__FINITE_MATH_ONLY__
2252	_GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
2253	= __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
2254	_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
2255	= __vector_broadcast<_NI, _I>(FP_NAN);
2256	_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
2257	= __vector_broadcast<_NI, _I>(FP_INFINITE);
2258	#endif
2259	#ifndef __FAST_MATH__
2260	_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
2261	= __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
2262	#endif
2263	_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
2264	= __vector_broadcast<_NI, _I>(FP_ZERO);
2265
// Select chain on the bit pattern of |x|:
//   below norm_min -> zero (or subnormal if non-zero; with __FAST_MATH__
//                     everything below norm_min is reported as zero)
//   below inf      -> normal
//   equal to inf   -> infinite, anything larger -> NaN
// With __FINITE_MATH_ONLY__ everything not below norm_min is normal.
2266	__vector_type_t<_I, _NI>
2267	__tmp = __xn < __minn
2268	#ifdef __FAST_MATH__
2269	? __fp_zero
2270	#else
2271	? (__xn == `0` ? __fp_zero : __fp_subnormal)
2272	#endif
2273	#if __FINITE_MATH_ONLY__
2274	: __fp_normal;
2275	#else
2276	: (__xn < __infn ? __fp_normal
2277	: (__xn == __infn ? __fp_infinite : __fp_nan));
2278	#endif
2279
// Convert the _I-typed result into the int storage that is returned. The
// branches differ by element size and by how the fixed-size storage is split
// (_S_tuple_size).
2280	if constexpr (sizeof(_I) == sizeof(int))
2281	{
2282	using _FixedInt = __fixed_size_storage_t<int, _Np>;
2283	const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
2284	if constexpr (_FixedInt::_S_tuple_size == `1`)
2285	return {__as_int};
2286	else if constexpr (_FixedInt::_S_tuple_size == `2`
2287	&& is_same_v<
2288	typename _FixedInt::_SecondType::_FirstAbi,
2289	simd_abi::scalar>)
2290	return {__extract<`0`, `2`>(__as_int), __as_int[_Np - `1`]};
2291	else if constexpr (_FixedInt::_S_tuple_size == `2`)
2292	return {__extract<`0`, `2`>(__as_int),
2293	__auto_bitcast(__extract<`1`, `2`>(__as_int))};
2294	else
2295	__assert_unreachable<_Tp>();
2296	}
// Two 8-byte elements stored as two scalars: convert each one individually.
2297	else if constexpr (_Np == `2` && sizeof(_I) == `8`
2298	&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == `2`)
2299	{
2300	const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2301	return {int(__aslong[`0`]), {int(__aslong[`1`])}};
2302	}
// x86: narrow the 64-bit results to 32 bits with intrinsics.
2303	#if _GLIBCXX_SIMD_X86INTRIN
2304	else if constexpr (sizeof(_Tp) == `8` && sizeof(__tmp) == `32`
2305	&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == `1`)
2306	return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2307	__to_intrin(__hi128(__tmp)))};
2308	else if constexpr (sizeof(_Tp) == `8` && sizeof(__tmp) == `64`
2309	&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == `1`)
2310	return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2311	#endif // _GLIBCXX_SIMD_X86INTRIN
// Generic fallback: rebuild an int wrapper from the individual elements.
2312	else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == `1`)
2313	return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2314	[](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2315	return __make_wrapper<int>(__l...);
2316	})};
2317	else
2318	__assert_unreachable<_Tp>();
2319	}
2320
2321	// _S_increment & _S_decrement{{{2
2322	template <typename _Tp, size_t _Np>
2323	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2324	_S_increment(_SimdWrapper<_Tp, _Np>& __x)
2325	{ __x = __x._M_data + `1`; }
2326
2327	template <typename _Tp, size_t _Np>
2328	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2329	_S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2330	{ __x = __x._M_data - `1`; }
2331
2332	// smart_reference access {{{2
2333	template <typename _Tp, size_t _Np, typename _Up>
2334	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2335	_S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2336	{ __v._M_set(__i, static_cast<_Up&&>(__x)); }
2337
2338	// _S_masked_assign{{{2
2339	template <typename _Tp, typename _K, size_t _Np>
2340	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2341	_S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2342	__type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2343	{
2344	if (__k._M_is_constprop_none_of())
2345	return;
2346	else if (__k._M_is_constprop_all_of())
2347	__lhs = __rhs;
2348	else
2349	__lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2350	}
2351
2352	template <typename _Tp, typename _K, size_t _Np>
2353	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2354	_S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2355	__type_identity_t<_Tp> __rhs)
2356	{
2357	if (__k._M_is_constprop_none_of())
2358	return;
2359	else if (__k._M_is_constprop_all_of())
2360	__lhs = __vector_broadcast<_Np>(__rhs);
2361	else if (__builtin_constant_p(__rhs) && __rhs == `0`)
2362	{
2363	if constexpr (!is_same_v<bool, _K>)
2364	// the __andnot optimization only makes sense if __k._M_data is a
2365	// vector register
2366	__lhs._M_data
2367	= __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data);
2368	else
2369	// for AVX512/__mmask, a _mm512_maskz_mov is best
2370	__lhs
2371	= _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>());
2372	}
2373	else
2374	__lhs = _CommonImpl::_S_blend(__k, __lhs,
2375	_SimdWrapper<_Tp, _Np>(
2376	__vector_broadcast<_Np>(__rhs)));
2377	}
2378
2379	// _S_masked_cassign {{{2
2380	template <typename _Op, typename _Tp, typename _K, size_t _Np>
2381	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2382	_S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2383	_SimdWrapper<_Tp, _Np>& __lhs,
2384	const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs,
2385	_Op __op)
2386	{
2387	if (__k._M_is_constprop_none_of())
2388	return;
2389	else if (__k._M_is_constprop_all_of())
2390	__lhs = __op(_SuperImpl{}, __lhs, __rhs);
2391	else
2392	__lhs = _CommonImpl::_S_blend(__k, __lhs,
2393	__op(_SuperImpl{}, __lhs, __rhs));
2394	}
2395
2396	template <typename _Op, typename _Tp, typename _K, size_t _Np>
2397	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2398	_S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2399	_SimdWrapper<_Tp, _Np>& __lhs,
2400	const __type_identity_t<_Tp> __rhs, _Op __op)
2401	{ _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2402
2403	// _S_masked_unary {{{2
2404	template <template <typename> class _Op, typename _Tp, typename _K,
2405	size_t _Np>
2406	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2407	_S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2408	const _SimdWrapper<_Tp, _Np> __v)
2409	{
2410	if (__k._M_is_constprop_none_of())
2411	return __v;
2412	auto __vv = _M_make_simd(__v);
2413	_Op<decltype(__vv)> __op;
2414	if (__k._M_is_constprop_all_of())
2415	return __data(__op(__vv));
2416	else if constexpr (is_same_v<_Op<void>, __increment<void>>)
2417	{
2418	static_assert(not std::is_same_v<_K, bool>);
2419	if constexpr (is_integral_v<_Tp>)
2420	// Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2421	return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2422	else if constexpr (not __have_avx2)
2423	return __v._M_data
2424	+ __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2425	_K, _Tp(`1`)));
2426	// starting with AVX2 it is more efficient to blend after add
2427	}
2428	else if constexpr (is_same_v<_Op<void>, __decrement<void>>)
2429	{
2430	static_assert(not std::is_same_v<_K, bool>);
2431	if constexpr (is_integral_v<_Tp>)
2432	// Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2433	return __v._M_data + __vector_bitcast<_Tp>(__k._M_data);
2434	else if constexpr (not __have_avx2)
2435	return __v._M_data
2436	- __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2437	_K, _Tp(`1`)));
2438	// starting with AVX2 it is more efficient to blend after sub
2439	}
2440	return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
2441	}
2442
2443	//}}}2
2444	};
2445
2446	// _MaskImplBuiltinMixin {{{1
2447	struct _MaskImplBuiltinMixin
2448	{
2449	template <typename _Tp>
2450	using _TypeTag = _Tp*;
2451
2452	// _S_to_maskvector {{{
2453	template <typename _Up, size_t _ToN = `1`>
2454	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2455	_S_to_maskvector(bool __x)
2456	{
2457	static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2458	return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2459	: __vector_type_t<_Up, _ToN>{};
2460	}
2461
2462	template <typename _Up, size_t _UpN = `0`, size_t _Np, bool _Sanitized,
2463	size_t _ToN = _UpN == `0` ? _Np : _UpN>
2464	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2465	_S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
2466	{
2467	static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2468	return __generate_vector<__vector_type_t<_Up, _ToN>>(
2469	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2470	if constexpr (__i < _Np)
2471	return __x[__i] ? ~_Up() : _Up();
2472	else
2473	return _Up();
2474	});
2475	}
2476
2477	template <typename _Up, size_t _UpN = `0`, typename _Tp, size_t _Np,
2478	size_t _ToN = _UpN == `0` ? _Np : _UpN>
2479	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2480	_S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
2481	{
2482	static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2483	using _TW = _SimdWrapper<_Tp, _Np>;
2484	using _UW = _SimdWrapper<_Up, _ToN>;
2485	if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW))
2486	return __wrapper_bitcast<_Up, _ToN>(__x);
2487	else if constexpr (is_same_v<_Tp, bool>) // bits -> vector
2488	return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data));
2489	else
2490	{ // vector -> vector
2491	/*
2492	[[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data);
2493	if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) ==
2494	16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr
2495	(sizeof(_Tp) == 4 && sizeof(_Up) == 2
2496	&& sizeof(__y) == 16)
2497	return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y);
2498	else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
2499	&& sizeof(__y) == 16)
2500	return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y);
2501	else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
2502	&& sizeof(__y) == 16)
2503	return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
2504	-1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 &&
2505	sizeof(_Up) == 1
2506	&& sizeof(__y) == 16)
2507	return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
2508	-1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 &&
2509	sizeof(_Up) == 1
2510	&& sizeof(__y) == 16)
2511	return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2512	-1, -1, -1, -1, -1>(__y); else
2513	*/
2514	{
2515	return __generate_vector<__vector_type_t<_Up, _ToN>>(
2516	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2517	if constexpr (__i < _Np)
2518	return _Up(__x[__i.value]);
2519	else
2520	return _Up();
2521	});
2522	}
2523	}
2524	}
2525
2526	// }}}
2527	// _S_to_bits {{{
2528	template <typename _Tp, size_t _Np>
2529	_GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
2530	_S_to_bits(_SimdWrapper<_Tp, _Np> __x)
2531	{
2532	static_assert(!is_same_v<_Tp, bool>);
2533	static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong));
2534	using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
2535	const auto __bools
2536	= __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - `1`);
2537	_ULLong __r = `0`;
2538	__execute_n_times<_Np>(
2539	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2540	__r \|= _ULLong(__bools[__i.value]) << __i;
2541	});
2542	return __r;
2543	}
2544
2545	// }}}
2546	};
2547
2548	// _MaskImplBuiltin {{{1
2549	template <typename _Abi, typename>
2550	struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2551	{
2552	using _MaskImplBuiltinMixin::_S_to_bits;
2553	using _MaskImplBuiltinMixin::_S_to_maskvector;
2554
2555	// member types {{{
2556	template <typename _Tp>
2557	using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2558
2559	template <typename _Tp>
2560	using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2561
2562	using _SuperImpl = typename _Abi::_MaskImpl;
2563	using _CommonImpl = typename _Abi::_CommonImpl;
2564
2565	template <typename _Tp>
2566	static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2567
2568	// }}}
2569	// _S_broadcast {{{
2570	template <typename _Tp>
2571	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2572	_S_broadcast(bool __x)
2573	{ return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2574
2575	// }}}
2576	// _S_load {{{
2577	template <typename _Tp>
2578	_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2579	_S_load(const bool* __mem)
2580	{
2581	using _I = __int_for_sizeof_t<_Tp>;
2582	if (not __builtin_is_constant_evaluated())
2583	if constexpr (sizeof(_Tp) == sizeof(bool))
2584	{
2585	const auto __bools
2586	= _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem);
2587	// bool is {0, 1}, everything else is UB
2588	return __bools > `0`;
2589	}
2590	return __generate_vector<_I, _S_size<_Tp>>(
2591	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2592	return __mem[__i] ? ~_I() : _I();
2593	});
2594	}
2595
2596	// }}}
2597	// _S_convert {{{
2598	template <typename _Tp, size_t _Np, bool _Sanitized>
2599	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
2600	_S_convert(_BitMask<_Np, _Sanitized> __x)
2601	{
2602	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2603	return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2604	else
2605	return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2606	_S_size<_Tp>>(
2607	__x._M_sanitized());
2608	}
2609
2610	template <typename _Tp, size_t _Np>
2611	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
2612	_S_convert(_SimdWrapper<bool, _Np> __x)
2613	{
2614	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2615	return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2616	else
2617	return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2618	_S_size<_Tp>>(
2619	_BitMask<_Np>(__x._M_data)._M_sanitized());
2620	}
2621
2622	template <typename _Tp, typename _Up, size_t _Np>
2623	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
2624	_S_convert(_SimdWrapper<_Up, _Np> __x)
2625	{
2626	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2627	return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2628	_SuperImpl::_S_to_bits(__x));
2629	else
2630	return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2631	_S_size<_Tp>>(__x);
2632	}
2633
2634	template <typename _Tp, typename _Up, typename _UAbi>
2635	_GLIBCXX_SIMD_INTRINSIC static constexpr auto
2636	_S_convert(simd_mask<_Up, _UAbi> __x)
2637	{
2638	if constexpr (__is_builtin_bitmask_abi<_Abi>())
2639	{
2640	using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>;
2641	if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits
2642	return _R(__data(__x));
2643	else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits
2644	return _R(__data(__x));
2645	else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits
2646	return _R(__data(__x)._M_to_bits());
2647	else // vector -> bits
2648	return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
2649	}
2650	else
2651	return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2652	_S_size<_Tp>>(
2653	__data(__x));
2654	}
2655
2656	// }}}
2657	// _S_masked_load {{{2
2658	template <typename _Tp, size_t _Np>
2659	static inline _SimdWrapper<_Tp, _Np>
2660	_S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
2661	_SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
2662	{
2663	// AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
2664	auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
2665	_BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
2666	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2667	__tmp._M_set(__i, -__mem[__i]);
2668	});
2669	__merge = __wrapper_bitcast<_Tp>(__tmp);
2670	return __merge;
2671	}
2672
2673	// _S_store {{{2
2674	template <typename _Tp, size_t _Np>
2675	_GLIBCXX_SIMD_INTRINSIC static constexpr void
2676	_S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2677	{
2678	__execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2679	__mem[__i] = __v[__i];
2680	});
2681	}
2682
2683	// _S_masked_store {{{2
2684	template <typename _Tp, size_t _Np>
2685	static inline void
2686	_S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2687	const _SimdWrapper<_Tp, _Np> __k) noexcept
2688	{
2689	_BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2690	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2691	__mem[__i] = __v[__i];
2692	});
2693	}
2694
2695	// _S_from_bitmask{{{2
2696	template <size_t _Np, typename _Tp>
2697	_GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2698	_S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
2699	{ return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); }
2700
2701	// logical and bitwise operators {{{2
2702	template <typename _Tp, size_t _Np>
2703	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2704	_S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2705	{ return __and(__x._M_data, __y._M_data); }
2706
2707	template <typename _Tp, size_t _Np>
2708	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2709	_S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2710	{ return __or(__x._M_data, __y._M_data); }
2711
2712	template <typename _Tp, size_t _Np>
2713	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2714	_S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
2715	{
2716	if constexpr (_Abi::template _S_is_partial<_Tp>)
2717	return __andnot(__x, __wrapper_bitcast<_Tp>(
2718	_Abi::template _S_implicit_mask<_Tp>()));
2719	else
2720	return __not(__x._M_data);
2721	}
2722
2723	template <typename _Tp, size_t _Np>
2724	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2725	_S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2726	{ return __and(__x._M_data, __y._M_data); }
2727
2728	template <typename _Tp, size_t _Np>
2729	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2730	_S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2731	{ return __or(__x._M_data, __y._M_data); }
2732
2733	template <typename _Tp, size_t _Np>
2734	_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2735	_S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2736	{ return __xor(__x._M_data, __y._M_data); }
2737
2738	// smart_reference access {{{2
2739	template <typename _Tp, size_t _Np>
2740	static constexpr void
2741	_S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept
2742	{
2743	if constexpr (is_same_v<_Tp, bool>)
2744	__k._M_set(__i, __x);
2745	else
2746	{
2747	static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
2748	if (__builtin_is_constant_evaluated())
2749	{
2750	__k = __generate_from_n_evaluations<_Np,
2751	__vector_type_t<_Tp, _Np>>(
2752	[&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2753	if (__i == static_cast<int>(__j))
2754	return _Tp(-__x);
2755	else
2756	return __k[+__j];
2757	});
2758	}
2759	else
2760	__k._M_data[__i] = -__x;
2761	}
2762	}
2763
2764	// _S_masked_assign{{{2
2765	template <typename _Tp, size_t _Np>
2766	_GLIBCXX_SIMD_INTRINSIC static void
2767	_S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2768	__type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2769	{ __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
2770
2771	template <typename _Tp, size_t _Np>
2772	_GLIBCXX_SIMD_INTRINSIC static void
2773	_S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs)
2774	{
2775	if (__builtin_constant_p(__rhs))
2776	{
2777	if (__rhs == false)
2778	__lhs = __andnot(__k, __lhs);
2779	else
2780	__lhs = __or(__k, __lhs);
2781	return;
2782	}
2783	__lhs = _CommonImpl::_S_blend(__k, __lhs,
2784	__data(simd_mask<_Tp, _Abi>(__rhs)));
2785	}
2786
2787	//}}}2
2788	// _S_all_of {{{
2789	template <typename _Tp>
2790	_GLIBCXX_SIMD_INTRINSIC static bool
2791	_S_all_of(simd_mask<_Tp, _Abi> __k)
2792	{
2793	return __call_with_subscripts(
2794	__data(__k), make_index_sequence<_S_size<_Tp>>(),
2795	[](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2796	{ return (... && !(__ent == `0`)); });
2797	}
2798
2799	// }}}
2800	// _S_any_of {{{
2801	template <typename _Tp>
2802	_GLIBCXX_SIMD_INTRINSIC static bool
2803	_S_any_of(simd_mask<_Tp, _Abi> __k)
2804	{
2805	return __call_with_subscripts(
2806	__data(__k), make_index_sequence<_S_size<_Tp>>(),
2807	[](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2808	{ return (... \|\| !(__ent == `0`)); });
2809	}
2810
2811	// }}}
2812	// _S_none_of {{{
2813	template <typename _Tp>
2814	_GLIBCXX_SIMD_INTRINSIC static bool
2815	_S_none_of(simd_mask<_Tp, _Abi> __k)
2816	{
2817	return __call_with_subscripts(
2818	__data(__k), make_index_sequence<_S_size<_Tp>>(),
2819	[](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2820	{ return (... && (__ent == `0`)); });
2821	}
2822
2823	// }}}
2824	// _S_some_of {{{
2825	template <typename _Tp>
2826	_GLIBCXX_SIMD_INTRINSIC static bool
2827	_S_some_of(simd_mask<_Tp, _Abi> __k)
2828	{
2829	const int __n_true = _SuperImpl::_S_popcount(__k);
2830	return __n_true > `0` && __n_true < int(_S_size<_Tp>);
2831	}
2832
2833	// }}}
2834	// _S_popcount {{{
2835	template <typename _Tp>
2836	_GLIBCXX_SIMD_INTRINSIC static int
2837	_S_popcount(simd_mask<_Tp, _Abi> __k)
2838	{
2839	using _I = __int_for_sizeof_t<_Tp>;
2840	if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
2841	return -reduce(
2842	simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
2843	else
2844	return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
2845	simd<_Tp, _Abi>(__private_init, __data(__k))));
2846	}
2847
2848	// }}}
2849	// _S_find_first_set {{{
2850	template <typename _Tp>
2851	_GLIBCXX_SIMD_INTRINSIC static int
2852	_S_find_first_set(simd_mask<_Tp, _Abi> __k)
2853	{ return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); }
2854
2855	// }}}
2856	// _S_find_last_set {{{
2857	template <typename _Tp>
2858	_GLIBCXX_SIMD_INTRINSIC static int
2859	_S_find_last_set(simd_mask<_Tp, _Abi> __k)
2860	{ return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - `1`; }
2861
2862	// }}}
2863	};
2864
2865	//}}}1
2866	_GLIBCXX_SIMD_END_NAMESPACE
2867	#endif // __cplusplus >= 201703L
2868	#endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
2869
2870	// vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=100
2871

source code of include/c++/13/experimental/bits/simd_builtin.h