1// Simd x86 specific implementations -*- C++ -*-
2
3// Copyright (C) 2020-2021 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
27
28#if __cplusplus >= 201703L
29
30#if !_GLIBCXX_SIMD_X86INTRIN
31#error \
32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
33#endif
34
35_GLIBCXX_SIMD_BEGIN_NAMESPACE
36
37// __to_masktype {{{
38// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
39// __vector_type_t.
40template <typename _Tp, size_t _Np>
41 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42 __to_masktype(_SimdWrapper<_Tp, _Np> __x)
43 { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }
44
45template <typename _TV,
46 typename _TVT
47 = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
48 typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
49 _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
50 __to_masktype(_TV __x)
51 { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
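// Illustrative sketch (not part of the implementation): the mask type keeps
// the element count and byte width but uses the equally sized signed integer
// type, e.g. for an SSE float wrapper:
//   _SimdWrapper<float, 4> __v = ...;
//   auto __m = __to_masktype(__v); // _SimdWrapper<int, 4>, bit-identical data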
52
53// }}}
54// __interleave128_lo {{{
55template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
56 typename _Trait = _VectorTraits<_Tp>>
57 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
58 __interleave128_lo(const _Ap& __av, const _Bp& __bv)
59 {
60 const _Tp __a(__av);
61 const _Tp __b(__bv);
62 if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
63 return _Tp{__a[0], __b[0]};
64 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
65 return _Tp{__a[0], __b[0], __a[1], __b[1]};
66 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
67 return _Tp{__a[0], __b[0], __a[1], __b[1],
68 __a[2], __b[2], __a[3], __b[3]};
69 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
70 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
71 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
72 __a[6], __b[6], __a[7], __b[7]};
73 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
74 return _Tp{__a[0], __b[0], __a[2], __b[2]};
75 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
76 return _Tp{__a[0], __b[0], __a[1], __b[1],
77 __a[4], __b[4], __a[5], __b[5]};
78 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
79 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
80 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
81 __a[10], __b[10], __a[11], __b[11]};
82 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
83 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
84 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
85 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
86 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
87 __a[22], __b[22], __a[23], __b[23]};
88 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
89 return _Tp{__a[0], __b[0], __a[2], __b[2],
90 __a[4], __b[4], __a[6], __b[6]};
91 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
92 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
93 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
94 __a[12], __b[12], __a[13], __b[13]};
95 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
96 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
97 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
98 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
99 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
100 __a[26], __b[26], __a[27], __b[27]};
101 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
102 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
103 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
104 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
105 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
106 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
107 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
108 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
109 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
110 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
111 __b[55]};
112 else
113 __assert_unreachable<_Tp>();
114 }
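// Example (illustrative only): the interleave happens within each 128-bit
// lane, taking the low half of every lane of both inputs, e.g. for 8 floats
// in a 256-bit vector:
//   __interleave128_lo({a0,...,a7}, {b0,...,b7})
//     == {a0, b0, a1, b1, a4, b4, a5, b5}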
115
116// }}}
117// __is_zero{{{
118template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
119 _GLIBCXX_SIMD_INTRINSIC constexpr bool
120 __is_zero(_Tp __a)
121 {
122 if (!__builtin_is_constant_evaluated())
123 {
124 if constexpr (__have_avx)
125 {
126 if constexpr (_TVT::template _S_is<float, 8>)
127 return _mm256_testz_ps(__a, __a);
128 else if constexpr (_TVT::template _S_is<double, 4>)
129 return _mm256_testz_pd(__a, __a);
130 else if constexpr (sizeof(_Tp) == 32)
131 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
132 else if constexpr (_TVT::template _S_is<float>)
133 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
134 else if constexpr (_TVT::template _S_is<double, 2>)
135 return _mm_testz_pd(__a, __a);
136 else
137 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
138 }
139 else if constexpr (__have_sse4_1)
140 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
141 __intrin_bitcast<__m128i>(__a));
142 }
143 else if constexpr (sizeof(_Tp) <= 8)
144 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
145 else
146 {
147 const auto __b = __vector_bitcast<_LLong>(__a);
148 if constexpr (sizeof(__b) == 16)
149 return (__b[0] | __b[1]) == 0;
150 else if constexpr (sizeof(__b) == 32)
151 return __is_zero(__lo128(__b) | __hi128(__b));
152 else if constexpr (sizeof(__b) == 64)
153 return __is_zero(__lo256(__b) | __hi256(__b));
154 else
155 __assert_unreachable<_Tp>();
156 }
157 }
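// Example (illustrative only): with SSE4.1 a 16-byte __is_zero lowers to a
// single PTEST, i.e. it is equivalent to
//   _mm_testz_si128(__v, __v) != 0
// while the constant-evaluation fallback ORs the two 64-bit halves and
// compares the result against zero.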
158
159// }}}
160// __movemask{{{
161template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
162 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
163 __movemask(_Tp __a)
164 {
165 if constexpr (sizeof(_Tp) == 32)
166 {
167 if constexpr (_TVT::template _S_is<float>)
168 return _mm256_movemask_ps(__to_intrin(__a));
169 else if constexpr (_TVT::template _S_is<double>)
170 return _mm256_movemask_pd(__to_intrin(__a));
171 else
172 return _mm256_movemask_epi8(__to_intrin(__a));
173 }
174 else if constexpr (_TVT::template _S_is<float>)
175 return _mm_movemask_ps(__to_intrin(__a));
176 else if constexpr (_TVT::template _S_is<double>)
177 return _mm_movemask_pd(__to_intrin(__a));
178 else
179 return _mm_movemask_epi8(__to_intrin(__a));
180 }
181
182// }}}
183// __testz{{{
184template <typename _TI, typename _TVT = _VectorTraits<_TI>>
185 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
186 __testz(_TI __a, _TI __b)
187 {
188 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
189 _TVT::_S_full_size>>);
190 if (!__builtin_is_constant_evaluated())
191 {
192 if constexpr (sizeof(_TI) == 32)
193 {
194 if constexpr (_TVT::template _S_is<float>)
195 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
196 else if constexpr (_TVT::template _S_is<double>)
197 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
198 else
199 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
200 }
201 else if constexpr (_TVT::template _S_is<float> && __have_avx)
202 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
203 else if constexpr (_TVT::template _S_is<double> && __have_avx)
204 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
205 else if constexpr (__have_sse4_1)
206 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
207 __intrin_bitcast<__m128i>(__to_intrin(__b)));
208 else
209 return __movemask(0 == __and(__a, __b)) != 0;
210 }
211 else
212 return __is_zero(__and(__a, __b));
213 }
214
215// }}}
216// __testc{{{
217// requires SSE4.1 or above
218template <typename _TI, typename _TVT = _VectorTraits<_TI>>
219 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
220 __testc(_TI __a, _TI __b)
221 {
222 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
223 _TVT::_S_full_size>>);
224 if (__builtin_is_constant_evaluated())
225 return __is_zero(__andnot(__a, __b));
226
227 if constexpr (sizeof(_TI) == 32)
228 {
229 if constexpr (_TVT::template _S_is<float>)
230 return _mm256_testc_ps(__a, __b);
231 else if constexpr (_TVT::template _S_is<double>)
232 return _mm256_testc_pd(__a, __b);
233 else
234 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
235 }
236 else if constexpr (_TVT::template _S_is<float> && __have_avx)
237 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
238 else if constexpr (_TVT::template _S_is<double> && __have_avx)
239 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
240 else
241 {
242 static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
243 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
244 __intrin_bitcast<__m128i>(__to_intrin(__b)));
245 }
246 }
247
248// }}}
249// __testnzc{{{
250template <typename _TI, typename _TVT = _VectorTraits<_TI>>
251 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
252 __testnzc(_TI __a, _TI __b)
253 {
254 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
255 _TVT::_S_full_size>>);
256 if (!__builtin_is_constant_evaluated())
257 {
258 if constexpr (sizeof(_TI) == 32)
259 {
260 if constexpr (_TVT::template _S_is<float>)
261 return _mm256_testnzc_ps(__a, __b);
262 else if constexpr (_TVT::template _S_is<double>)
263 return _mm256_testnzc_pd(__a, __b);
264 else
265 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
266 }
267 else if constexpr (_TVT::template _S_is<float> && __have_avx)
268 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
269 else if constexpr (_TVT::template _S_is<double> && __have_avx)
270 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
271 else if constexpr (__have_sse4_1)
272 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
273 __intrin_bitcast<__m128i>(__to_intrin(__b)));
274 else
275 return __movemask(0 == __and(__a, __b)) == 0
276 && __movemask(0 == __andnot(__a, __b)) == 0;
277 }
278 else
279 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
280 }
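// Summary (illustrative only) of the PTEST-style predicates above, in terms
// of the integer forms:
//   __testz(__a, __b)   != 0  <=>  (__a & __b)  == 0    (ZF)
//   __testc(__a, __b)   != 0  <=>  (~__a & __b) == 0    (CF)
//   __testnzc(__a, __b) != 0  <=>  neither of the above
// (the vtestps/vtestpd forms test only the sign bits). This matches the
// constant-evaluation fallbacks built from __and/__andnot above.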
281
282// }}}
283// __xzyw{{{
// shuffles the complete vector, swapping the inner two quarters. Often useful
// on AVX to fix up a shuffle result.
286template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
287 _GLIBCXX_SIMD_INTRINSIC _Tp
288 __xzyw(_Tp __a)
289 {
290 if constexpr (sizeof(_Tp) == 16)
291 {
292 const auto __x = __vector_bitcast<conditional_t<
293 is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
294 return reinterpret_cast<_Tp>(
295 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
296 }
297 else if constexpr (sizeof(_Tp) == 32)
298 {
299 const auto __x = __vector_bitcast<conditional_t<
300 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
301 return reinterpret_cast<_Tp>(
302 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
303 }
304 else if constexpr (sizeof(_Tp) == 64)
305 {
306 const auto __x = __vector_bitcast<conditional_t<
307 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
308 return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
309 __x[5], __x[2], __x[3],
310 __x[6], __x[7]});
311 }
312 else
313 __assert_unreachable<_Tp>();
314 }
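// Example (illustrative only), for eight 32-bit elements in a 256-bit vector:
//   __xzyw({0, 1, 2, 3, 4, 5, 6, 7}) == {0, 1, 4, 5, 2, 3, 6, 7}
// i.e. quarters [q0 q1 q2 q3] become [q0 q2 q1 q3].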
315
316// }}}
317// __maskload_epi32{{{
318template <typename _Tp>
319 _GLIBCXX_SIMD_INTRINSIC auto
320 __maskload_epi32(const int* __ptr, _Tp __k)
321 {
322 if constexpr (sizeof(__k) == 16)
323 return _mm_maskload_epi32(__ptr, __k);
324 else
325 return _mm256_maskload_epi32(__ptr, __k);
326 }
327
328// }}}
329// __maskload_epi64{{{
330template <typename _Tp>
331 _GLIBCXX_SIMD_INTRINSIC auto
332 __maskload_epi64(const _LLong* __ptr, _Tp __k)
333 {
334 if constexpr (sizeof(__k) == 16)
335 return _mm_maskload_epi64(__ptr, __k);
336 else
337 return _mm256_maskload_epi64(__ptr, __k);
338 }
339
340// }}}
341// __maskload_ps{{{
342template <typename _Tp>
343 _GLIBCXX_SIMD_INTRINSIC auto
344 __maskload_ps(const float* __ptr, _Tp __k)
345 {
346 if constexpr (sizeof(__k) == 16)
347 return _mm_maskload_ps(__ptr, __k);
348 else
349 return _mm256_maskload_ps(__ptr, __k);
350 }
351
352// }}}
353// __maskload_pd{{{
354template <typename _Tp>
355 _GLIBCXX_SIMD_INTRINSIC auto
356 __maskload_pd(const double* __ptr, _Tp __k)
357 {
358 if constexpr (sizeof(__k) == 16)
359 return _mm_maskload_pd(__ptr, __k);
360 else
361 return _mm256_maskload_pd(__ptr, __k);
362 }
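// Example (illustrative only): the AVX maskload instructions load element __i
// iff the most significant bit of __k[__i] is set and zero the element
// otherwise, e.g.
//   const float __data[4] = {1, 2, 3, 4};
//   __maskload_ps(__data, _mm_setr_epi32(-1, 0, -1, 0)); // {1.f, 0.f, 3.f, 0.f}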
363
364// }}}
365
366#ifdef __clang__
367template <size_t _Np, typename _Tp, typename _Kp>
368 _GLIBCXX_SIMD_INTRINSIC constexpr auto
369 __movm(_Kp __k) noexcept
370 {
371 static_assert(is_unsigned_v<_Kp>);
372 if constexpr (sizeof(_Tp) == 1 && __have_avx512bw)
373 {
374 if constexpr (_Np <= 16 && __have_avx512vl)
375 return __builtin_ia32_cvtmask2b128(__k);
376 else if constexpr (_Np <= 32 && __have_avx512vl)
377 return __builtin_ia32_cvtmask2b256(__k);
378 else
379 return __builtin_ia32_cvtmask2b512(__k);
380 }
381 else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw)
382 {
383 if constexpr (_Np <= 8 && __have_avx512vl)
384 return __builtin_ia32_cvtmask2w128(__k);
385 else if constexpr (_Np <= 16 && __have_avx512vl)
386 return __builtin_ia32_cvtmask2w256(__k);
387 else
388 return __builtin_ia32_cvtmask2w512(__k);
389 }
390 else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq)
391 {
392 if constexpr (_Np <= 4 && __have_avx512vl)
393 return __builtin_ia32_cvtmask2d128(__k);
394 else if constexpr (_Np <= 8 && __have_avx512vl)
395 return __builtin_ia32_cvtmask2d256(__k);
396 else
397 return __builtin_ia32_cvtmask2d512(__k);
398 }
399 else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq)
400 {
401 if constexpr (_Np <= 2 && __have_avx512vl)
402 return __builtin_ia32_cvtmask2q128(__k);
403 else if constexpr (_Np <= 4 && __have_avx512vl)
404 return __builtin_ia32_cvtmask2q256(__k);
405 else
406 return __builtin_ia32_cvtmask2q512(__k);
407 }
408 else
409 __assert_unreachable<_Tp>();
410 }
411#endif // __clang__
412
413#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
414#include "simd_x86_conversions.h"
415#endif
416
417// ISA & type detection {{{
418template <typename _Tp, size_t _Np>
419 constexpr bool
420 __is_sse_ps()
421 {
422 return __have_sse
423 && is_same_v<_Tp,
424 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
425 }
426
427template <typename _Tp, size_t _Np>
428 constexpr bool
429 __is_sse_pd()
430 {
431 return __have_sse2
432 && is_same_v<_Tp,
433 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
434 }
435
436template <typename _Tp, size_t _Np>
437 constexpr bool
438 __is_avx_ps()
439 {
440 return __have_avx
441 && is_same_v<_Tp,
442 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
443 }
444
445template <typename _Tp, size_t _Np>
446 constexpr bool
447 __is_avx_pd()
448 {
449 return __have_avx
450 && is_same_v<_Tp,
451 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
452 }
453
454template <typename _Tp, size_t _Np>
455 constexpr bool
456 __is_avx512_ps()
457 {
458 return __have_avx512f
459 && is_same_v<_Tp,
460 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
461 }
462
463template <typename _Tp, size_t _Np>
464 constexpr bool
465 __is_avx512_pd()
466 {
467 return __have_avx512f
468 && is_same_v<_Tp,
469 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
470 }
471
472// }}}
473struct _MaskImplX86Mixin;
474
475// _CommonImplX86 {{{
476struct _CommonImplX86 : _CommonImplBuiltin
477{
478#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
479 // _S_converts_via_decomposition {{{
480 template <typename _From, typename _To, size_t _ToSize>
481 static constexpr bool
482 _S_converts_via_decomposition()
483 {
484 if constexpr (is_integral_v<
485 _From> && is_integral_v<_To> && sizeof(_From) == 8
486 && _ToSize == 16)
487 return (sizeof(_To) == 2 && !__have_ssse3)
488 || (sizeof(_To) == 1 && !__have_avx512f);
489 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
490 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
491 && !__have_avx512dq)
492 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
493 && _ToSize == 16);
494 else if constexpr (
495 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
496 && !__have_avx512dq)
497 return (sizeof(_To) == 4 && _ToSize == 16)
498 || (sizeof(_To) == 8 && _ToSize < 64);
499 else
500 return false;
501 }
502
503 template <typename _From, typename _To, size_t _ToSize>
504 static inline constexpr bool __converts_via_decomposition_v
505 = _S_converts_via_decomposition<_From, _To, _ToSize>();
506
507 // }}}
508#endif
509 // _S_store {{{
510 using _CommonImplBuiltin::_S_store;
511
512 template <typename _Tp, size_t _Np>
513 _GLIBCXX_SIMD_INTRINSIC static constexpr void
514 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
515 {
516 constexpr size_t _Bytes = _Np * sizeof(_Tp);
517
518 if (__builtin_is_constant_evaluated())
519 _CommonImplBuiltin::_S_store(__x, __addr);
520 else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
521 {
522 const auto __v = __to_intrin(__x);
523
524 if constexpr (_Bytes & 1)
525 {
526 if constexpr (_Bytes < 16)
527 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
528 __intrin_bitcast<__m128i>(__v));
529 else if constexpr (_Bytes < 32)
530 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
531 __intrin_bitcast<__m256i>(__v));
532 else
533 _mm512_mask_storeu_epi8(__addr,
534 0xffffffffffffffffull >> (64 - _Bytes),
535 __intrin_bitcast<__m512i>(__v));
536 }
537 else if constexpr (_Bytes & 2)
538 {
539 if constexpr (_Bytes < 16)
540 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
541 __intrin_bitcast<__m128i>(__v));
542 else if constexpr (_Bytes < 32)
543 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
544 __intrin_bitcast<__m256i>(__v));
545 else
546 _mm512_mask_storeu_epi16(__addr,
547 0xffffffffull >> (32 - _Bytes / 2),
548 __intrin_bitcast<__m512i>(__v));
549 }
550 else if constexpr (_Bytes & 4)
551 {
552 if constexpr (_Bytes < 16)
553 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
554 __intrin_bitcast<__m128i>(__v));
555 else if constexpr (_Bytes < 32)
556 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
557 __intrin_bitcast<__m256i>(__v));
558 else
559 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
560 __intrin_bitcast<__m512i>(__v));
561 }
562 else
563 {
564 static_assert(
565 _Bytes > 16,
566 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
567 "- 1)) != 0 is impossible");
568 if constexpr (_Bytes < 32)
569 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
570 __intrin_bitcast<__m256i>(__v));
571 else
572 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
573 __intrin_bitcast<__m512i>(__v));
574 }
575 }
576 else
577 _CommonImplBuiltin::_S_store(__x, __addr);
578 }
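  // Worked example (illustrative only), assuming AVX512BW+VL: storing a
  // _SimdWrapper<float, 3> gives _Bytes == 12, which is not a power of two,
  // so the branch above emits
  //   _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - 12 / 4), __v); // mask 0x7
  // i.e. a single masked store of three 32-bit elements that leaves the
  // fourth element's memory untouched.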
579
580 // }}}
581 // _S_store_bool_array(_BitMask) {{{
582 template <size_t _Np, bool _Sanitized>
583 _GLIBCXX_SIMD_INTRINSIC static constexpr void
584 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
585 {
586 if (__builtin_is_constant_evaluated())
587 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
588 else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
589 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
590 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
591 if constexpr (_Np <= 16)
592 return _mm_movm_epi8(__x._M_to_bits());
593 else if constexpr (_Np <= 32)
594 return _mm256_movm_epi8(__x._M_to_bits());
595 else if constexpr (_Np <= 64)
596 return _mm512_movm_epi8(__x._M_to_bits());
597 else
598 __assert_unreachable<_SizeConstant<_Np>>();
599 }()),
600 __mem);
601 else if constexpr (__have_bmi2)
602 {
603 if constexpr (_Np <= 4)
604 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
605 else
	    __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
607 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
608 constexpr size_t __offset = __i * sizeof(size_t);
		constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
610 if constexpr (__todo == 1)
611 __mem[__offset] = __x[__offset];
612 else
613 {
614 const auto __bools =
615#ifdef __x86_64__
616 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
617 0x0101010101010101ULL);
618#else // __x86_64__
619 _pdep_u32(
620 __x.template _M_extract<__offset>()._M_to_bits(),
621 0x01010101U);
622#endif // __x86_64__
623 _S_store<__todo>(__bools, __mem + __offset);
624 }
625 });
626 }
627 else if constexpr (__have_sse2 && _Np > 7)
	__execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
629 constexpr int __offset = __i * 16;
	  constexpr int __todo = std::min(16, int(_Np) - __offset);
631 const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
632 __vector_type16_t<_UChar> __bools;
633 if constexpr (__have_avx512f)
634 {
635 auto __as32bits
		= _mm512_maskz_mov_epi32(__bits,
					 __to_intrin(__vector_broadcast<16>(1)));
638 auto __as16bits
		= __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
					    __todo > 8 ? __hi256(__as32bits)
						       : __m256i()));
	      __bools = __vector_bitcast<_UChar>(
		_mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
644 }
645 else
646 {
647 using _V = __vector_type_t<_UChar, 16>;
	      auto __tmp = _mm_cvtsi32_si128(__bits);
	      __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
	      __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
	      __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
652 _V __tmp2 = reinterpret_cast<_V>(__tmp);
653 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
654 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
655 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
656 }
657 _S_store<__todo>(__bools, __mem + __offset);
658 });
659 else
660 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
661 }
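  // Worked example (illustrative only) for the BMI2 path with _Np == 4: PDEP
  // spreads one mask bit into each byte, e.g.
  //   _pdep_u32(0b1011, 0x01010101U) == 0x01000101
  // which, stored little-endian, writes the bool sequence
  // {true, true, false, true}.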
662
663 // }}}
664 // _S_blend_avx512 {{{
665 // Returns: __k ? __b : __a
666 // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _TV to be a __vector_type_t matching the value type for the
  // bitmask __k
669 template <typename _Kp, typename _TV>
670 _GLIBCXX_SIMD_INTRINSIC static _TV
671 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
672 {
673 static_assert(__is_vector_type_v<_TV>);
674 using _Tp = typename _VectorTraits<_TV>::value_type;
675 static_assert(sizeof(_TV) >= 16);
676 static_assert(sizeof(_Tp) <= 8);
677#ifdef __clang__
678 return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
679#else
680 using _IntT
681 = conditional_t<(sizeof(_Tp) > 2),
682 conditional_t<sizeof(_Tp) == 4, int, long long>,
683 conditional_t<sizeof(_Tp) == 1, char, short>>;
684 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
685 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
686 if constexpr (sizeof(_TV) == 64)
687 {
688 if constexpr (sizeof(_Tp) == 1)
689 return reinterpret_cast<_TV>(
690 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
691 else if constexpr (sizeof(_Tp) == 2)
692 return reinterpret_cast<_TV>(
693 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
694 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
695 return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
696 else if constexpr (sizeof(_Tp) == 4)
697 return reinterpret_cast<_TV>(
698 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
699 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
700 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
701 else if constexpr (sizeof(_Tp) == 8)
702 return reinterpret_cast<_TV>(
703 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
704 }
705 else if constexpr (sizeof(_TV) == 32)
706 {
707 if constexpr (sizeof(_Tp) == 1)
708 return reinterpret_cast<_TV>(
709 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
710 else if constexpr (sizeof(_Tp) == 2)
711 return reinterpret_cast<_TV>(
712 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
713 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
714 return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
715 else if constexpr (sizeof(_Tp) == 4)
716 return reinterpret_cast<_TV>(
717 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
718 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
719 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
720 else if constexpr (sizeof(_Tp) == 8)
721 return reinterpret_cast<_TV>(
722 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
723 }
724 else if constexpr (sizeof(_TV) == 16)
725 {
726 if constexpr (sizeof(_Tp) == 1)
727 return reinterpret_cast<_TV>(
728 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
729 else if constexpr (sizeof(_Tp) == 2)
730 return reinterpret_cast<_TV>(
731 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
732 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
733 return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
734 else if constexpr (sizeof(_Tp) == 4)
735 return reinterpret_cast<_TV>(
736 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
737 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
738 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
739 else if constexpr (sizeof(_Tp) == 8)
740 return reinterpret_cast<_TV>(
741 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
742 }
743#endif
744 }
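  // Example (illustrative only): bit __i of __k selects element __i of __b,
  // cleared bits keep the corresponding element of __a, e.g.
  //   _S_blend_avx512(0b0101, __a, __b) == {__b[0], __a[1], __b[2], __a[3], ...}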
745
746 // }}}
747 // _S_blend_intrin {{{
748 // Returns: __k ? __b : __a
749 // TODO: reverse __a and __b to match COND_EXPR
750 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
751 // Bytes wide
752 template <typename _Tp>
753 _GLIBCXX_SIMD_INTRINSIC static _Tp
754 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
755 {
756 static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
757 constexpr struct
758 {
759 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
760 __m128 __k) const noexcept
761 {
762 return __builtin_ia32_blendvps(__a, __b, __k);
763 }
764 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
765 __m128d __k) const noexcept
766 {
767 return __builtin_ia32_blendvpd(__a, __b, __k);
768 }
769 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
770 __m128i __k) const noexcept
771 {
772 return reinterpret_cast<__m128i>(
773 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
774 reinterpret_cast<__v16qi>(__b),
775 reinterpret_cast<__v16qi>(__k)));
776 }
777 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
778 __m256 __k) const noexcept
779 {
780 return __builtin_ia32_blendvps256(__a, __b, __k);
781 }
782 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
783 __m256d __k) const noexcept
784 {
785 return __builtin_ia32_blendvpd256(__a, __b, __k);
786 }
787 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
788 __m256i __k) const noexcept
789 {
790 if constexpr (__have_avx2)
791 return reinterpret_cast<__m256i>(
792 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
793 reinterpret_cast<__v32qi>(__b),
794 reinterpret_cast<__v32qi>(__k)));
795 else
796 return reinterpret_cast<__m256i>(
797 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
798 reinterpret_cast<__v8sf>(__b),
799 reinterpret_cast<__v8sf>(__k)));
800 }
801 } __eval;
802 return __eval(__a, __b, __k);
803 }
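  // Example (illustrative only): the BLENDV variants select on the most
  // significant bit of each mask byte (integer blends) or mask element
  // (ps/pd blends), e.g.
  //   _S_blend_intrin(_mm_set1_epi8(char(0x80)), __a, __b) // all MSBs set -> __b
  //   _S_blend_intrin(_mm_setzero_si128(), __a, __b)       // no MSB set   -> __a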
804
805 // }}}
806 // _S_blend {{{
807 // Returns: __k ? __at1 : __at0
808 // TODO: reverse __at0 and __at1 to match COND_EXPR
809 template <typename _Tp, size_t _Np>
810 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
811 _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
812 _SimdWrapper<_Tp, _Np> __at1)
813 {
814 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
815 if (__k._M_is_constprop() && __at0._M_is_constprop()
816 && __at1._M_is_constprop())
817 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
818 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
819 return __k[__i] ? __at1[__i] : __at0[__i];
820 });
821 else if constexpr (sizeof(__at0) == 64
822 || (__have_avx512vl && sizeof(__at0) >= 16))
823 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
824 else
825 {
826 static_assert((__have_avx512vl && sizeof(__at0) < 16)
827 || !__have_avx512vl);
828 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
829 return __vector_bitcast<_Tp, _Np>(
830 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
831 __vector_bitcast<_Tp, __size>(__at1)));
832 }
833 }
834
835 template <typename _Tp, size_t _Np>
836 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
837 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
838 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
839 {
840 const auto __kk = __wrapper_bitcast<_Tp>(__k);
841 if (__builtin_is_constant_evaluated()
842 || (__kk._M_is_constprop() && __at0._M_is_constprop()
843 && __at1._M_is_constprop()))
844 {
845 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
846 if (__r._M_is_constprop())
847 return __r;
848 }
849 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
850 && (sizeof(_Tp) >= 4 || __have_avx512bw))
851 // convert to bitmask and call overload above
852 return _S_blend(
853 _SimdWrapper<bool, _Np>(
854 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
855 ._M_to_bits()),
856 __at0, __at1);
857 else
858 {
	    // Since GCC does not assume __k to be a mask, using the builtin
	    // conditional operator introduces an extra compare against 0 before
	    // blending. So we call the intrinsic here instead.
862 if constexpr (__have_sse4_1)
863 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
864 __to_intrin(__at1));
865 else
866 return __or(__andnot(__kk, __at0), __and(__kk, __at1));
867 }
868 }
869
870 // }}}
871};
872
873// }}}
874// _SimdImplX86 {{{
875template <typename _Abi>
876 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
877 {
878 using _Base = _SimdImplBuiltin<_Abi>;
879
880 template <typename _Tp>
881 using _MaskMember = typename _Base::template _MaskMember<_Tp>;
882
883 template <typename _Tp>
884 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
885
886 template <typename _Tp>
887 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
888
889 template <typename _Tp>
890 static constexpr size_t _S_max_store_size
891 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
892 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32
893 : 16;
894
895 using _MaskImpl = typename _Abi::_MaskImpl;
896
897 // _S_masked_load {{{
898 template <typename _Tp, size_t _Np, typename _Up>
899 static inline _SimdWrapper<_Tp, _Np>
900 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
901 const _Up* __mem) noexcept
902 {
903 static_assert(_Np == _S_size<_Tp>);
904 if constexpr (is_same_v<_Tp, _Up> || // no conversion
905 (sizeof(_Tp) == sizeof(_Up)
906 && is_integral_v<
907 _Tp> == is_integral_v<_Up>) // conversion via bit
908 // reinterpretation
909 )
910 {
911 [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
912 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
913 && sizeof(_Tp) == 1)
914 {
915 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
916 if constexpr (sizeof(__intrin) == 16)
917 __merge = __vector_bitcast<_Tp, _Np>(
918 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
919 else if constexpr (sizeof(__merge) == 32)
920 __merge = __vector_bitcast<_Tp, _Np>(
921 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
922 else if constexpr (sizeof(__merge) == 64)
923 __merge = __vector_bitcast<_Tp, _Np>(
924 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
925 else
926 __assert_unreachable<_Tp>();
927 }
928 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
929 && sizeof(_Tp) == 2)
930 {
931 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
932 if constexpr (sizeof(__intrin) == 16)
933 __merge = __vector_bitcast<_Tp, _Np>(
934 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
935 else if constexpr (sizeof(__intrin) == 32)
936 __merge = __vector_bitcast<_Tp, _Np>(
937 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
938 else if constexpr (sizeof(__intrin) == 64)
939 __merge = __vector_bitcast<_Tp, _Np>(
940 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
941 else
942 __assert_unreachable<_Tp>();
943 }
944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
945 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
946 {
947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
948 if constexpr (sizeof(__intrin) == 16)
949 __merge = __vector_bitcast<_Tp, _Np>(
950 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
951 else if constexpr (sizeof(__intrin) == 32)
952 __merge = __vector_bitcast<_Tp, _Np>(
953 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
954 else if constexpr (sizeof(__intrin) == 64)
955 __merge = __vector_bitcast<_Tp, _Np>(
956 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
957 else
958 __assert_unreachable<_Tp>();
959 }
960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
961 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
962 {
963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
964 if constexpr (sizeof(__intrin) == 16)
965 __merge = __vector_bitcast<_Tp, _Np>(
966 _mm_mask_loadu_ps(__intrin, __kk, __mem));
967 else if constexpr (sizeof(__intrin) == 32)
968 __merge = __vector_bitcast<_Tp, _Np>(
969 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
970 else if constexpr (sizeof(__intrin) == 64)
971 __merge = __vector_bitcast<_Tp, _Np>(
972 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
973 else
974 __assert_unreachable<_Tp>();
975 }
976 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
977 && is_integral_v<_Up>)
978 {
979 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
980 __merge
981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
982 __vector_bitcast<_Tp, _Np>(
983 __maskload_epi32(reinterpret_cast<const int*>(__mem),
984 __to_intrin(__k))));
985 }
986 else if constexpr (__have_avx && sizeof(_Tp) == 4)
987 {
988 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
989 __merge
990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
991 __vector_bitcast<_Tp, _Np>(
992 __maskload_ps(reinterpret_cast<const float*>(__mem),
993 __to_intrin(__k))));
994 }
995 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
996 && sizeof(_Tp) == 8 && is_integral_v<_Up>)
997 {
998 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
999 if constexpr (sizeof(__intrin) == 16)
1000 __merge = __vector_bitcast<_Tp, _Np>(
1001 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
1002 else if constexpr (sizeof(__intrin) == 32)
1003 __merge = __vector_bitcast<_Tp, _Np>(
1004 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
1005 else if constexpr (sizeof(__intrin) == 64)
1006 __merge = __vector_bitcast<_Tp, _Np>(
1007 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
1008 else
1009 __assert_unreachable<_Tp>();
1010 }
1011 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
1012 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
1013 {
1014 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1015 if constexpr (sizeof(__intrin) == 16)
1016 __merge = __vector_bitcast<_Tp, _Np>(
1017 _mm_mask_loadu_pd(__intrin, __kk, __mem));
1018 else if constexpr (sizeof(__intrin) == 32)
1019 __merge = __vector_bitcast<_Tp, _Np>(
1020 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
1021 else if constexpr (sizeof(__intrin) == 64)
1022 __merge = __vector_bitcast<_Tp, _Np>(
1023 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
1024 else
1025 __assert_unreachable<_Tp>();
1026 }
1027 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1028 && is_integral_v<_Up>)
1029 {
1030 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
1031 __merge
1032 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1033 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
1034 reinterpret_cast<const _LLong*>(__mem),
1035 __to_intrin(__k))));
1036 }
1037 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1038 {
1039 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
1040 __merge
1041 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1042 __vector_bitcast<_Tp, _Np>(
1043 __maskload_pd(reinterpret_cast<const double*>(__mem),
1044 __to_intrin(__k))));
1045 }
1046 else
1047 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1048 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1049 __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1050 });
1051 }
	/* Very uncertain that the following improves anything. Needs
	 * benchmarking before it's activated.
1055 else if constexpr (sizeof(_Up) <= 8 && // no long double
1056 !__converts_via_decomposition_v<
1057 _Up, _Tp,
1058 sizeof(__merge)> // conversion via decomposition
1059 // is better handled via the
1060 // bit_iteration fallback below
1061 )
1062 {
1063 // TODO: copy pattern from _S_masked_store, which doesn't resort to
1064 // fixed_size
1065 using _Ap = simd_abi::deduce_t<_Up, _Np>;
1066 using _ATraits = _SimdTraits<_Up, _Ap>;
1067 using _AImpl = typename _ATraits::_SimdImpl;
1068 typename _ATraits::_SimdMember __uncvted{};
1069 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
1070 _S_convert<_Up>(__k);
1071 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
1072 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
1073 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
1074 }
1075 */
1076 else
1077 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1078 return __merge;
1079 }
1080
1081 // }}}
1082 // _S_masked_store_nocvt {{{
1083 template <typename _Tp, size_t _Np>
1084 _GLIBCXX_SIMD_INTRINSIC static void
1085 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper<bool, _Np> __k)
1086 {
1087 [[maybe_unused]] const auto __vi = __to_intrin(__v);
1088 if constexpr (sizeof(__vi) == 64)
1089 {
1090 static_assert(sizeof(__v) == 64 && __have_avx512f);
1091 if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
1092 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1094 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1095 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1096 {
1097 if constexpr (is_integral_v<_Tp>)
1098 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1099 else
1100 _mm512_mask_storeu_ps(__mem, __k, __vi);
1101 }
1102 else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
1103 {
1104 if constexpr (is_integral_v<_Tp>)
1105 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1106 else
1107 _mm512_mask_storeu_pd(__mem, __k, __vi);
1108 }
1109#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32
1110 // with Skylake-AVX512, __have_avx512bw is true
1111 else if constexpr (__have_sse2)
1112 {
1113 using _M = __vector_type_t<_Tp, _Np>;
1114 using _MVT = _VectorTraits<_M>;
1115 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1116 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1117 reinterpret_cast<char*>(__mem));
1118 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1119 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1120 __k._M_data >> 1 * _MVT::_S_full_size)),
1121 reinterpret_cast<char*>(__mem) + 1 * 16);
1122 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1123 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1124 __k._M_data >> 2 * _MVT::_S_full_size)),
1125 reinterpret_cast<char*>(__mem) + 2 * 16);
1126 if constexpr (_Np > 48 / sizeof(_Tp))
1127 _mm_maskmoveu_si128(
1128 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1129 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1130 __k._M_data >> 3 * _MVT::_S_full_size)),
1131 reinterpret_cast<char*>(__mem) + 3 * 16);
1132 }
1133#endif
1134 else
1135 __assert_unreachable<_Tp>();
1136 }
1137 else if constexpr (sizeof(__vi) == 32)
1138 {
1139 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1140 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1142 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1143 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1144 {
1145 if constexpr (is_integral_v<_Tp>)
1146 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1147 else
1148 _mm256_mask_storeu_ps(__mem, __k, __vi);
1149 }
1150 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1151 {
1152 if constexpr (is_integral_v<_Tp>)
1153 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1154 else
1155 _mm256_mask_storeu_pd(__mem, __k, __vi);
1156 }
1157 else if constexpr (__have_avx512f
1158 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1159 {
1160 // use a 512-bit maskstore, using zero-extension of the bitmask
1161 _S_masked_store_nocvt(
1162 _SimdWrapper64<_Tp>(
1163 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1164 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1165 }
1166 else
1167 _S_masked_store_nocvt(__v, __mem,
1168 _MaskImpl::template _S_to_maskvector<
1169 __int_for_sizeof_t<_Tp>, _Np>(__k));
1170 }
1171 else if constexpr (sizeof(__vi) == 16)
1172 {
1173 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1174 _mm_mask_storeu_epi8(__mem, __k, __vi);
1175 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1176 _mm_mask_storeu_epi16(__mem, __k, __vi);
1177 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1178 {
1179 if constexpr (is_integral_v<_Tp>)
1180 _mm_mask_storeu_epi32(__mem, __k, __vi);
1181 else
1182 _mm_mask_storeu_ps(__mem, __k, __vi);
1183 }
1184 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1185 {
1186 if constexpr (is_integral_v<_Tp>)
1187 _mm_mask_storeu_epi64(__mem, __k, __vi);
1188 else
1189 _mm_mask_storeu_pd(__mem, __k, __vi);
1190 }
1191 else if constexpr (__have_avx512f
1192 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1193 {
1194 // use a 512-bit maskstore, using zero-extension of the bitmask
1195 _S_masked_store_nocvt(
1196 _SimdWrapper64<_Tp>(
1197 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1198 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1199 }
1200 else
1201 _S_masked_store_nocvt(__v, __mem,
1202 _MaskImpl::template _S_to_maskvector<
1203 __int_for_sizeof_t<_Tp>, _Np>(__k));
1204 }
1205 else
1206 __assert_unreachable<_Tp>();
1207 }
1208
1209 template <typename _Tp, size_t _Np>
1210 _GLIBCXX_SIMD_INTRINSIC static void
1211 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1212 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1213 {
1214 if constexpr (sizeof(__v) <= 16)
1215 {
1216 [[maybe_unused]] const auto __vi
1217 = __intrin_bitcast<__m128i>(__as_vector(__v));
1218 [[maybe_unused]] const auto __ki
1219 = __intrin_bitcast<__m128i>(__as_vector(__k));
1220 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1221 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1222 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1223 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1224 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1225 && is_integral_v<_Tp>)
1226 _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1227 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1228 _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1229 __vector_bitcast<float>(__vi));
1230 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1231 && is_integral_v<_Tp>)
1232 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1233 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1234 _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1235 __vector_bitcast<double>(__vi));
1236 else if constexpr (__have_sse2)
1237 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
1238 }
1239 else if constexpr (sizeof(__v) == 32)
1240 {
1241 [[maybe_unused]] const auto __vi
1242 = __intrin_bitcast<__m256i>(__as_vector(__v));
1243 [[maybe_unused]] const auto __ki
1244 = __intrin_bitcast<__m256i>(__as_vector(__k));
1245 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1246 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1247 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1248 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1249 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1250 && is_integral_v<_Tp>)
1251 _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1252 else if constexpr (sizeof(_Tp) == 4)
1253 _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1254 __vector_bitcast<float>(__v));
1255 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1256 && is_integral_v<_Tp>)
1257 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1258 __vi);
1259 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1260 _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1261 __vector_bitcast<double>(__v));
1262 else if constexpr (__have_sse2)
1263 {
1264 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1265 reinterpret_cast<char*>(__mem));
1266 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1267 reinterpret_cast<char*>(__mem) + 16);
1268 }
1269 }
1270 else
1271 __assert_unreachable<_Tp>();
1272 }
1273
1274 // }}}
1275 // _S_masked_store {{{
1276 template <typename _Tp, size_t _Np, typename _Up>
1277 _GLIBCXX_SIMD_INTRINSIC static void
1278 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1279 const _MaskMember<_Tp> __k) noexcept
1280 {
1281 if constexpr (is_integral_v<
1282 _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up)
1283 && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw)
1284 && (sizeof(__v) == 64 || __have_avx512vl))
1285 { // truncating store
1286 const auto __vi = __to_intrin(__v);
1287 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1288 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1289 && sizeof(__vi) == 64)
1290 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1291 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1292 && sizeof(__vi) == 32)
1293 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1294 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1295 && sizeof(__vi) == 16)
1296 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1297 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1298 && sizeof(__vi) == 64)
1299 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1300 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1301 && sizeof(__vi) == 32)
1302 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1303 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1304 && sizeof(__vi) == 16)
1305 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1306 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1307 && sizeof(__vi) == 64)
1308 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1309 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1310 && sizeof(__vi) == 32)
1311 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1312 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1313 && sizeof(__vi) == 16)
1314 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1315 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1316 && sizeof(__vi) == 64)
1317 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1318 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1319 && sizeof(__vi) == 32)
1320 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1321 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1322 && sizeof(__vi) == 16)
1323 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1324 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1325 && sizeof(__vi) == 64)
1326 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1327 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1328 && sizeof(__vi) == 32)
1329 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1330 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1331 && sizeof(__vi) == 16)
1332 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1333 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1334 && sizeof(__vi) == 64)
1335 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1336 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1337 && sizeof(__vi) == 32)
1338 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1339 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1340 && sizeof(__vi) == 16)
1341 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1342 else
1343 __assert_unreachable<_Tp>();
1344 }
1345 else
1346 _Base::_S_masked_store(__v, __mem, __k);
1347 }
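    // Example (illustrative only), assuming AVX512F: masked-storing a
    // _SimdWrapper<long long, 8> to an int array takes the truncating branch
    // above and becomes a single down-converting masked store:
    //   _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);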
1348
1349 // }}}
1350 // _S_multiplies {{{
1351 template <typename _V, typename _VVT = _VectorTraits<_V>>
1352 _GLIBCXX_SIMD_INTRINSIC static constexpr _V
1353 _S_multiplies(_V __x, _V __y)
1354 {
1355 using _Tp = typename _VVT::value_type;
1356 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1357 || __y._M_is_constprop())
1358 return __as_vector(__x) * __as_vector(__y);
1359 else if constexpr (sizeof(_Tp) == 1)
1360 {
1361 if constexpr (sizeof(_V) == 2)
1362 {
1363 const auto __xs = reinterpret_cast<short>(__x._M_data);
1364 const auto __ys = reinterpret_cast<short>(__y._M_data);
1365 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
1366 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1367 }
1368 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1369 {
1370 const auto __xi = reinterpret_cast<int>(__x._M_data);
1371 const auto __yi = reinterpret_cast<int>(__y._M_data);
1372 return reinterpret_cast<__vector_type_t<_Tp, 3>>(
1373 ((__xi * __yi) & 0xff)
1374 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1375 | ((__xi >> 16) * (__yi & 0xff0000)));
1376 }
1377 else if constexpr (sizeof(_V) == 4)
1378 {
1379 const auto __xi = reinterpret_cast<int>(__x._M_data);
1380 const auto __yi = reinterpret_cast<int>(__y._M_data);
1381 return reinterpret_cast<__vector_type_t<_Tp, 4>>(
1382 ((__xi * __yi) & 0xff)
1383 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1384 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1385 | ((__xi >> 24) * (__yi & 0xff000000u)));
1386 }
1387 else if constexpr (sizeof(_V) == 8 && __have_avx2
1388 && is_signed_v<_Tp>)
1389 return __convert<typename _VVT::type>(
1390 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1391 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1392 else if constexpr (sizeof(_V) == 8 && __have_avx2
1393 && is_unsigned_v<_Tp>)
1394 return __convert<typename _VVT::type>(
1395 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1396 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1397 else
1398 {
1399 // codegen of `x*y` is suboptimal (as of GCC 9.0.1)
1400 constexpr size_t __full_size = _VVT::_S_full_size;
1401 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
1402 using _ShortW = _SimdWrapper<short, _Np>;
1403 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1404 * __vector_bitcast<short, _Np>(__y);
1405 _ShortW __high_byte = _ShortW()._M_data - 256;
1406 //[&]() { asm("" : "+x"(__high_byte._M_data)); }();
1407 const _ShortW __odd
1408 = (__vector_bitcast<short, _Np>(__x) >> 8)
1409 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1410 if constexpr (__have_avx512bw && sizeof(_V) > 2)
1411 return _CommonImplX86::_S_blend_avx512(
1412 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1413 __vector_bitcast<_Tp>(__odd));
1414 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1415 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1416 __high_byte),
1417 __to_intrin(__even),
1418 __to_intrin(__odd));
1419 else
1420 return __to_intrin(
1421 __or(__andnot(__high_byte, __even), __odd));
1422 }
1423 }
1424 else
1425 return _Base::_S_multiplies(__x, __y);
1426 }
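    // Sketch of the 8-bit multiply trick above (illustrative only; __x16 and
    // __y16 denote the operands reinterpreted as vectors of short): in every
    // 16-bit lane holding the bytes (lo, hi),
    //   __even = __x16 * __y16                    // low byte  == (lo_x * lo_y) & 0xff
    //   __odd  = (__x16 >> 8) * (__y16 & 0xff00)  // high byte == (hi_x * hi_y) & 0xff
    // so blending the high (odd) bytes of __odd over __even yields the
    // element-wise product modulo 256.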
1427
1428 // }}}
1429 // _S_divides {{{
1430#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
1431 template <typename _Tp, size_t _Np>
1432 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1433 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1434 {
1435 if (!__builtin_is_constant_evaluated()
1436 && !__builtin_constant_p(__y._M_data))
1437 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
1438 { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
1439 // Note that using floating-point division is likely to raise the
1440 // *Inexact* exception flag and thus appears like an invalid
1441 // "as-if" transformation. However, C++ doesn't specify how the
1442 // fpenv can be observed and points to C. C says that function
1443 // calls are assumed to potentially raise fp exceptions, unless
1444 // documented otherwise. Consequently, operator/, which is a
1445 // function call, may raise fp exceptions.
1446 /*const struct _CsrGuard
1447 {
1448 const unsigned _M_data = _mm_getcsr();
1449 _CsrGuard()
1450 {
1451 _mm_setcsr(0x9f80); // turn off FP exceptions and
1452 flush-to-zero
1453 }
1454 ~_CsrGuard() { _mm_setcsr(_M_data); }
1455 } __csr;*/
1456 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1457 constexpr size_t __n_intermediate
1458 = std::min(_Np, (__have_avx512f ? 64
1459 : __have_avx ? 32
1460 : 16)
1461 / sizeof(_Float));
1462 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1463 constexpr size_t __n_floatv
1464 = __div_roundup(_Np, __n_intermediate);
1465 using _R = __vector_type_t<_Tp, _Np>;
1466 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1467 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1468 _Abi::__make_padding_nonzero(__as_vector(__y)));
1469 return __call_with_n_evaluations<__n_floatv>(
1470 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1471 return __vector_convert<_R>(__quotients...);
1472 },
1473 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1474 -> _SimdWrapper<_Float, __n_intermediate>
1475 {
1476#if __RECIPROCAL_MATH__
1477 // If -freciprocal-math is active, using the `/` operator is
1478 // incorrect because it may be translated to an imprecise
1479 // multiplication with reciprocal. We need to use inline
1480 // assembly to force a real division.
1481 _FloatV __r;
1482 if constexpr (__have_avx) // -mno-sse2avx is irrelevant
1483 // because once -mavx is given, GCC
1484 // emits VEX encoded vdivp[sd]
1485 {
1486 if constexpr (sizeof(_Tp) == 4)
1487 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
1488 : "=x"(__r)
1489 : "x"(__xf[__i]), "x"(__yf[__i]));
1490 else
1491 asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
1492 : "=x"(__r)
1493 : "x"(__xf[__i]), "x"(__yf[__i]));
1494 }
1495 else
1496 {
1497 __r = __xf[__i];
1498 if constexpr (sizeof(_Tp) == 4)
1499 asm("divpd\t{%1, %0|%0, %1}"
1500 : "=x"(__r)
1501 : "x"(__yf[__i]));
1502 else
1503 asm("divps\t{%1, %0|%0, %1}"
1504 : "=x"(__r)
1505 : "x"(__yf[__i]));
1506 }
1507 return __r;
1508#else
1509 return __xf[__i] / __yf[__i];
1510#endif
1511 });
1512 }
1513 /* 64-bit int division is potentially optimizable via double division if
1514 * the value in __x is small enough and the conversion between
1515 * int<->double is efficient enough:
1516 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
1517 sizeof(_Tp) == 8)
1518 {
1519 if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1520 {
1521 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
1522 0xffe0'0000'0000'0000ull}))
1523 {
1524 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
1525 }
1526 }
1527 }
1528 */
1529 return _Base::_S_divides(__x, __y);
1530 }
1531#else
1532 using _Base::_S_divides;
1533#endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
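    // Note (illustrative only): the floating-point detour above is exact
    // because 8/16-bit integers are representable in float and 32-bit
    // integers in double, so each element is effectively computed as e.g.
    //   static_cast<_Tp>(double(__x[__i]) / double(__y[__i]))
    // (with padding elements of __y forced non-zero via
    // __make_padding_nonzero before the conversion).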
1534
1535 // }}}
1536 // _S_modulus {{{
1537 template <typename _Tp, size_t _Np>
1538 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1539 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1540 {
1541 if (__builtin_is_constant_evaluated()
1542 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1543 return _Base::_S_modulus(__x, __y);
1544 else
1545 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1546 }
1547
1548 // }}}
1549 // _S_bit_shift_left {{{
1550 // Notes on UB. C++2a [expr.shift] says:
1551 // -1- [...] The operands shall be of integral or unscoped enumeration type
1552 // and integral promotions are performed. The type of the result is that
1553 // of the promoted left operand. The behavior is undefined if the right
1554 // operand is negative, or greater than or equal to the width of the
1555 // promoted left operand.
1556 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1557 // 2^N, where N is the width of the type of the result.
1558 //
1559 // C++17 [expr.shift] says:
1560 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1561 // bits are zero-filled. If E1 has an unsigned type, the value of the
1562 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1563 // representable in the result type. Otherwise, if E1 has a signed type
1564 // and non-negative value, and E1 × 2^E2 is representable in the
1565 // corresponding unsigned type of the result type, then that value,
1566 // converted to the result type, is the resulting value; otherwise, the
1567 // behavior is undefined.
1568 //
1569 // Consequences:
1570 // With C++2a signed and unsigned types have the same UB
1571 // characteristics:
1572 // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1573 //
1574 // With C++17 there's little room for optimizations because the standard
1575 // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1576 // short and char shifts must assume shifts affect bits of neighboring
1577 // values.
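// Example: for _Up = unsigned char and __y == 2, the element value 0x81
// promotes to int, so 0x81 << 2 == 0x204 as a scalar expression, but the
// element-wise result must wrap to 0x04. The byte strategies below
// therefore shift in wider lanes and mask away the bits that crossed a
// byte boundary.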
1578 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1579 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1580 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1581 _S_bit_shift_left(_Tp __xx, int __y)
1582 {
1583 using _V = typename _TVT::type;
1584 using _Up = typename _TVT::value_type;
1585 _V __x = __xx;
1586 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1587 if (__builtin_is_constant_evaluated())
1588 return __x << __y;
1589#if __cplusplus > 201703
1590 // after C++17, signed shifts have no UB, and behave just like unsigned
1591 // shifts
1592 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1593 return __vector_bitcast<_Up>(
1594 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1595 __y));
1596#endif
1597 else if constexpr (sizeof(_Up) == 1)
1598 {
1599 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1600 if (__builtin_constant_p(__y))
1601 {
1602 if (__y == 0)
1603 return __x;
1604 else if (__y == 1)
1605 return __x + __x;
1606 else if (__y == 2)
1607 {
1608 __x = __x + __x;
1609 return __x + __x;
1610 }
1611 else if (__y > 2 && __y < 8)
1612 {
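// For 2 < __y < 8: shift the whole register in wider lanes and then clear
// the low __y bits of every byte (the bits shifted in from the neighboring
// byte). E.g. __y == 3 uses the per-byte mask 0xf8, either broadcast by the
// vector & below or replicated via * 0x01010101u in the scalar branch.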
1613 if constexpr (sizeof(__x) > sizeof(unsigned))
1614 {
1615 const _UChar __mask = 0xff << __y; // precomputed vector
1616 return __vector_bitcast<_Up>(
1617 __vector_bitcast<_UChar>(
1618 __vector_bitcast<unsigned>(__x) << __y)
1619 & __mask);
1620 }
1621 else
1622 {
1623 const unsigned __mask
1624 = (0xff & (0xff << __y)) * 0x01010101u;
1625 return reinterpret_cast<_V>(
1626 static_cast<__int_for_sizeof_t<_V>>(
1627 unsigned(
1628 reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1629 << __y)
1630 & __mask));
1631 }
1632 }
1633 else if (__y >= 8 && __y < 32)
1634 return _V();
1635 else
1636 __builtin_unreachable();
1637 }
1638 // general strategy in the following: use an sllv instead of sll
1639 // instruction, because it's 2 to 4 times faster:
1640 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1641 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1642 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1643 _mm256_set1_epi16(__y))));
1644 else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1645 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1646 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1647 _mm512_set1_epi16(__y))));
1648 else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1649 {
1650 const auto __shift = _mm512_set1_epi16(__y);
1651 return __vector_bitcast<_Up>(
1652 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1653 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1654 _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1655 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1656 }
1657 else if constexpr (__have_avx2 && sizeof(__x) == 32)
1658 {
1659#if 1
1660 const auto __shift = _mm_cvtsi32_si128(__y);
1661 auto __k
1662 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1663 __k |= _mm256_srli_epi16(__k, 8);
1664 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
1665 & __k);
1666#else
1667 const _Up __k = 0xff << __y;
1668 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1669 & __k;
1670#endif
1671 }
1672 else
1673 {
1674 const auto __shift = _mm_cvtsi32_si128(__y);
1675 auto __k
1676 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1677 __k |= _mm_srli_epi16(__k, 8);
1678 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1679 }
1680 }
1681 return __x << __y;
1682 }
1683
1684 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1685 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1686 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1687 {
1688 using _V = typename _TVT::type;
1689 using _Up = typename _TVT::value_type;
1690 _V __x = __xx;
1691 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1692 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1693 if (__builtin_is_constant_evaluated())
1694 return __x << __y;
1695#if __cplusplus > 201703
1696 // after C++17, signed shifts have no UB, and behave just like unsigned
1697 // shifts
1698 else if constexpr (is_signed_v<_Up>)
1699 return __vector_bitcast<_Up>(
1700 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1701 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1702#endif
1703 else if constexpr (sizeof(_Up) == 1)
1704 {
1705 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1706 return __vector_bitcast<_Up>(__concat(
1707 _mm512_cvtepi16_epi8(
1708 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1709 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1710 _mm512_cvtepi16_epi8(
1711 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1712 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1713 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1714 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1715 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1716 _mm512_cvtepu8_epi16(__iy))));
1717 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1718 return __intrin_bitcast<_V>(
1719 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1720 _mm_cvtepu8_epi16(__iy))));
1721 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1722 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1723 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1724 _mm256_cvtepu8_epi16(__iy))));
1725 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1726 return __intrin_bitcast<_V>(
1727 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1728 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1729 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1730 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1731 {
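// The three blends decode __y bit by bit: after the << 5 the mask's sign
// bit is bit 2 of __y, so the first blend conditionally applies << 4, the
// next (after doubling the mask) << 2, and the last << 1; lanes with
// __y > 7 are zeroed at the end. Roughly the per-element scalar logic:
//   r = y & 4 ? x << 4 : x;
//   r = y & 2 ? r << 2 : r;
//   r = y & 1 ? r << 1 : r;
//   return (y & 0xf8) == 0 ? r : 0;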
1732 auto __mask
1733 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1734 auto __x4
1735 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1736 __x4 &= char(0xf0);
1737 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1738 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1739 __mask += __mask;
1740 auto __x2
1741 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1742 __x2 &= char(0xfc);
1743 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1744 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1745 __mask += __mask;
1746 auto __x1 = __x + __x;
1747 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1748 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1749 return __x
1750 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1751 }
1752 else if constexpr (sizeof(__x) == 16)
1753 {
1754 auto __mask
1755 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1756 auto __x4
1757 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1758 __x4 &= char(0xf0);
1759 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1760 __mask += __mask;
1761 auto __x2
1762 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1763 __x2 &= char(0xfc);
1764 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1765 __mask += __mask;
1766 auto __x1 = __x + __x;
1767 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1768 return __x
1769 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1770 }
1771 else
1772 return __x << __y;
1773 }
1774 else if constexpr (sizeof(_Up) == 2)
1775 {
1776 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1777 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1778 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1779 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1780 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1781 return __vector_bitcast<_Up>(
1782 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1783 _mm512_castsi256_si512(__iy))));
1784 else if constexpr (sizeof __ix == 32 && __have_avx2)
1785 {
1786 const auto __ux = __vector_bitcast<unsigned>(__x);
1787 const auto __uy = __vector_bitcast<unsigned>(__y);
1788 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1789 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1790 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1791 }
1792 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1793 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1794 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1795 return __intrin_bitcast<_V>(
1796 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1797 _mm512_castsi128_si512(__iy))));
1798 else if constexpr (sizeof __ix == 16 && __have_avx2)
1799 {
1800 const auto __ux = __vector_bitcast<unsigned>(__ix);
1801 const auto __uy = __vector_bitcast<unsigned>(__iy);
1802 return __intrin_bitcast<_V>(_mm_blend_epi16(
1803 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1804 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1805 }
1806 else if constexpr (sizeof __ix == 16)
1807 {
1808 using _Float4 = __vector_type_t<float, 4>;
1809 using _Int4 = __vector_type_t<int, 4>;
1810 using _UInt4 = __vector_type_t<unsigned, 4>;
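// Without variable 16-bit shifts, build the per-element factor 1 << __y via
// the float representation: (__y + 127) << 23 is the bit pattern of the
// float 2^__y, which __vector_convert turns back into the integer 1 << __y.
// This is done once for the low and once for the high 16-bit half of every
// 32-bit lane, and then __x << __y == __x * (1 << __y).
// E.g. __y == 3: (3 + 127) << 23 == 0x4100'0000 == 8.0f -> 8.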
1811 const _UInt4 __yu
1812 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1813 return __x
1814 * __intrin_bitcast<_V>(
1815 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1816 reinterpret_cast<_Float4>(__yu << 23)))
1817 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1818 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1819 << 16));
1820 }
1821 else
1822 __assert_unreachable<_Tp>();
1823 }
1824 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1825 && !__have_avx2)
1826 // latency is suboptimal, but throughput is at full speedup
1827 return __intrin_bitcast<_V>(
1828 __vector_bitcast<unsigned>(__ix)
1829 * __vector_convert<__vector_type16_t<int>>(
1830 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1831 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1832 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1833 && !__have_avx2)
1834 {
1835 const auto __lo = _mm_sll_epi64(__ix, __iy);
1836 const auto __hi
1837 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1838 if constexpr (__have_sse4_1)
1839 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1840 else
1841 return __vector_bitcast<_Up>(
1842 _mm_move_sd(__vector_bitcast<double>(__hi),
1843 __vector_bitcast<double>(__lo)));
1844 }
1845 else
1846 return __x << __y;
1847 }
1848#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1849
1850 // }}}
1851 // _S_bit_shift_right {{{
1852#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1853 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1854 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1855 _S_bit_shift_right(_Tp __xx, int __y)
1856 {
1857 using _V = typename _TVT::type;
1858 using _Up = typename _TVT::value_type;
1859 _V __x = __xx;
1860 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1861 if (__builtin_is_constant_evaluated())
1862 return __x >> __y;
1863 else if (__builtin_constant_p(__y) && is_unsigned_v<_Up>
1864 && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1866 return _V();
1867 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1868 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1869 & _Up(0xff >> __y);
1870 //}}}
1871 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
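// Handle the two bytes of each 16-bit lane separately: the high byte is
// shifted arithmetically by __y + 8 and moved back up with << 8 (its low
// byte is zero and gets filled by the second half), while the low byte is
// first moved into the high position (<< 8) so the arithmetic shift sees
// its sign bit, shifted by __y, and brought back down with a logical >> 8.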
1872 return __intrin_bitcast<_V>(
1873 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1874 >> (__y + 8))
1875 << 8)
1876 | (__vector_bitcast<_UShort>(
1877 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1878 >> __y)
1879 >> 8));
1880 //}}}
1881 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1882 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1883 {
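// SSE/AVX2 have no 64-bit arithmetic right shift, so emulate it with
// 32-bit arithmetic shifts: for __y > 32 the high half is pure sign fill
// and the low half is the original high half shifted by __y - 32; for
// __y <= 32 the logical 64-bit shift supplies the value bits and an
// arithmetic shift of the isolated sign bit supplies the sign extension.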
1884 if (__y > 32)
1885 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1886 & _Up(0xffff'ffff'0000'0000ull))
1887 | __vector_bitcast<_Up>(
1888 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1889 >> 32)
1890 >> (__y - 32));
1891 else
1892 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1893 >> __y)
1894 | __vector_bitcast<_Up>(
1895 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1896 >> __y);
1897 }
1898 //}}}
1899 else
1900 return __x >> __y;
1901 }
1902
1903 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1904 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1905 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1906 {
1907 using _V = typename _TVT::type;
1908 using _Up = typename _TVT::value_type;
1909 _V __x = __xx;
1910 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1911 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1912 if (__builtin_is_constant_evaluated()
1913 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1914 return __x >> __y;
1915 else if constexpr (sizeof(_Up) == 1) //{{{
1916 {
1917 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1918 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1919 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1920 _mm_cvtepi8_epi16(__iy))
1921 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1922 _mm_cvtepu8_epi16(__iy))));
1923 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1924 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1925 is_signed_v<_Up>
1926 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1927 _mm256_cvtepi8_epi16(__iy))
1928 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1929 _mm256_cvtepu8_epi16(__iy))));
1930 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1931 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1932 is_signed_v<_Up>
1933 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1934 _mm512_cvtepi8_epi16(__iy))
1935 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1936 _mm512_cvtepu8_epi16(__iy))));
1937 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1938 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1939 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1940 0x5555'5555'5555'5555ull,
1941 _mm512_srav_epi16(
1942 _mm512_slli_epi16(__ix, 8),
1943 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1944 _mm512_set1_epi16(8)))));
1945 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1946 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1947 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1948 0x5555'5555'5555'5555ull,
1949 _mm512_srlv_epi16(
1950 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1951 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1952 /* This has better throughput but higher latency than the impl below
1953 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1954 is_unsigned_v<_Up>)
1955 {
1956 const auto __shorts = __to_intrin(_S_bit_shift_right(
1957 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1958 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1959 return __vector_bitcast<_Up>(
1960 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1961 }
1962 */
1963 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1964 // the following uses vpsr[al]vd, which requires AVX2
1965 if constexpr (is_signed_v<_Up>)
1966 {
1967 const auto r3 = __vector_bitcast<_UInt>(
1968 (__vector_bitcast<int>(__x)
1969 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1970 & 0xff000000u;
1971 const auto r2
1972 = __vector_bitcast<_UInt>(
1973 ((__vector_bitcast<int>(__x) << 8)
1974 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1975 & 0xff000000u;
1976 const auto r1
1977 = __vector_bitcast<_UInt>(
1978 ((__vector_bitcast<int>(__x) << 16)
1979 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1980 & 0xff000000u;
1981 const auto r0 = __vector_bitcast<_UInt>(
1982 (__vector_bitcast<int>(__x) << 24)
1983 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1984 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1985 | (r0 >> 24));
1986 }
1987 else
1988 {
1989 const auto r3 = (__vector_bitcast<_UInt>(__x)
1990 >> (__vector_bitcast<_UInt>(__y) >> 24))
1991 & 0xff000000u;
1992 const auto r2
1993 = ((__vector_bitcast<_UInt>(__x) << 8)
1994 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1995 & 0xff000000u;
1996 const auto r1
1997 = ((__vector_bitcast<_UInt>(__x) << 16)
1998 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1999 & 0xff000000u;
2000 const auto r0
2001 = (__vector_bitcast<_UInt>(__x) << 24)
2002 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
2003 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
2004 | (r0 >> 24));
2005 }
2006 else if constexpr (__have_sse4_1
2007 && is_unsigned_v<_Up> && sizeof(__x) > 2)
2008 {
2009 auto __x128 = __vector_bitcast<_Up>(__ix);
2010 auto __mask
2011 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
2012 auto __x4 = __vector_bitcast<_Up>(
2013 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
2014 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2015 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
2016 __mask += __mask;
2017 auto __x2 = __vector_bitcast<_Up>(
2018 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
2019 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2020 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
2021 __mask += __mask;
2022 auto __x1 = __vector_bitcast<_Up>(
2023 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
2024 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2025 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
2026 return __intrin_bitcast<_V>(
2027 __x128
2028 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2029 == 0)); // y > 7 nulls the result
2030 }
2031 else if constexpr (__have_sse4_1
2032 && is_signed_v<_Up> && sizeof(__x) > 2)
2033 {
2034 auto __mask = __vector_bitcast<_UChar>(
2035 __vector_bitcast<_UShort>(__iy) << 5);
2036 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2037 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
2038 };
2039 auto __xh = __vector_bitcast<short>(__ix);
2040 auto __xl = __vector_bitcast<short>(__ix) << 8;
2041 auto __xh4 = __xh >> 4;
2042 auto __xl4 = __xl >> 4;
2043 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2044 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
2045 __xl = __vector_bitcast<short>(
2046 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2047 __to_intrin(__xl4)));
2048 __mask += __mask;
2049 auto __xh2 = __xh >> 2;
2050 auto __xl2 = __xl >> 2;
2051 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2052 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2053 __xl = __vector_bitcast<short>(
2054 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2055 __to_intrin(__xl2)));
2056 __mask += __mask;
2057 auto __xh1 = __xh >> 1;
2058 auto __xl1 = __xl >> 1;
2059 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2060 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2061 __xl = __vector_bitcast<short>(
2062 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2063 __to_intrin(__xl1)));
2064 return __intrin_bitcast<_V>(
2065 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2066 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2067 >> 8))
2068 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2069 == 0)); // y > 7 nulls the result
2070 }
2071 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2072 {
2073 auto __mask
2074 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2075 auto __x4 = __vector_bitcast<_Up>(
2076 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2077 __x = __mask > 0x7f ? __x4 : __x;
2078 __mask += __mask;
2079 auto __x2 = __vector_bitcast<_Up>(
2080 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2081 __x = __mask > 0x7f ? __x2 : __x;
2082 __mask += __mask;
2083 auto __x1 = __vector_bitcast<_Up>(
2084 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2085 __x = __mask > 0x7f ? __x1 : __x;
2086 return __x
2087 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2088 }
2089 else if constexpr (sizeof(__x) > 2) // signed SSE2
2090 {
2091 static_assert(is_signed_v<_Up>);
2092 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2093 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2094 auto __xh = __vector_bitcast<short>(__x);
2095 auto __xl = __vector_bitcast<short>(__x) << 8;
2096 auto __xh4 = __xh >> 4;
2097 auto __xl4 = __xl >> 4;
2098 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2099 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2100 __maskh += __maskh;
2101 __maskl += __maskl;
2102 auto __xh2 = __xh >> 2;
2103 auto __xl2 = __xl >> 2;
2104 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2105 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2106 __maskh += __maskh;
2107 __maskl += __maskl;
2108 auto __xh1 = __xh >> 1;
2109 auto __xl1 = __xl >> 1;
2110 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2111 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2112 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2113 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2114 >> 8);
2115 return __x
2116 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2117 }
2118 else
2119 return __x >> __y;
2120 } //}}}
2121 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2122 {
2123 [[maybe_unused]] auto __blend_0xaa
2124 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2125 if constexpr (sizeof(__a) == 16)
2126 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2127 0xaa);
2128 else if constexpr (sizeof(__a) == 32)
2129 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2130 0xaa);
2131 else if constexpr (sizeof(__a) == 64)
2132 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2133 __to_intrin(__b));
2134 else
2135 __assert_unreachable<decltype(__a)>();
2136 };
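// The AVX2 branches below have no variable 16-bit shift, so they shift the
// even (low) and odd (high) 16-bit lanes with two variable 32-bit shifts
// and interleave the results via __blend_0xaa; the signed variant
// pre-shifts the low lanes left by 16 so the arithmetic 32-bit shift sees
// the right sign bit.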
2137 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2138 return __intrin_bitcast<_V>(is_signed_v<_Up>
2139 ? _mm_srav_epi16(__ix, __iy)
2140 : _mm_srlv_epi16(__ix, __iy));
2141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2142 return __vector_bitcast<_Up>(is_signed_v<_Up>
2143 ? _mm256_srav_epi16(__ix, __iy)
2144 : _mm256_srlv_epi16(__ix, __iy));
2145 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2146 return __vector_bitcast<_Up>(is_signed_v<_Up>
2147 ? _mm512_srav_epi16(__ix, __iy)
2148 : _mm512_srlv_epi16(__ix, __iy));
2149 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2150 return __intrin_bitcast<_V>(
2151 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2152 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2153 >> 16,
2154 __vector_bitcast<int>(__ix)
2155 >> (__vector_bitcast<int>(__iy) >> 16)));
2156 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2157 return __intrin_bitcast<_V>(
2158 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2159 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2160 __vector_bitcast<_UInt>(__ix)
2161 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2162 else if constexpr (__have_sse4_1)
2163 {
2164 auto __mask = __vector_bitcast<_UShort>(__iy);
2165 auto __x128 = __vector_bitcast<_Up>(__ix);
2166 //__mask *= 0x0808;
2167 __mask = (__mask << 3) | (__mask << 11);
2168 // do __x128 = 0 where __y[4] is set
2169 __x128 = __vector_bitcast<_Up>(
2170 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2171 __to_intrin(__mask)));
2172 // do __x128 =>> 8 where __y[3] is set
2173 __x128 = __vector_bitcast<_Up>(
2174 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2175 __to_intrin(__mask += __mask)));
2176 // do __x128 =>> 4 where __y[2] is set
2177 __x128 = __vector_bitcast<_Up>(
2178 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2179 __to_intrin(__mask += __mask)));
2180 // do __x128 =>> 2 where __y[1] is set
2181 __x128 = __vector_bitcast<_Up>(
2182 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2183 __to_intrin(__mask += __mask)));
2184 // do __x128 =>> 1 where __y[0] is set
2185 return __intrin_bitcast<_V>(
2186 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2187 __to_intrin(__mask + __mask)));
2188 }
2189 else
2190 {
2191 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2192 auto __x128 = __vector_bitcast<_Up>(__ix);
2193 auto __mask
2194 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2195 return __vector_bitcast<short>(__kk) < 0;
2196 };
2197 // do __x128 = 0 where __y[4] is set
2198 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2199 // do __x128 =>> 8 where __y[3] is set
2200 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2201 // do __x128 =>> 4 where __y[2] is set
2202 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2203 // do __x128 =>> 2 where __y[1] is set
2204 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2205 // do __x128 =>> 1 where __y[0] is set
2206 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2207 : __x128);
2208 }
2209 } //}}}
2210 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2211 {
2212 if constexpr (is_unsigned_v<_Up>)
2213 {
2214 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
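// E.g. __x == 100, __y == 2: the factor is 2^(31-2) == 0x2000'0000
// (built from the float exponent field, 0x4f00'0000 == 2^31 as a float,
// and cvttps), 100 * 0x2000'0000 == 0xc'8000'0000, and shifting that
// right by 31 yields 25 == 100 >> 2.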
2215 const __m128 __factor_f = reinterpret_cast<__m128>(
2216 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2217 const __m128i __factor
2218 = __builtin_constant_p(__factor_f)
2219 ? __to_intrin(
2220 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2221 __factor_f[2], __factor_f[3]))
2222 : _mm_cvttps_epi32(__factor_f);
2223 const auto __r02
2224 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2225 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2226 _mm_srli_si128(__factor, 4));
2227 if constexpr (__have_sse4_1)
2228 return __intrin_bitcast<_V>(
2229 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2230 else
2231 return __intrin_bitcast<_V>(
2232 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2233 }
2234 else
2235 {
2236 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2237 if constexpr (is_signed_v<_Up>)
2238 return _mm_sra_epi32(__a, __b);
2239 else
2240 return _mm_srl_epi32(__a, __b);
2241 };
2242 const auto __r0
2243 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2244 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2245 const auto __r2
2246 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2247 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2248 if constexpr (__have_sse4_1)
2249 return __intrin_bitcast<_V>(
2250 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2251 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2252 else
2253 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2254 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2255 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2256 }
2257 } //}}}
2258 else
2259 return __x >> __y;
2260 }
2261#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2262
2263 // }}}
2264 // compares {{{
2265 // _S_equal_to {{{
2266 template <typename _Tp, size_t _Np>
2267 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2268 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2269 {
2270 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2271 {
2272 if (__builtin_is_constant_evaluated()
2273 || (__x._M_is_constprop() && __y._M_is_constprop()))
2274 return _MaskImpl::_S_to_bits(
2275 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2276
2277 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2278 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2279 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2280 if constexpr (is_floating_point_v<_Tp>)
2281 {
2282 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2283 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2284 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2285 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2286 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2287 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2288 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2289 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2290 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2291 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2292 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2293 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2294 else
2295 __assert_unreachable<_Tp>();
2296 }
2297 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2298 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2299 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2300 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2301 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2302 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2304 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2306 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2308 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2309 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2310 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2311 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2312 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2314 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2315 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2316 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2317 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2318 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2319 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2320 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2321 else
2322 __assert_unreachable<_Tp>();
2323 } // }}}
2324 else if (__builtin_is_constant_evaluated())
2325 return _Base::_S_equal_to(__x, __y);
2326 else if constexpr (sizeof(__x) == 8)
2327 {
2328 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2329 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2330 _MaskMember<_Tp> __r64{};
2331 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2332 return __r64;
2333 }
2334 else
2335 return _Base::_S_equal_to(__x, __y);
2336 }
2337
2338 // }}}
2339 // _S_not_equal_to {{{
2340 template <typename _Tp, size_t _Np>
2341 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2342 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2343 {
2344 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2345 {
2346 if (__builtin_is_constant_evaluated()
2347 || (__x._M_is_constprop() && __y._M_is_constprop()))
2348 return _MaskImpl::_S_to_bits(
2349 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2350
2351 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2352 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2353 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2354 if constexpr (is_floating_point_v<_Tp>)
2355 {
2356 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2357 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2358 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2359 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2360 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2361 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2362 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2363 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2364 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2365 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2366 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2367 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2368 else
2369 __assert_unreachable<_Tp>();
2370 }
2371 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2372 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2373 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2374 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2375 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2376 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2377 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2378 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2380 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2381 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2382 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2383 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2384 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2385 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2386 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2387 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2388 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2389 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2390 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2391 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2392 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2393 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2394 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2395 else
2396 __assert_unreachable<_Tp>();
2397 } // }}}
2398 else if (__builtin_is_constant_evaluated())
2399 return _Base::_S_not_equal_to(__x, __y);
2400 else if constexpr (sizeof(__x) == 8)
2401 {
2402 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2403 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2404 _MaskMember<_Tp> __r64{};
2405 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2406 return __r64;
2407 }
2408 else
2409 return _Base::_S_not_equal_to(__x, __y);
2410 }
2411
2412 // }}}
2413 // _S_less {{{
2414 template <typename _Tp, size_t _Np>
2415 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2416 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2417 {
2418 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2419 {
2420 if (__builtin_is_constant_evaluated()
2421 || (__x._M_is_constprop() && __y._M_is_constprop()))
2422 return _MaskImpl::_S_to_bits(
2423 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2424
2425 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2426 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2427 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2428 if constexpr (sizeof(__xi) == 64)
2429 {
2430 if constexpr (is_same_v<_Tp, float>)
2431 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2432 else if constexpr (is_same_v<_Tp, double>)
2433 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2435 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2436 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2437 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2438 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2439 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2440 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2441 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2443 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2444 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2445 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2446 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2447 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2448 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2449 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2450 else
2451 __assert_unreachable<_Tp>();
2452 }
2453 else if constexpr (sizeof(__xi) == 32)
2454 {
2455 if constexpr (is_same_v<_Tp, float>)
2456 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2457 else if constexpr (is_same_v<_Tp, double>)
2458 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2459 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2460 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2461 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2462 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2463 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2464 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2465 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2466 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2467 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2468 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2469 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2470 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2471 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2472 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2473 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2474 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2475 else
2476 __assert_unreachable<_Tp>();
2477 }
2478 else if constexpr (sizeof(__xi) == 16)
2479 {
2480 if constexpr (is_same_v<_Tp, float>)
2481 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2482 else if constexpr (is_same_v<_Tp, double>)
2483 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2484 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2485 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2486 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2487 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2488 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2489 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2490 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2491 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2492 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2493 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2494 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2495 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2496 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2497 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2498 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2499 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2500 else
2501 __assert_unreachable<_Tp>();
2502 }
2503 else
2504 __assert_unreachable<_Tp>();
2505 } // }}}
2506 else if (__builtin_is_constant_evaluated())
2507 return _Base::_S_less(__x, __y);
2508 else if constexpr (sizeof(__x) == 8)
2509 {
2510 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2511 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2512 _MaskMember<_Tp> __r64{};
2513 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2514 return __r64;
2515 }
2516 else
2517 return _Base::_S_less(__x, __y);
2518 }
2519
2520 // }}}
2521 // _S_less_equal {{{
2522 template <typename _Tp, size_t _Np>
2523 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2524 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2525 {
2526 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2527 {
2528 if (__builtin_is_constant_evaluated()
2529 || (__x._M_is_constprop() && __y._M_is_constprop()))
2530 return _MaskImpl::_S_to_bits(
2531 __as_wrapper<_Np>(__x._M_data <= __y._M_data));
2532
2533 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2534 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2535 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2536 if constexpr (sizeof(__xi) == 64)
2537 {
2538 if constexpr (is_same_v<_Tp, float>)
2539 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2540 else if constexpr (is_same_v<_Tp, double>)
2541 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2542 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2543 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2544 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2545 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2546 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2547 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2548 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2549 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2550 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2551 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2552 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2553 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2554 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2555 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2556 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2557 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2558 else
2559 __assert_unreachable<_Tp>();
2560 }
2561 else if constexpr (sizeof(__xi) == 32)
2562 {
2563 if constexpr (is_same_v<_Tp, float>)
2564 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2565 else if constexpr (is_same_v<_Tp, double>)
2566 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2567 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2568 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2569 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2570 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2571 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2572 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2573 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2574 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2575 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2576 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2577 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2578 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2579 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2580 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2581 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2582 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2583 else
2584 __assert_unreachable<_Tp>();
2585 }
2586 else if constexpr (sizeof(__xi) == 16)
2587 {
2588 if constexpr (is_same_v<_Tp, float>)
2589 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2590 else if constexpr (is_same_v<_Tp, double>)
2591 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2592 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2593 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2594 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2595 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2596 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2597 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2598 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2599 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2600 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2601 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2602 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2603 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2604 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2605 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2606 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2607 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2608 else
2609 __assert_unreachable<_Tp>();
2610 }
2611 else
2612 __assert_unreachable<_Tp>();
2613 } // }}}
2614 else if (__builtin_is_constant_evaluated())
2615 return _Base::_S_less_equal(__x, __y);
2616 else if constexpr (sizeof(__x) == 8)
2617 {
2618 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2619 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2620 _MaskMember<_Tp> __r64{};
2621 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2622 return __r64;
2623 }
2624 else
2625 return _Base::_S_less_equal(__x, __y);
2626 }
2627
2628 // }}} }}}
2629 // negation {{{
2630 template <typename _Tp, size_t _Np>
2631 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2632 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2633 {
2634 if constexpr (__is_avx512_abi<_Abi>())
2635 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2636 else
2637 return _Base::_S_negate(__x);
2638 }
2639
2640 // }}}
2641 // math {{{
2642 using _Base::_S_abs;
2643
2644 // _S_sqrt {{{
2645 template <typename _Tp, size_t _Np>
2646 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2647 _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2648 {
2649 if constexpr (__is_sse_ps<_Tp, _Np>())
2650 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2651 else if constexpr (__is_sse_pd<_Tp, _Np>())
2652 return _mm_sqrt_pd(__x);
2653 else if constexpr (__is_avx_ps<_Tp, _Np>())
2654 return _mm256_sqrt_ps(__x);
2655 else if constexpr (__is_avx_pd<_Tp, _Np>())
2656 return _mm256_sqrt_pd(__x);
2657 else if constexpr (__is_avx512_ps<_Tp, _Np>())
2658 return _mm512_sqrt_ps(__x);
2659 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2660 return _mm512_sqrt_pd(__x);
2661 else
2662 __assert_unreachable<_Tp>();
2663 }
2664
2665 // }}}
2666 // _S_ldexp {{{
2667 template <typename _Tp, size_t _Np>
2668 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2669 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2670 __fixed_size_storage_t<int, _Np> __exp)
2671 {
2672 if constexpr (__is_avx512_abi<_Abi>())
2673 {
2674 const auto __xi = __to_intrin(__x);
2675 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2676 __cvt;
2677 const auto __expi = __to_intrin(__cvt(__exp));
2678 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
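// The scalef instructions compute __x * 2^__exp per element, which for the
// integral exponents produced by __cvt is exactly ldexp; the zero-masking
// with __k1 merely keeps the padding elements at zero.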
2679 if constexpr (sizeof(__xi) == 16)
2680 {
2681 if constexpr (sizeof(_Tp) == 8)
2682 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2683 else
2684 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2685 }
2686 else if constexpr (sizeof(__xi) == 32)
2687 {
2688 if constexpr (sizeof(_Tp) == 8)
2689 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2690 else
2691 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2692 }
2693 else
2694 {
2695 static_assert(sizeof(__xi) == 64);
2696 if constexpr (sizeof(_Tp) == 8)
2697 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2698 else
2699 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2700 }
2701 }
2702 else
2703 return _Base::_S_ldexp(__x, __exp);
2704 }
2705
2706 // }}}
2707 // _S_trunc {{{
2708 template <typename _Tp, size_t _Np>
2709 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2710 _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2711 {
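// roundscale immediate 0x0b: keep 0 fraction bits, round toward zero, and
// suppress the inexact exception; the 0x3 immediate for the SSE4.1/AVX
// round intrinsics is the same rounding mode (_MM_FROUND_TO_ZERO).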
2712 if constexpr (__is_avx512_ps<_Tp, _Np>())
2713 return _mm512_roundscale_ps(__x, 0x0b);
2714 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2715 return _mm512_roundscale_pd(__x, 0x0b);
2716 else if constexpr (__is_avx_ps<_Tp, _Np>())
2717 return _mm256_round_ps(__x, 0x3);
2718 else if constexpr (__is_avx_pd<_Tp, _Np>())
2719 return _mm256_round_pd(__x, 0x3);
2720 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2721 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3));
2722 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2723 return _mm_round_pd(__x, 0x3);
2724 else if constexpr (__is_sse_ps<_Tp, _Np>())
2725 {
2726 auto __truncated
2727 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2728 const auto __no_fractional_values
2729 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2730 & 0x7f800000u)
2731 < 0x4b000000; // the exponent is so large that no mantissa bits
2732 // signify fractional values (0x3f8 + 23*8 =
2733 // 0x4b0)
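// i.e. if |__x| >= 2^23 == 8388608.f, __x is already integral (and cvttps
// would overflow), so it is returned unchanged below; NaN and inf also
// take that path because their exponent field is 0xff.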
2734 return __no_fractional_values ? __truncated : __to_intrin(__x);
2735 }
2736 else
2737 return _Base::_S_trunc(__x);
2738 }
2739
2740 // }}}
2741 // _S_round {{{
2742 template <typename _Tp, size_t _Np>
2743 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2744 _S_round(_SimdWrapper<_Tp, _Np> __x)
2745 {
2746 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2747 // from zero as required by std::round. Therefore this function is more
2748 // complicated.
2749 using _V = __vector_type_t<_Tp, _Np>;
2750 _V __truncated;
2751 if constexpr (__is_avx512_ps<_Tp, _Np>())
2752 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2753 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2754 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2755 else if constexpr (__is_avx_ps<_Tp, _Np>())
2756 __truncated = _mm256_round_ps(__x._M_data,
2757 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2758 else if constexpr (__is_avx_pd<_Tp, _Np>())
2759 __truncated = _mm256_round_pd(__x._M_data,
2760 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2761 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2762 __truncated = __auto_bitcast(
2763 _mm_round_ps(__to_intrin(__x),
2764 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2765 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2766 __truncated
2767 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2768 else if constexpr (__is_sse_ps<_Tp, _Np>())
2769 __truncated = __auto_bitcast(
2770 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2771 else
2772 return _Base::_S_round(__x);
2773
2774 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2775 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
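// Adding copysign(1, x) exactly when |x - truncated| >= .5 therefore
// rounds halfway cases away from zero as std::round requires, e.g.
// x == 2.5: truncated == 2, |x - truncated| == .5, result 2 + 1 == 3
// (rounding to nearest-even would have produced 2); x == -2.5 -> -3.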
2776
2777 const _V __rounded
2778 = __truncated
2779 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2780 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2781 : _V());
2782 if constexpr (__have_sse4_1)
2783 return __rounded;
2784 else // adjust for missing range in cvttps_epi32
2785 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2786 : __x._M_data;
2787 }
2788
2789 // }}}
2790 // _S_nearbyint {{{
2791 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2792 _GLIBCXX_SIMD_INTRINSIC static _Tp
2793 _S_nearbyint(_Tp __x) noexcept
2794 {
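// roundscale immediate 0x0c: keep 0 fraction bits, use the current rounding
// mode (MXCSR), and suppress the inexact exception -- exactly nearbyint;
// _S_rint below uses 0x04, the same but without exception suppression.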
2795 if constexpr (_TVT::template _S_is<float, 16>)
2796 return _mm512_roundscale_ps(__x, 0x0c);
2797 else if constexpr (_TVT::template _S_is<double, 8>)
2798 return _mm512_roundscale_pd(__x, 0x0c);
2799 else if constexpr (_TVT::template _S_is<float, 8>)
2800 return _mm256_round_ps(__x,
2801 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2802 else if constexpr (_TVT::template _S_is<double, 4>)
2803 return _mm256_round_pd(__x,
2804 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2805 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2806 return _mm_round_ps(__x,
2807 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2808 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2809 return _mm_round_pd(__x,
2810 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2811 else
2812 return _Base::_S_nearbyint(__x);
2813 }
2814
2815 // }}}
2816 // _S_rint {{{
2817 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2818 _GLIBCXX_SIMD_INTRINSIC static _Tp
2819 _S_rint(_Tp __x) noexcept
2820 {
2821 if constexpr (_TVT::template _S_is<float, 16>)
2822 return _mm512_roundscale_ps(__x, 0x04);
2823 else if constexpr (_TVT::template _S_is<double, 8>)
2824 return _mm512_roundscale_pd(__x, 0x04);
2825 else if constexpr (_TVT::template _S_is<float, 8>)
2826 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2827 else if constexpr (_TVT::template _S_is<double, 4>)
2828 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2829 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2830 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2831 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2832 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2833 else
2834 return _Base::_S_rint(__x);
2835 }
2836
2837 // }}}
2838 // _S_floor {{{
2839 template <typename _Tp, size_t _Np>
2840 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2841 _S_floor(_SimdWrapper<_Tp, _Np> __x)
2842 {
2843 if constexpr (__is_avx512_ps<_Tp, _Np>())
2844 return _mm512_roundscale_ps(__x, 0x09);
2845 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2846 return _mm512_roundscale_pd(__x, 0x09);
2847 else if constexpr (__is_avx_ps<_Tp, _Np>())
2848 return _mm256_round_ps(__x, 0x1);
2849 else if constexpr (__is_avx_pd<_Tp, _Np>())
2850 return _mm256_round_pd(__x, 0x1);
2851 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2852 return __auto_bitcast(_mm_floor_ps(__to_intrin(__x)));
2853 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2854 return _mm_floor_pd(__x);
2855 else
2856 return _Base::_S_floor(__x);
2857 }
2858
2859 // }}}
2860 // _S_ceil {{{
2861 template <typename _Tp, size_t _Np>
2862 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2863 _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2864 {
2865 if constexpr (__is_avx512_ps<_Tp, _Np>())
2866 return _mm512_roundscale_ps(__x, 0x0a);
2867 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2868 return _mm512_roundscale_pd(__x, 0x0a);
2869 else if constexpr (__is_avx_ps<_Tp, _Np>())
2870 return _mm256_round_ps(__x, 0x2);
2871 else if constexpr (__is_avx_pd<_Tp, _Np>())
2872 return _mm256_round_pd(__x, 0x2);
2873 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2874 return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x)));
2875 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2876 return _mm_ceil_pd(__x);
2877 else
2878 return _Base::_S_ceil(__x);
2879 }
2880
2881 // }}}
2882 // _S_signbit {{{
2883 template <typename _Tp, size_t _Np>
2884 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2885 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2886 {
2887 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2888 {
2889 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2890 return _mm512_movepi32_mask(
2891 __intrin_bitcast<__m512i>(__x._M_data));
2892 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2893 return _mm512_movepi64_mask(
2894 __intrin_bitcast<__m512i>(__x._M_data));
2895 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2896 return _mm256_movepi32_mask(
2897 __intrin_bitcast<__m256i>(__x._M_data));
2898 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2899 return _mm256_movepi64_mask(
2900 __intrin_bitcast<__m256i>(__x._M_data));
2901 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2902 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2903 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2904 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2905 }
2906 else if constexpr (__is_avx512_abi<_Abi>())
2907 {
2908 const auto __xi = __to_intrin(__x);
2909 [[maybe_unused]] constexpr auto __k1
2910 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2911 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2912 return _mm_movemask_ps(__xi);
2913 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2914 return _mm_movemask_pd(__xi);
2915 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2916 return _mm256_movemask_ps(__xi);
2917 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2918 return _mm256_movemask_pd(__xi);
2919 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2920 return _mm512_mask_cmplt_epi32_mask(
2921 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2922 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2923 return _mm512_mask_cmplt_epi64_mask(
2924 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2925 else
2926 __assert_unreachable<_Tp>();
2927 }
2928 else
2929 return _Base::_S_signbit(__x);
2930 /*{
2931 using _I = __int_for_sizeof_t<_Tp>;
2932 if constexpr (sizeof(__x) == 64)
2933 return _S_less(__vector_bitcast<_I>(__x), _I());
2934 else
2935 {
2936 const auto __xx = __vector_bitcast<_I>(__x._M_data);
2937 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2938 if constexpr ((sizeof(_Tp) == 4 &&
2939 (__have_avx2 || sizeof(__x) == 16)) ||
2940 __have_avx512vl)
2941 {
2942 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2943 }
2944 else if constexpr ((__have_avx2 ||
2945 (__have_ssse3 && sizeof(__x) == 16)))
2946 {
2947 return __vector_bitcast<_Tp>((__xx & __signmask) ==
2948 __signmask);
2949 }
2950 else
2951 { // SSE2/3 or AVX (w/o AVX2)
2952 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2953 return __vector_bitcast<_Tp>(
2954 __vector_bitcast<_Tp>(
2955 (__xx & __signmask) |
2956 __vector_bitcast<_I>(__one)) // -1 or 1
2957 != __one);
2958 }
2959 }
2960 }*/
2961 }
2962
2963 // }}}
2964 // _S_isnonzerovalue_mask {{{
2965 // (isnormal || issubnormal) == (!isinf && !isnan && !iszero)
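     // The vfpclassp[sd] immediate selects the categories that set a lane's
     // mask bit: 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +Inf, 0x10 -Inf,
     // 0x20 denormal, 0x40 negative, 0x80 SNaN. Thus 0x9f matches NaN, zero,
     // or Inf, and its knot is exactly "nonzero finite value". (0x99, 0x18,
     // 0xbf, and 0x26, used further down, select NaN/Inf, +/-Inf,
     // "not normal", and zero/denormal.) The non-DQ fallback multiplies by
     // Inf and by 0: both products are non-NaN exactly for nonzero finite
     // __x, so an ordered compare yields the wanted mask.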
2966 template <typename _Tp>
2967 _GLIBCXX_SIMD_INTRINSIC static auto
2968 _S_isnonzerovalue_mask(_Tp __x)
2969 {
2970 using _Traits = _VectorTraits<_Tp>;
2971 if constexpr (__have_avx512dq_vl)
2972 {
2973 if constexpr (_Traits::template _S_is<
2974 float, 2> || _Traits::template _S_is<float, 4>)
2975 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2976 else if constexpr (_Traits::template _S_is<float, 8>)
2977 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2978 else if constexpr (_Traits::template _S_is<float, 16>)
2979 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2980 else if constexpr (_Traits::template _S_is<double, 2>)
2981 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2982 else if constexpr (_Traits::template _S_is<double, 4>)
2983 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2984 else if constexpr (_Traits::template _S_is<double, 8>)
2985 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2986 else
2987 __assert_unreachable<_Tp>();
2988 }
2989 else
2990 {
2991 using _Up = typename _Traits::value_type;
2992 constexpr size_t _Np = _Traits::_S_full_size;
2993 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2994 const auto __b = __x * _Up(); // NaN if __x == inf
2995 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2996 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2997 _CMP_ORD_Q);
2998 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2999 return __mmask8(0xf
3000 & _mm512_cmp_ps_mask(__auto_bitcast(__a),
3001 __auto_bitcast(__b),
3002 _CMP_ORD_Q));
3003 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
3004 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3005 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
3006 return __mmask8(0x3
3007 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3008 __auto_bitcast(__b),
3009 _CMP_ORD_Q));
3010 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
3011 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3012 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
3013 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
3014 __auto_bitcast(__b),
3015 _CMP_ORD_Q));
3016 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
3017 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3018 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
3019 return __mmask8(0xf
3020 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3021 __auto_bitcast(__b),
3022 _CMP_ORD_Q));
3023 else if constexpr (__is_avx512_ps<_Up, _Np>())
3024 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3025 else if constexpr (__is_avx512_pd<_Up, _Np>())
3026 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3027 else
3028 __assert_unreachable<_Tp>();
3029 }
3030 }
3031
3032 // }}}
3033 // _S_isfinite {{{
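     // fpclass with 0x99 matches NaN and +/-Inf; computed under the implicit
     // element mask __k1 and XOR-ed with it, this flips exactly the
     // non-finite lanes off. The non-DQ branch uses that
     // (bits(x) & bits(Inf)) < bits(Inf) holds iff the exponent field is not
     // all-ones, i.e. iff x is finite.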
3034 template <typename _Tp, size_t _Np>
3035 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3036 _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
3037 {
3038 static_assert(is_floating_point_v<_Tp>);
3039#if !__FINITE_MATH_ONLY__
3040 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3041 {
3042 const auto __xi = __to_intrin(__x);
3043 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3044 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3045 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3046 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3047 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3048 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3049 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3050 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3051 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3052 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3053 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3054 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3055 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3056 }
3057 else if constexpr (__is_avx512_abi<_Abi>())
3058 {
3059 // if all exponent bits are set, __x is either inf or NaN
3060 using _I = __int_for_sizeof_t<_Tp>;
3061 const auto __inf = __vector_bitcast<_I>(
3062 __vector_broadcast<_Np>(__infinity_v<_Tp>));
3063 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3064 }
3065 else
3066#endif
3067 return _Base::_S_isfinite(__x);
3068 }
3069
3070 // }}}
3071 // _S_isinf {{{
3072 template <typename _Tp, size_t _Np>
3073 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3074 _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3075 {
3076#if !__FINITE_MATH_ONLY__
3077 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3078 {
3079 const auto __xi = __to_intrin(__x);
3080 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3081 return _mm512_fpclass_ps_mask(__xi, 0x18);
3082 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3083 return _mm512_fpclass_pd_mask(__xi, 0x18);
3084 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3085 return _mm256_fpclass_ps_mask(__xi, 0x18);
3086 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3087 return _mm256_fpclass_pd_mask(__xi, 0x18);
3088 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3089 return _mm_fpclass_ps_mask(__xi, 0x18);
3090 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3091 return _mm_fpclass_pd_mask(__xi, 0x18);
3092 else
3093 __assert_unreachable<_Tp>();
3094 }
3095 else if constexpr (__have_avx512dq_vl)
3096 {
3097 if constexpr (__is_sse_pd<_Tp, _Np>())
3098 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3099 else if constexpr (__is_avx_pd<_Tp, _Np>())
3100 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3101 else if constexpr (__is_sse_ps<_Tp, _Np>())
3102 return _mm_movm_epi32(
3103 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3104 else if constexpr (__is_avx_ps<_Tp, _Np>())
3105 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3106 else
3107 __assert_unreachable<_Tp>();
3108 }
3109 else
3110#endif
3111 return _Base::_S_isinf(__x);
3112 }
3113
3114 // }}}
3115 // _S_isnormal {{{
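     // fpclass mode 0xbf matches everything that is not normal (NaN, zero,
     // Inf, denormal); under -ffinite-math-only, 0x26 (zero or denormal)
     // suffices. The integer fallback checks __norm_min <= |x| (and
     // |x| < Inf unless finite math), relying on the bit patterns of
     // non-negative IEEE-754 values ordering like integers.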
3116 template <typename _Tp, size_t _Np>
3117 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3118 _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3119 {
3120#if __FINITE_MATH_ONLY__
3121 [[maybe_unused]] constexpr int __mode = 0x26;
3122#else
3123 [[maybe_unused]] constexpr int __mode = 0xbf;
3124#endif
3125 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3126 {
3127 const auto __xi = __to_intrin(__x);
3128 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3129 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3130 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3131 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3132 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3133 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3134 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3135 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3136 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3137 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3138 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3139 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3140 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3141 else
3142 __assert_unreachable<_Tp>();
3143 }
3144 else if constexpr (__have_avx512dq)
3145 {
3146 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3147 return _mm_movm_epi32(
3148 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3149 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3150 return _mm256_movm_epi32(
3151 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3152 else if constexpr (__is_avx512_ps<_Tp, _Np>())
3153 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3154 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3155 return _mm_movm_epi64(
3156 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3157 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3158 return _mm256_movm_epi64(
3159 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3160 else if constexpr (__is_avx512_pd<_Tp, _Np>())
3161 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3162 else
3163 __assert_unreachable<_Tp>();
3164 }
3165 else if constexpr (__is_avx512_abi<_Abi>())
3166 {
3167 using _I = __int_for_sizeof_t<_Tp>;
3168 const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3169 const auto minn = __vector_bitcast<_I>(
3170 __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3171#if __FINITE_MATH_ONLY__
3172 return _S_less_equal<_I, _Np>(minn, absn);
3173#else
3174 const auto infn
3175 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3176 return __and(_S_less_equal<_I, _Np>(minn, absn),
3177 _S_less<_I, _Np>(absn, infn));
3178#endif
3179 }
3180 else
3181 return _Base::_S_isnormal(__x);
3182 }
3183
3184 // }}}
3185 // _S_isnan {{{
3186 template <typename _Tp, size_t _Np>
3187 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3188 _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3189 { return _S_isunordered(__x, __x); }
3190
3191 // }}}
3192 // _S_isunordered {{{
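     // _CMP_UNORD_Q is true iff at least one operand is NaN ("quiet": QNaN
     // operands do not raise an invalid exception); _S_isnan above is
     // therefore simply _S_isunordered(__x, __x).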
3193 template <typename _Tp, size_t _Np>
3194 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3195 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3196 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3197 {
3198#if __FINITE_MATH_ONLY__
3199 return {}; // false
3200#else
3201 const auto __xi = __to_intrin(__x);
3202 const auto __yi = __to_intrin(__y);
3203 if constexpr (__is_avx512_abi<_Abi>())
3204 {
3205 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3206 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3207 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3208 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3209 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3210 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3211 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3212 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3213 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3214 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3215 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3216 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3217 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3218 }
3219 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3220 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3221 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3222 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3223 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3224 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3225 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3226 return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3227 else
3228 __assert_unreachable<_Tp>();
3229#endif
3230 }
3231
3232 // }}}
3233 // _S_isgreater {{{
3234 template <typename _Tp, size_t _Np>
3235 static constexpr _MaskMember<_Tp>
3236 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3237 {
3238 const auto __xi = __to_intrin(__x);
3239 const auto __yi = __to_intrin(__y);
3240 if constexpr (__is_avx512_abi<_Abi>())
3241 {
3242 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3243 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3244 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3245 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3246 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3247 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3248 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3249 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3250 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3251 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3252 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3253 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3254 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3255 else
3256 __assert_unreachable<_Tp>();
3257 }
3258 else if constexpr (__have_avx)
3259 {
3260 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3261 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3262 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3263 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3265 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3266 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3267 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3268 else
3269 __assert_unreachable<_Tp>();
3270 }
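          // SSE2 fallback for float: IEEE-754 values order like their
          // sign-magnitude bit patterns, so map each pattern to a monotonic
          // signed integer (negate the magnitude for negative inputs; +0 and
          // -0 both map to 0) and compare those, AND-ed with cmpord to drop
          // NaN lanes. For double (next branch) the two lanes are compared
          // with ucomigt instead and negated to 0/-1 lanes. The same pattern
          // recurs in _S_isgreaterequal, _S_isless, and _S_islessequal below.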
3271 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3272 && sizeof(_Tp) == 4)
3273 {
3274 const auto __xn = __vector_bitcast<int>(__xi);
3275 const auto __yn = __vector_bitcast<int>(__yi);
3276 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3277 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3278 return __auto_bitcast(
3279 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3280 }
3281 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3282 && sizeof(_Tp) == 8)
3283 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3284 -_mm_ucomigt_sd(__xi, __yi),
3285 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3286 _mm_unpackhi_pd(__yi, __yi))};
3287 else
3288 return _Base::_S_isgreater(__x, __y);
3289 }
3290
3291 // }}}
3292 // _S_isgreaterequal {{{
3293 template <typename _Tp, size_t _Np>
3294 static constexpr _MaskMember<_Tp>
3295 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3296 {
3297 const auto __xi = __to_intrin(__x);
3298 const auto __yi = __to_intrin(__y);
3299 if constexpr (__is_avx512_abi<_Abi>())
3300 {
3301 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3302 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3303 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3304 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3305 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3306 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3307 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3308 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3309 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3310 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3311 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3312 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3313 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3314 else
3315 __assert_unreachable<_Tp>();
3316 }
3317 else if constexpr (__have_avx)
3318 {
3319 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3320 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3321 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3322 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3323 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3324 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3325 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3326 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3327 else
3328 __assert_unreachable<_Tp>();
3329 }
3330 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3331 && sizeof(_Tp) == 4)
3332 {
3333 const auto __xn = __vector_bitcast<int>(__xi);
3334 const auto __yn = __vector_bitcast<int>(__yi);
3335 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3336 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3337 return __auto_bitcast(
3338 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3339 }
3340 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3341 && sizeof(_Tp) == 8)
3342 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3343 -_mm_ucomige_sd(__xi, __yi),
3344 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3345 _mm_unpackhi_pd(__yi, __yi))};
3346 else
3347 return _Base::_S_isgreaterequal(__x, __y);
3348 }
3349
3350 // }}}
3351 // _S_isless {{{
3352 template <typename _Tp, size_t _Np>
3353 static constexpr _MaskMember<_Tp>
3354 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3355 {
3356 const auto __xi = __to_intrin(__x);
3357 const auto __yi = __to_intrin(__y);
3358 if constexpr (__is_avx512_abi<_Abi>())
3359 {
3360 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3361 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3362 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3363 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3364 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3365 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3366 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3367 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3368 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3369 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3370 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3371 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3372 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3373 else
3374 __assert_unreachable<_Tp>();
3375 }
3376 else if constexpr (__have_avx)
3377 {
3378 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3379 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3380 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3381 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3382 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3383 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3384 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3385 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3386 else
3387 __assert_unreachable<_Tp>();
3388 }
3389 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3390 && sizeof(_Tp) == 4)
3391 {
3392 const auto __xn = __vector_bitcast<int>(__xi);
3393 const auto __yn = __vector_bitcast<int>(__yi);
3394 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3395 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3396 return __auto_bitcast(
3397 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3398 }
3399 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3400 && sizeof(_Tp) == 8)
3401 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3402 -_mm_ucomigt_sd(__yi, __xi),
3403 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3404 _mm_unpackhi_pd(__xi, __xi))};
3405 else
3406 return _Base::_S_isless(__x, __y);
3407 }
3408
3409 // }}}
3410 // _S_islessequal {{{
3411 template <typename _Tp, size_t _Np>
3412 static constexpr _MaskMember<_Tp>
3413 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3414 {
3415 const auto __xi = __to_intrin(__x);
3416 const auto __yi = __to_intrin(__y);
3417 if constexpr (__is_avx512_abi<_Abi>())
3418 {
3419 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3420 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3421 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3422 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3423 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3424 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3425 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3426 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3427 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3428 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3429 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3430 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3431 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3432 else
3433 __assert_unreachable<_Tp>();
3434 }
3435 else if constexpr (__have_avx)
3436 {
3437 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3438 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3439 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3440 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3441 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3442 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3443 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3444 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3445 else
3446 __assert_unreachable<_Tp>();
3447 }
3448 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3449 && sizeof(_Tp) == 4)
3450 {
3451 const auto __xn = __vector_bitcast<int>(__xi);
3452 const auto __yn = __vector_bitcast<int>(__yi);
3453 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3454 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3455 return __auto_bitcast(
3456 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3457 }
3458 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3459 && sizeof(_Tp) == 8)
3460 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3461 -_mm_ucomige_sd(__yi, __xi),
3462 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3463 _mm_unpackhi_pd(__xi, __xi))};
3464 else
3465 return _Base::_S_islessequal(__x, __y);
3466 }
3467
3468 // }}}
3469 // _S_islessgreater {{{
3470 template <typename _Tp, size_t _Np>
3471 static constexpr _MaskMember<_Tp>
3472 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3473 {
3474 const auto __xi = __to_intrin(__x);
3475 const auto __yi = __to_intrin(__y);
3476 if constexpr (__is_avx512_abi<_Abi>())
3477 {
3478 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3479 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3480 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3481 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3482 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3483 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3484 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3485 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3486 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3487 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3488 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3489 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3490 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3491 else
3492 __assert_unreachable<_Tp>();
3493 }
3494 else if constexpr (__have_avx)
3495 {
3496 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3497 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3498 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3499 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3500 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3501 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3502 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3503 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3504 else
3505 __assert_unreachable<_Tp>();
3506 }
3507 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3508 return __auto_bitcast(
3509 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3510 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3511 return __to_masktype(
3512 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3513 else
3514 __assert_unreachable<_Tp>();
3515 }
3516
3517 //}}} }}}
3518 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np>
3519 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
3520 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v)
3521 {
3522 if (__k._M_is_constprop_none_of())
3523 return __v;
3524 else if (__k._M_is_constprop_all_of())
3525 {
3526 auto __vv = _Base::_M_make_simd(__v);
3527 _Op<decltype(__vv)> __op;
3528 return __data(__op(__vv));
3529 }
3530 else if constexpr (__is_bitmask_v<decltype(__k)>
3531 && (is_same_v<_Op<void>, __increment<void>>
3532 || is_same_v<_Op<void>, __decrement<void>>))
3533 {
3534 // optimize masked unary increment and decrement as masked sub +/-1
3535 constexpr int __pm_one
3536 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1;
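          // x + 1 == x - (-1) and x - 1 == x - (+1), so both operations turn
          // into a masked subtraction of __pm_one; passing __v both as the
          // minuend and as the write-through operand of the *_mask builtins
          // leaves masked-off lanes unchanged.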
3537#ifdef __clang__
3538 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data;
3539#else // __clang__
3540 if constexpr (is_integral_v<_Tp>)
3541 {
3542 constexpr bool __lp64 = sizeof(long) == sizeof(long long);
3543 using _Ip = std::make_signed_t<_Tp>;
3544 using _Up = std::conditional_t<
3545 std::is_same_v<_Ip, long>,
3546 std::conditional_t<__lp64, long long, int>,
3547 std::conditional_t<
3548 std::is_same_v<_Ip, signed char>, char, _Ip>>;
3549 const auto __value = __vector_bitcast<_Up>(__v._M_data);
3550#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3551 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \
3552 return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \
3553 __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data))
3554 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512);
3555 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256);
3556 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128);
3557 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512);
3558 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256);
3559 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128);
3560 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512);
3561 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256);
3562 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128);
3563 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512);
3564 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256);
3565 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128);
3566#undef _GLIBCXX_SIMD_MASK_SUB
3567 }
3568 else
3569 {
3570#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3571 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \
3572 return __builtin_ia32_##_Instr##_mask( \
3573 __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \
3574 __k._M_data, _MM_FROUND_CUR_DIRECTION)
3575 _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512);
3576 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256);
3577 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128);
3578 _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512);
3579 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256);
3580 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128);
3581#undef _GLIBCXX_SIMD_MASK_SUB
3582 }
3583#endif // __clang__
3584 }
3585 else
3586 return _Base::template _S_masked_unary<_Op>(__k, __v);
3587 }
3588 };
3589
3590// }}}
3591// _MaskImplX86Mixin {{{
3592struct _MaskImplX86Mixin
3593{
3594 template <typename _Tp>
3595 using _TypeTag = _Tp*;
3596
3597 using _Base = _MaskImplBuiltinMixin;
3598
3599 // _S_to_maskvector(bool) {{{
3600 template <typename _Up, size_t _ToN = 1, typename _Tp>
3601 _GLIBCXX_SIMD_INTRINSIC static constexpr
3602 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3603 _S_to_maskvector(_Tp __x)
3604 {
3605 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3606 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3607 : __vector_type_t<_Up, _ToN>();
3608 }
3609
3610 // }}}
3611 // _S_to_maskvector(_SanitizedBitMask) {{{
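  // Expand a bitmask (one bit per element) into a vector mask (all-zeros or
  // all-ones per element). With AVX-512 this is a single vpmovm2{b,w,d,q};
  // otherwise the bits are spread via shuffles, multiplies, or compares.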
3612 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN>
3613 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3614 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3615 {
3616 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3617 using _UV = __vector_type_t<_Up, _ToN>;
3618 using _UI = __intrinsic_type_t<_Up, _ToN>;
3619 [[maybe_unused]] const auto __k = __x._M_to_bits();
3620 if constexpr (_Np == 1)
3621 return _S_to_maskvector<_Up, _ToN>(__k);
3622 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3623 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3624 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
3625 else if constexpr (sizeof(_Up) == 1)
3626 {
3627 if constexpr (sizeof(_UI) == 16)
3628 {
3629 if constexpr (__have_avx512bw_vl)
3630 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3631 else if constexpr (__have_avx512bw)
3632 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3633 else if constexpr (__have_avx512f)
3634 {
3635 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3636 auto __as16bits
3637 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3638 __hi256(__as32bits)));
3639 return __intrin_bitcast<_UV>(
3640 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3641 }
3642 else if constexpr (__have_ssse3)
3643 {
3644 const auto __bitmask = __to_intrin(
3645 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3646 8, 16, 32, 64, 128));
3647 return __intrin_bitcast<_UV>(
3648 __vector_bitcast<_Up>(
3649 _mm_shuffle_epi8(__to_intrin(
3650 __vector_type_t<_ULLong, 2>{__k}),
3651 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3652 1, 1, 1, 1, 1, 1, 1))
3653 & __bitmask)
3654 != 0);
3655 }
3656 // else fall through
3657 }
3658 else if constexpr (sizeof(_UI) == 32)
3659 {
3660 if constexpr (__have_avx512bw_vl)
3661 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3662 else if constexpr (__have_avx512bw)
3663 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3664 else if constexpr (__have_avx512f)
3665 {
3666 auto __as16bits = // 0 16 1 17 ... 15 31
3667 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3668 16)
3669 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3670 ~__m512i()),
3671 16);
3672 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3673 __lo256(__as16bits),
3674 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3675 );
3676 // deinterleave:
3677 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3678 __0_16_1_17, // 0 16 1 17 2 ...
3679 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3680 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3681 3, 5, 7, 9, 11, 13,
3682 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3683 // 0-3 8-11 16-19 24-27
3684 // 4-7 12-15 20-23 28-31
3685 }
3686 else if constexpr (__have_avx2)
3687 {
3688 const auto __bitmask
3689 = _mm256_broadcastsi128_si256(__to_intrin(
3690 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3691 4, 8, 16, 32, 64, 128)));
3692 return __vector_bitcast<_Up>(
3693 __vector_bitcast<_Up>(
3694 _mm256_shuffle_epi8(
3695 _mm256_broadcastsi128_si256(
3696 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3697 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3698 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3699 3, 3, 3, 3, 3, 3))
3700 & __bitmask)
3701 != 0);
3702 }
3703 // else fall through
3704 }
3705 else if constexpr (sizeof(_UI) == 64)
3706 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
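          // Without SSSE3/AVX2, up to 4 (resp. 7) mask bits are spread into
          // bytes with a multiplication: __k * 0x00204081 places bit i of __k
          // at bit position 8*i (the factor is 2^0 + 2^7 + 2^14 + 2^21), the
          // AND keeps only those bits, and * 0xff widens each 0x01 byte to
          // 0xff. E.g. __k = 0b0101: after the AND 0x00010001, times 0xff
          // gives bytes ff 00 ff 00 (low to high). The 7-bit variant uses
          // 0x40810204081 (2^0 + 2^7 + ... + 2^42) analogously.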
3707 if constexpr (std::min(_ToN, _Np) <= 4)
3708 {
3709 if constexpr (_Np > 7) // avoid overflow
3710 __x &= _SanitizedBitMask<_Np>(0x0f);
3711 const _UInt __char_mask
3712 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3713 * 0xff;
3714 _UV __r = {};
3715 __builtin_memcpy(&__r, &__char_mask,
3716 std::min(sizeof(__r), sizeof(__char_mask)));
3717 return __r;
3718 }
3719 else if constexpr (std::min(_ToN, _Np) <= 7)
3720 {
3721 if constexpr (_Np > 7) // avoid overflow
3722 __x &= _SanitizedBitMask<_Np>(0x7f);
3723 const _ULLong __char_mask
3724 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3725 * 0xff;
3726 _UV __r = {};
3727 __builtin_memcpy(&__r, &__char_mask,
3728 std::min(sizeof(__r), sizeof(__char_mask)));
3729 return __r;
3730 }
3731 }
3732 else if constexpr (sizeof(_Up) == 2)
3733 {
3734 if constexpr (sizeof(_UI) == 16)
3735 {
3736 if constexpr (__have_avx512bw_vl)
3737 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3738 else if constexpr (__have_avx512bw)
3739 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3740 else if constexpr (__have_avx512f)
3741 {
3742 __m256i __as32bits = {};
3743 if constexpr (__have_avx512vl)
3744 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3745 else
3746 __as32bits
3747 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3748 return __intrin_bitcast<_UV>(
3749 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3750 }
3751 // else fall through
3752 }
3753 else if constexpr (sizeof(_UI) == 32)
3754 {
3755 if constexpr (__have_avx512bw_vl)
3756 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3757 else if constexpr (__have_avx512bw)
3758 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3759 else if constexpr (__have_avx512f)
3760 {
3761 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3762 return __vector_bitcast<_Up>(
3763 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3764 __hi256(__as32bits))));
3765 }
3766 // else fall through
3767 }
3768 else if constexpr (sizeof(_UI) == 64)
3769 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3770 }
3771 else if constexpr (sizeof(_Up) == 4)
3772 {
3773 if constexpr (sizeof(_UI) == 16)
3774 {
3775 if constexpr (__have_avx512dq_vl)
3776 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3777 else if constexpr (__have_avx512dq)
3778 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3779 else if constexpr (__have_avx512vl)
3780 return __intrin_bitcast<_UV>(
3781 _mm_maskz_mov_epi32(__k, ~__m128i()));
3782 else if constexpr (__have_avx512f)
3783 return __intrin_bitcast<_UV>(
3784 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3785 // else fall through
3786 }
3787 else if constexpr (sizeof(_UI) == 32)
3788 {
3789 if constexpr (__have_avx512dq_vl)
3790 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3791 else if constexpr (__have_avx512dq)
3792 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3793 else if constexpr (__have_avx512vl)
3794 return __vector_bitcast<_Up>(
3795 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3796 else if constexpr (__have_avx512f)
3797 return __vector_bitcast<_Up>(
3798 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3799 // else fall through
3800 }
3801 else if constexpr (sizeof(_UI) == 64)
3802 return __vector_bitcast<_Up>(
3803 __have_avx512dq ? _mm512_movm_epi32(__k)
3804 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3805 }
3806 else if constexpr (sizeof(_Up) == 8)
3807 {
3808 if constexpr (sizeof(_UI) == 16)
3809 {
3810 if constexpr (__have_avx512dq_vl)
3811 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3812 else if constexpr (__have_avx512dq)
3813 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3814 else if constexpr (__have_avx512vl)
3815 return __vector_bitcast<_Up>(
3816 _mm_maskz_mov_epi64(__k, ~__m128i()));
3817 else if constexpr (__have_avx512f)
3818 return __vector_bitcast<_Up>(
3819 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3820 // else fall through
3821 }
3822 else if constexpr (sizeof(_UI) == 32)
3823 {
3824 if constexpr (__have_avx512dq_vl)
3825 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3826 else if constexpr (__have_avx512dq)
3827 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3828 else if constexpr (__have_avx512vl)
3829 return __vector_bitcast<_Up>(
3830 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3831 else if constexpr (__have_avx512f)
3832 return __vector_bitcast<_Up>(
3833 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3834 // else fall through
3835 }
3836 else if constexpr (sizeof(_UI) == 64)
3837 return __vector_bitcast<_Up>(
3838 __have_avx512dq ? _mm512_movm_epi64(__k)
3839 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3840 }
3841
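      // Generic fallback: broadcast the bit mask into every lane, AND each
      // lane with its own bit (1 << i), and compare against zero to obtain
      // 0 / all-ones lanes. If there are more elements than bits per element,
      // each lane first shifts its group of __bits_per_element mask bits into
      // range before selecting its bit.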
3842 using _UpUInt = make_unsigned_t<_Up>;
3843 using _V = __vector_type_t<_UpUInt, _ToN>;
3844 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3845 if constexpr (_ToN == 2)
3846 {
3847 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3848 }
3849 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3850 {
3851 if constexpr (sizeof(_Up) == 4)
3852 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3853 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3854 _mm256_castsi256_ps(_mm256_setr_epi32(
3855 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3856 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3857 else if constexpr (sizeof(_Up) == 8)
3858 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3859 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3860 _mm256_castsi256_pd(
3861 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3862 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3863 else
3864 __assert_unreachable<_Up>();
3865 }
3866 else if constexpr (__bits_per_element >= _ToN)
3867 {
3868 constexpr auto __bitmask
3869 = __generate_vector<_V>([](auto __i)
3870 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
3871 { return __i < _ToN ? 1ull << __i : 0; });
3872 const auto __bits
3873 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3874 if constexpr (__bits_per_element > _ToN)
3875 return __vector_bitcast<_Up>(__bits) > 0;
3876 else
3877 return __vector_bitcast<_Up>(__bits != 0);
3878 }
3879 else
3880 {
3881 const _V __tmp
3882 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3883 return static_cast<_UpUInt>(
3884 __k >> (__bits_per_element * (__i / __bits_per_element)));
3885 })
3886 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3887 return static_cast<_UpUInt>(1ull
3888 << (__i % __bits_per_element));
3889 }); // mask bit index
3890 return __intrin_bitcast<_UV>(__tmp != _V());
3891 }
3892 }
3893
3894 // }}}
3895 // _S_to_maskvector(_SimdWrapper) {{{
3896 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3897 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3898 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3899 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3900 {
3901 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3902 using _TW = _SimdWrapper<_Tp, _Np>;
3903 using _UW = _SimdWrapper<_Up, _ToN>;
3904 using _UI = __intrinsic_type_t<_Up, _ToN>;
3905 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3906 return _S_to_maskvector<_Up, _ToN>(
3907 _BitMask<_Np>(__x._M_data)._M_sanitized());
3908 // vector -> vector bitcast
3909 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3910 && sizeof(_TW) == sizeof(_UW))
3911 return __wrapper_bitcast<_Up, _ToN>(
3912 _ToN <= _Np
3913 ? __x
3914 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3915 else // vector -> vector {{{
3916 {
3917 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3918 {
3919 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3920 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3921 __vector_type_t<_Up, _ToN>>(
3922 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
3923 }
3924 using _To = __vector_type_t<_Up, _ToN>;
3925 [[maybe_unused]] constexpr size_t _FromN = _Np;
3926 constexpr int _FromBytes = sizeof(_Tp);
3927 constexpr int _ToBytes = sizeof(_Up);
3928 const auto __k = __x._M_data;
3929
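          // The vector -> vector conversions below rely on mask lanes being
          // all-zeros or all-ones: narrowing can use signed saturating packs
          // (0 -> 0, -1 -> -1), and widening duplicates lanes via
          // interleave/unpack or byte shuffles.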
3930 if constexpr (_FromBytes == _ToBytes)
3931 return __intrin_bitcast<_To>(__k);
3932 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3933 { // SSE -> SSE {{{
3934 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3935 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3936 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3937 {
3938 const auto __y
3939 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3940 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3941 }
3942 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3943 {
3944 auto __y
3945 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3946 auto __z
3947 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3948 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3949 }
3950 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3951 && __have_sse2)
3952 return __intrin_bitcast<_To>(
3953 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3954 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3955 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3956 _UI());
3957 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3958 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3959 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3960 {
3961 const auto __y
3962 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3963 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3964 }
3965 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3966 {
3967 if constexpr (__have_sse2 && !__have_ssse3)
3968 return __intrin_bitcast<_To>(_mm_packs_epi32(
3969 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3970 __m128i()));
3971 else
3972 return __intrin_bitcast<_To>(
3973 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3974 __vector_bitcast<_Up>(__k)));
3975 }
3976 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3977 return __intrin_bitcast<_To>(
3978 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3979 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3980 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3981 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3982 && __have_ssse3)
3983 return __intrin_bitcast<_To>(
3984 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3985 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3986 -1, -1, -1, -1, -1, -1, -1,
3987 -1)));
3988 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3989 {
3990 auto __y
3991 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3992 __y = _mm_packs_epi32(__y, __m128i());
3993 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3994 }
3995 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3996 && __have_ssse3)
3997 return __intrin_bitcast<_To>(
3998 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3999 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4000 -1, -1, -1, -1, -1, -1, -1,
4001 -1)));
4002 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4003 {
4004 const auto __y
4005 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
4006 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
4007 }
4008 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
4009 return __intrin_bitcast<_To>(
4010 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
4011 else
4012 __assert_unreachable<_Tp>();
4013 } // }}}
4014 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
4015 { // AVX -> AVX {{{
4016 if constexpr (_FromBytes == _ToBytes)
4017 __assert_unreachable<_Tp>();
4018 else if constexpr (_FromBytes == _ToBytes * 2)
4019 {
4020 const auto __y = __vector_bitcast<_LLong>(__k);
4021 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4022 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
4023 }
4024 else if constexpr (_FromBytes == _ToBytes * 4)
4025 {
4026 const auto __y = __vector_bitcast<_LLong>(__k);
4027 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4028 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4029 __m128i())));
4030 }
4031 else if constexpr (_FromBytes == _ToBytes * 8)
4032 {
4033 const auto __y = __vector_bitcast<_LLong>(__k);
4034 return __intrin_bitcast<_To>(
4035 _mm256_castsi128_si256(_mm_shuffle_epi8(
4036 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4037 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
4038 -1, -1, -1, -1, -1))));
4039 }
4040 else if constexpr (_FromBytes * 2 == _ToBytes)
4041 {
4042 auto __y = __xzyw(__to_intrin(__k));
4043 if constexpr (is_floating_point_v<
4044 _Tp> || (!__have_avx2 && _FromBytes == 4))
4045 {
4046 const auto __yy = __vector_bitcast<float>(__y);
4047 return __intrin_bitcast<_To>(
4048 _mm256_unpacklo_ps(__yy, __yy));
4049 }
4050 else
4051 return __intrin_bitcast<_To>(
4052 _mm256_unpacklo_epi8(__y, __y));
4053 }
4054 else if constexpr (_FromBytes * 4 == _ToBytes)
4055 {
4056 auto __y
4057 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4058 __lo128(__vector_bitcast<_LLong>(
4059 __k))); // drops 3/4 of input
4060 return __intrin_bitcast<_To>(
4061 __concat(_mm_unpacklo_epi16(__y, __y),
4062 _mm_unpackhi_epi16(__y, __y)));
4063 }
4064 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
4065 {
4066 auto __y
4067 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4068 __lo128(__vector_bitcast<_LLong>(
4069 __k))); // drops 3/4 of input
4070 __y
4071 = _mm_unpacklo_epi16(__y,
4072 __y); // drops another 1/2 => 7/8 total
4073 return __intrin_bitcast<_To>(
4074 __concat(_mm_unpacklo_epi32(__y, __y),
4075 _mm_unpackhi_epi32(__y, __y)));
4076 }
4077 else
4078 __assert_unreachable<_Tp>();
4079 } // }}}
4080 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
4081 { // SSE -> AVX {{{
4082 if constexpr (_FromBytes == _ToBytes)
4083 return __intrin_bitcast<_To>(
4084 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
4085 __zero_extend(__to_intrin(__k))));
4086 else if constexpr (_FromBytes * 2 == _ToBytes)
4087 { // keep all
4088 return __intrin_bitcast<_To>(
4089 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
4090 __vector_bitcast<_LLong>(__k)),
4091 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
4092 __vector_bitcast<_LLong>(__k))));
4093 }
4094 else if constexpr (_FromBytes * 4 == _ToBytes)
4095 {
4096 if constexpr (__have_avx2)
4097 {
4098 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4099 __concat(__vector_bitcast<_LLong>(__k),
4100 __vector_bitcast<_LLong>(__k)),
4101 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
4102 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
4103 6, 6, 7, 7, 7, 7)));
4104 }
4105 else
4106 {
4107 return __intrin_bitcast<_To>(__concat(
4108 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4109 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
4110 2, 2, 2, 2, 3, 3, 3, 3)),
4111 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4112 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
4113 6, 6, 6, 6, 7, 7, 7,
4114 7))));
4115 }
4116 }
4117 else if constexpr (_FromBytes * 8 == _ToBytes)
4118 {
4119 if constexpr (__have_avx2)
4120 {
4121 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4122 __concat(__vector_bitcast<_LLong>(__k),
4123 __vector_bitcast<_LLong>(__k)),
4124 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
4125 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4126 3, 3, 3, 3, 3, 3)));
4127 }
4128 else
4129 {
4130 return __intrin_bitcast<_To>(__concat(
4131 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4132 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4133 1, 1, 1, 1, 1, 1, 1, 1)),
4134 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4135 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4136 3, 3, 3, 3, 3, 3, 3,
4137 3))));
4138 }
4139 }
4140 else if constexpr (_FromBytes == _ToBytes * 2)
4141 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4142 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4143 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4144 {
4145 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4146 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4147 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4148 -1, -1, -1, -1, -1, -1, -1,
4149 -1)))));
4150 }
4151 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4152 {
4153 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4154 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4155 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4156 -1, -1, -1, -1, -1, -1, -1,
4157 -1)))));
4158 }
4159 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4160 {
4161 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4162 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4163 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4164 -1, -1, -1, -1, -1, -1, -1,
4165 -1, -1)))));
4166 }
4167 else
4168 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4169 } // }}}
4170 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4171 { // AVX -> SSE {{{
4172 if constexpr (_FromBytes == _ToBytes)
4173 { // keep low 1/2
4174 return __intrin_bitcast<_To>(__lo128(__k));
4175 }
4176 else if constexpr (_FromBytes == _ToBytes * 2)
4177 { // keep all
4178 auto __y = __vector_bitcast<_LLong>(__k);
4179 return __intrin_bitcast<_To>(
4180 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4181 }
4182 else if constexpr (_FromBytes == _ToBytes * 4)
4183 { // add 1/2 undef
4184 auto __y = __vector_bitcast<_LLong>(__k);
4185 return __intrin_bitcast<_To>(
4186 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4187 __m128i()));
4188 }
4189 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4190 { // add 3/4 undef
4191 auto __y = __vector_bitcast<_LLong>(__k);
4192 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4193 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4194 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4195 -1, -1, -1, -1)));
4196 }
4197 else if constexpr (_FromBytes * 2 == _ToBytes)
4198 { // keep low 1/4
4199 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4200 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4201 }
4202 else if constexpr (_FromBytes * 4 == _ToBytes)
4203 { // keep low 1/8
4204 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4205 __y = _mm_unpacklo_epi8(__y, __y);
4206 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4207 }
4208 else if constexpr (_FromBytes * 8 == _ToBytes)
4209 { // keep low 1/16
4210 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4211 __y = _mm_unpacklo_epi8(__y, __y);
4212 __y = _mm_unpacklo_epi8(__y, __y);
4213 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4214 }
4215 else
4216 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4217 } // }}}
4218 else
4219 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4220 /*
4221 if constexpr (_FromBytes > _ToBytes) {
4222 const _To __y = __vector_bitcast<_Up>(__k);
4223 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4224 constexpr int _Stride = _FromBytes / _ToBytes;
4225 return _To{__y[(_Is + 1) * _Stride - 1]...};
4226 }(make_index_sequence<std::min(_ToN, _FromN)>());
4227 } else {
4228 // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4229 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4230 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4231 // ...
4232 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4233 constexpr int __dup = _ToBytes / _FromBytes;
4234 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4235 }(make_index_sequence<_FromN>());
4236 }
4237 */
4238 } // }}}
4239 }
4240
4241 // }}}
4242 // _S_to_bits {{{
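  // Collapse a vector mask (0 / all-ones per element) into one bit per
  // element. AVX-512 provides vpmov{b,w,d,q}2m, which copies each lane's sign
  // bit into a mask register; otherwise movmsk{ps,pd,b} extracts the MSBs,
  // with 16-bit lanes first narrowed via packs_epi16 (saturation preserves
  // both 0 and -1).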
4243 template <typename _Tp, size_t _Np>
4244 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4245 _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4246 {
4247 if constexpr (is_same_v<_Tp, bool>)
4248 return _BitMask<_Np>(__x._M_data)._M_sanitized();
4249 else
4250 {
4251 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4252 if (__builtin_is_constant_evaluated()
4253 || __builtin_constant_p(__x._M_data))
4254 {
4255 const auto __bools = -__x._M_data;
4256 const _ULLong __k = __call_with_n_evaluations<_Np>(
4257 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4258 return (__bits | ...);
4259 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4260 return _ULLong(__bools[+__i]) << __i;
4261 });
4262 if (__builtin_is_constant_evaluated()
4263 || __builtin_constant_p(__k))
4264 return __k;
4265 }
4266 const auto __xi = __to_intrin(__x);
4267 if constexpr (sizeof(_Tp) == 1)
4268 if constexpr (sizeof(__xi) == 16)
4269 if constexpr (__have_avx512bw_vl)
4270 return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4271 else // implies SSE2
4272 return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4273 else if constexpr (sizeof(__xi) == 32)
4274 if constexpr (__have_avx512bw_vl)
4275 return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4276 else // implies AVX2
4277 return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4278 else // implies AVX512BW
4279 return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4280
4281 else if constexpr (sizeof(_Tp) == 2)
4282 if constexpr (sizeof(__xi) == 16)
4283 if constexpr (__have_avx512bw_vl)
4284 return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4285 else if constexpr (__have_avx512bw)
4286 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4287 else // implies SSE2
4288 return _BitMask<_Np>(
4289 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4290 else if constexpr (sizeof(__xi) == 32)
4291 if constexpr (__have_avx512bw_vl)
4292 return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4293 else if constexpr (__have_avx512bw)
4294 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4295 else // implies SSE2
4296 return _BitMask<_Np>(_mm_movemask_epi8(
4297 _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4298 else // implies AVX512BW
4299 return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4300
4301 else if constexpr (sizeof(_Tp) == 4)
4302 if constexpr (sizeof(__xi) == 16)
4303 if constexpr (__have_avx512dq_vl)
4304 return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4305 else if constexpr (__have_avx512vl)
4306 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4307 else if constexpr (__have_avx512dq)
4308 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4309 else if constexpr (__have_avx512f)
4310 return _BitMask<_Np>(
4311 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4312 else // implies SSE
4313 return _BitMask<_Np>(
4314 _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4315 else if constexpr (sizeof(__xi) == 32)
4316 if constexpr (__have_avx512dq_vl)
4317 return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4318 else if constexpr (__have_avx512dq)
4319 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4320 else if constexpr (__have_avx512vl)
4321 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4322 else if constexpr (__have_avx512f)
4323 return _BitMask<_Np>(
4324 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4325 else // implies AVX
4326 return _BitMask<_Np>(
4327 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
4328 else // sizeof(__xi) == 64 implies AVX-512F
4329 if constexpr (__have_avx512dq)
4330 return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
4331 else // implies AVX512F
4332 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4333
4334 else if constexpr (sizeof(_Tp) == 8)
4335 if constexpr (sizeof(__xi) == 16)
4336 if constexpr (__have_avx512dq_vl)
4337 return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4338 else if constexpr (__have_avx512dq)
4339 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4340 else if constexpr (__have_avx512vl)
4341 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4342 else if constexpr (__have_avx512f)
4343 return _BitMask<_Np>(
4344 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4345 else // implies SSE2
4346 return _BitMask<_Np>(
4347 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4348 else if constexpr (sizeof(__xi) == 32)
4349 if constexpr (__have_avx512dq_vl)
4350 return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4351 else if constexpr (__have_avx512dq)
4352 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4353 else if constexpr (__have_avx512vl)
4354 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4355 else if constexpr (__have_avx512f)
4356 return _BitMask<_Np>(
4357 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4358 else // implies AVX
4359 return _BitMask<_Np>(
4360 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
4361 else // implies AVX512F (64-byte vector)
4362 if constexpr (__have_avx512dq)
4363 return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
4364 else // implies AVX512F
4365 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4366
4367 else
4368 __assert_unreachable<_Tp>();
4369 }
4370 }
4371 // }}}
4372};
4373
4374// }}}
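// Editorial note (illustrative sketch, not part of the upstream header): the
// _S_to_bits paths above all rely on the invariant that a true vector-mask
// element has every bit set, so collecting one bit per element (its MSB) is
// enough. A minimal standalone equivalent for a 16 x int8 mask, assuming only
// SSE2 (helper name is hypothetical):
//
//   #include <emmintrin.h>
//   unsigned to_bits_epi8(__m128i __mask_m1_or_0)
//   { return _mm_movemask_epi8(__mask_m1_or_0); } // bit i == MSB of byte i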
4375// _MaskImplX86 {{{
4376template <typename _Abi>
4377 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4378 {
4379 using _MaskImplX86Mixin::_S_to_bits;
4380 using _MaskImplX86Mixin::_S_to_maskvector;
4381 using _MaskImplBuiltin<_Abi>::_S_convert;
4382
4383 // member types {{{
4384 template <typename _Tp>
4385 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4386
4387 template <typename _Tp>
4388 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4389
4390 template <typename _Tp>
4391 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4392
4393 using _Base = _MaskImplBuiltin<_Abi>;
4394
4395 // }}}
4396 // _S_broadcast {{{
4397 template <typename _Tp>
4398 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4399 _S_broadcast(bool __x)
4400 {
4401 if constexpr (__is_avx512_abi<_Abi>())
4402 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
4403 : _MaskMember<_Tp>();
4404 else
4405 return _Base::template _S_broadcast<_Tp>(__x);
4406 }
4407
4408 // }}}
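    // Editorial note (illustrative sketch): for an AVX-512 ABI the mask member
    // is a bitmask, so broadcasting true simply sets the low _S_size<_Tp>
    // bits. Hypothetical standalone equivalent for an 8-element mask:
    //
    //   unsigned char broadcast8(bool __x)
    //   { return __x ? 0xff : 0x00; } // _Abi::_S_masked drops excess bits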
4409 // _S_load {{{
4410 template <typename _Tp>
4411 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4412 _S_load(const bool* __mem)
4413 {
4414 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4415 if (__builtin_is_constant_evaluated())
4416 {
4417 if constexpr (__is_avx512_abi<_Abi>())
4418 {
4419 _MaskMember<_Tp> __r{};
4420 for (size_t __i = 0; __i < _S_size<_Tp>; ++__i)
4421 __r._M_data |= _ULLong(__mem[__i]) << __i;
4422 return __r;
4423 }
4424 else
4425 return _Base::template _S_load<_Tp>(__mem);
4426 }
4427 else if constexpr (__have_avx512bw)
4428 {
4429 const auto __to_vec_or_bits
4430 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
4431 if constexpr (__is_avx512_abi<_Abi>())
4432 return __bits;
4433 else
4434 return _S_to_maskvector<_Tp>(
4435 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
4436 };
4437
4438 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
4439 {
4440 __m128i __a = {};
4441 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4442 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4443 }
4444 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
4445 {
4446 __m256i __a = {};
4447 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4448 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4449 }
4450 else if constexpr (_S_size<_Tp> <= 64)
4451 {
4452 __m512i __a = {};
4453 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4454 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4455 }
4456 }
4457 else if constexpr (__is_avx512_abi<_Abi>())
4458 {
4459 if constexpr (_S_size<_Tp> <= 8)
4460 {
4461 __m128i __a = {};
4462 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4463 const auto __b = _mm512_cvtepi8_epi64(__a);
4464 return _mm512_test_epi64_mask(__b, __b);
4465 }
4466 else if constexpr (_S_size<_Tp> <= 16)
4467 {
4468 __m128i __a = {};
4469 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4470 const auto __b = _mm512_cvtepi8_epi32(__a);
4471 return _mm512_test_epi32_mask(__b, __b);
4472 }
4473 else if constexpr (_S_size<_Tp> <= 32)
4474 {
4475 __m128i __a = {};
4476 __builtin_memcpy(&__a, __mem, 16);
4477 const auto __b = _mm512_cvtepi8_epi32(__a);
4478 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
4479 const auto __c = _mm512_cvtepi8_epi32(__a);
4480 return _mm512_test_epi32_mask(__b, __b)
4481 | (_mm512_test_epi32_mask(__c, __c) << 16);
4482 }
4483 else if constexpr (_S_size<_Tp> <= 64)
4484 {
4485 __m128i __a = {};
4486 __builtin_memcpy(&__a, __mem, 16);
4487 const auto __b = _mm512_cvtepi8_epi32(__a);
4488 __builtin_memcpy(&__a, __mem + 16, 16);
4489 const auto __c = _mm512_cvtepi8_epi32(__a);
4490 if constexpr (_S_size<_Tp> <= 48)
4491 {
4492 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
4493 const auto __d = _mm512_cvtepi8_epi32(__a);
4494 return _mm512_test_epi32_mask(__b, __b)
4495 | (_mm512_test_epi32_mask(__c, __c) << 16)
4496 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4497 }
4498 else
4499 {
4500 __builtin_memcpy(&__a, __mem + 32, 16);
4501 const auto __d = _mm512_cvtepi8_epi32(__a);
4502 __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
4503 const auto __e = _mm512_cvtepi8_epi32(__a);
4504 return _mm512_test_epi32_mask(__b, __b)
4505 | (_mm512_test_epi32_mask(__c, __c) << 16)
4506 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
4507 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4508 }
4509 }
4510 else
4511 __assert_unreachable<_Tp>();
4512 }
4513 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
4514 return __vector_bitcast<_Tp>(
4515 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
4516 -int(__mem[1]), -int(__mem[1])});
4517 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
4518 {
4519 int __bool4 = 0;
4520 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
4521 const auto __k = __to_intrin(
4522 (__vector_broadcast<4>(__bool4)
4523 & __make_vector<int>(0x1, 0x100, 0x10000,
4524 _S_size<_Tp> == 4 ? 0x1000000 : 0))
4525 != 0);
4526 return __vector_bitcast<_Tp>(
4527 __concat(_mm_unpacklo_epi32(__k, __k),
4528 _mm_unpackhi_epi32(__k, __k)));
4529 }
4530 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
4531 {
4532 int __bools = 0;
4533 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
4534 if constexpr (__have_sse2)
4535 {
4536 __m128i __k = _mm_cvtsi32_si128(__bools);
4537 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4538 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4539 _mm_unpacklo_epi16(__k, __k));
4540 }
4541 else
4542 {
4543 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
4544 _mm_empty();
4545 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4546 _mm_cmpgt_ps(__k, __m128()));
4547 }
4548 }
4549 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
4550 {
4551 __m128i __k = {};
4552 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4553 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4554 return __vector_bitcast<_Tp>(
4555 __concat(_mm_unpacklo_epi16(__k, __k),
4556 _mm_unpackhi_epi16(__k, __k)));
4557 }
4558 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
4559 {
4560 __m128i __k = {};
4561 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4562 __k = _mm_cmpgt_epi8(__k, __m128i());
4563 if constexpr (_S_size<_Tp> <= 8)
4564 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4565 _mm_unpacklo_epi8(__k, __k));
4566 else
4567 return __concat(_mm_unpacklo_epi8(__k, __k),
4568 _mm_unpackhi_epi8(__k, __k));
4569 }
4570 else
4571 return _Base::template _S_load<_Tp>(__mem);
4572 }
4573
4574 // }}}
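    // Editorial note (illustrative sketch): the AVX512BW branch above loads
    // the bool bytes and applies VPTESTMB, which sets mask bit i iff
    // (a[i] & b[i]) != 0; testing the vector against itself therefore maps
    // every nonzero bool byte to a set bit. Standalone sketch for up to 16
    // bools, assuming AVX512BW+VL (names are hypothetical):
    //
    //   #include <immintrin.h>
    //   __mmask16 load_bools16(const bool* __mem, size_t __n) // __n <= 16
    //   {
    //     __m128i __a = _mm_setzero_si128();
    //     __builtin_memcpy(&__a, __mem, __n);
    //     return _mm_test_epi8_mask(__a, __a); // bit i == (__mem[i] != 0)
    //   }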
4575 // _S_from_bitmask{{{
4576 template <size_t _Np, typename _Tp>
4577 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
4578 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
4579 {
4580 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4581 if constexpr (__is_avx512_abi<_Abi>())
4582 return __bits._M_to_bits();
4583 else
4584 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
4585 }
4586
4587 // }}}
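    // Editorial note (illustrative sketch): without AVX-512 the bitmask must
    // be expanded back into a vector mask whose element i is all-ones iff bit
    // i is set, which is roughly what _S_to_maskvector does. Hypothetical
    // element-wise equivalent:
    //
    //   template <typename _Up, size_t _Np>
    //   __vector_type_t<_Up, _Np> bits_to_vecmask(unsigned long long __k)
    //   {
    //     __vector_type_t<_Up, _Np> __r = {};
    //     for (size_t __i = 0; __i < _Np; ++__i)
    //       __r[__i] = -_Up((__k >> __i) & 1); // 0 or all-ones
    //     return __r;
    //   }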
4588 // _S_masked_load {{{2
4589 template <typename _Tp, size_t _Np>
4590 static inline _SimdWrapper<_Tp, _Np>
4591 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
4592 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
4593 {
4594 if constexpr (__is_avx512_abi<_Abi>())
4595 {
4596 if constexpr (__have_avx512bw_vl)
4597 {
4598 if constexpr (_Np <= 16)
4599 {
4600 const auto __a
4601 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
4602 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
4603 }
4604 else if constexpr (_Np <= 32)
4605 {
4606 const auto __a
4607 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
4608 return (__merge & ~__mask)
4609 | _mm256_test_epi8_mask(__a, __a);
4610 }
4611 else if constexpr (_Np <= 64)
4612 {
4613 const auto __a
4614 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
4615 return (__merge & ~__mask)
4616 | _mm512_test_epi8_mask(__a, __a);
4617 }
4618 else
4619 __assert_unreachable<_Tp>();
4620 }
4621 else
4622 {
4623 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4624 __merge._M_set(__i, __mem[__i]);
4625 });
4626 return __merge;
4627 }
4628 }
4629 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
4630 {
4631 const auto __k = _S_to_bits(__mask)._M_to_bits();
4632 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
4633 _mm256_mask_loadu_epi8(__m256i(),
4634 __k, __mem));
4635 }
4636 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
4637 {
4638 const auto __k = _S_to_bits(__mask)._M_to_bits();
4639 __merge
4640 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
4641 __m128i(),
4642 _mm_mask_loadu_epi8(__m128i(), __k, __mem));
4643 }
4644 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
4645 {
4646 const auto __k = _S_to_bits(__mask)._M_to_bits();
4647 __merge = _mm256_mask_sub_epi16(
4648 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4649 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4650 }
4651 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
4652 {
4653 const auto __k = _S_to_bits(__mask)._M_to_bits();
4654 __merge = _mm_mask_sub_epi16(
4655 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4656 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4657 }
4658 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
4659 {
4660 const auto __k = _S_to_bits(__mask)._M_to_bits();
4661 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
4662 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4663 _mm256_cvtepi8_epi32(
4664 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4665 }
4666 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
4667 {
4668 const auto __k = _S_to_bits(__mask)._M_to_bits();
4669 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
4670 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4671 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4672 }
4673 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
4674 {
4675 const auto __k = _S_to_bits(__mask)._M_to_bits();
4676 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
4677 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4678 _mm256_cvtepi8_epi64(
4679 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4680 }
4681 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
4682 {
4683 const auto __k = _S_to_bits(__mask)._M_to_bits();
4684 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
4685 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4686 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4687 }
4688 else
4689 return _Base::_S_masked_load(__merge, __mask, __mem);
4690 return __merge;
4691 }
4692
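    // Editorial note (illustrative sketch): the "mask_sub" trick above turns a
    // loaded bool byte b (0 or 1) into a mask element by computing 0 - b under
    // the write-mask, so selected lanes become 0x00 or 0xff while unselected
    // lanes keep the old __merge value. Standalone sketch for 16 x int8,
    // assuming AVX512BW+VL (names are hypothetical):
    //
    //   #include <immintrin.h>
    //   __m128i masked_load_bools16(__m128i __merge, __mmask16 __k,
    //                               const bool* __mem)
    //   {
    //     const __m128i __b
    //       = _mm_mask_loadu_epi8(_mm_setzero_si128(), __k, __mem);
    //     return _mm_mask_sub_epi8(__merge, __k, _mm_setzero_si128(), __b);
    //   }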
4693 // _S_store {{{2
4694 template <typename _Tp, size_t _Np>
4695 _GLIBCXX_SIMD_INTRINSIC static constexpr void
4696 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
4697 {
4698 if (__builtin_is_constant_evaluated())
4699 _Base::_S_store(__v, __mem);
4700 else if constexpr (__is_avx512_abi<_Abi>())
4701 {
4702 if constexpr (__have_avx512bw_vl)
4703 _CommonImplX86::_S_store<_Np>(
4704 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4705 if constexpr (_Np <= 16)
4706 return _mm_maskz_set1_epi8(__data, 1);
4707 else if constexpr (_Np <= 32)
4708 return _mm256_maskz_set1_epi8(__data, 1);
4709 else
4710 return _mm512_maskz_set1_epi8(__data, 1);
4711 }(__v._M_data)),
4712 __mem);
4713 else if constexpr (_Np <= 8)
4714 _CommonImplX86::_S_store<_Np>(
4715 __vector_bitcast<char>(
4716#if defined __x86_64__
4717 __make_wrapper<_ULLong>(
4718 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
4719#else
4720 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
4721 _pdep_u32(__v._M_data >> 4,
4722 0x01010101U))
4723#endif
4724 ),
4725 __mem);
4726 else if constexpr (_Np <= 16)
4727 _mm512_mask_cvtepi32_storeu_epi8(
4728 __mem, 0xffffu >> (16 - _Np),
4729 _mm512_maskz_set1_epi32(__v._M_data, 1));
4730 else
4731 __assert_unreachable<_Tp>();
4732 }
4733 else if constexpr (__is_sse_abi<_Abi>()) //{{{
4734 {
4735 if constexpr (_Np == 2 && sizeof(_Tp) == 8)
4736 {
4737 const auto __k = __vector_bitcast<int>(__v);
4738 __mem[0] = -__k[1];
4739 __mem[1] = -__k[3];
4740 }
4741 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
4742 {
4743 if constexpr (__have_sse2)
4744 {
4745 const unsigned __bool4
4746 = __vector_bitcast<_UInt>(_mm_packs_epi16(
4747 _mm_packs_epi32(__intrin_bitcast<__m128i>(
4748 __to_intrin(__v)),
4749 __m128i()),
4750 __m128i()))[0]
4751 & 0x01010101u;
4752 __builtin_memcpy(__mem, &__bool4, _Np);
4753 }
4754 else if constexpr (__have_mmx)
4755 {
4756 const __m64 __k = _mm_cvtps_pi8(
4757 __and(__to_intrin(__v), _mm_set1_ps(1.f)));
4758 __builtin_memcpy(__mem, &__k, _Np);
4759 _mm_empty();
4760 }
4761 else
4762 return _Base::_S_store(__v, __mem);
4763 }
4764 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
4765 {
4766 _CommonImplX86::_S_store<_Np>(
4767 __vector_bitcast<char>(_mm_packs_epi16(
4768 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
4769 __m128i())),
4770 __mem);
4771 }
4772 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
4773 _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
4774 else
4775 __assert_unreachable<_Tp>();
4776 } // }}}
4777 else if constexpr (__is_avx_abi<_Abi>()) // {{{
4778 {
4779 if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
4780 {
4781 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4782 int __bool4{};
4783 if constexpr (__have_avx2)
4784 __bool4 = _mm256_movemask_epi8(__k);
4785 else
4786 __bool4 = (_mm_movemask_epi8(__lo128(__k))
4787 | (_mm_movemask_epi8(__hi128(__k)) << 16));
4788 __bool4 &= 0x01010101;
4789 __builtin_memcpy(__mem, &__bool4, _Np);
4790 }
4791 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
4792 {
4793 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4794 const auto __k2
4795 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
4796 15);
4797 const auto __k3
4798 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
4799 _CommonImplX86::_S_store<_Np>(__k3, __mem);
4800 }
4801 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
4802 {
4803 if constexpr (__have_avx2)
4804 {
4805 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
4806 const auto __bools = __vector_bitcast<char>(
4807 _mm_packs_epi16(__lo128(__x), __hi128(__x)));
4808 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4809 }
4810 else
4811 {
4812 const auto __bools
4813 = 1
4814 & __vector_bitcast<_UChar>(
4815 _mm_packs_epi16(__lo128(__to_intrin(__v)),
4816 __hi128(__to_intrin(__v))));
4817 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4818 }
4819 }
4820 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
4821 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
4822 else
4823 __assert_unreachable<_Tp>();
4824 } // }}}
4825 else
4826 __assert_unreachable<_Tp>();
4827 }
4828
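    // Editorial note (illustrative sketch): storing a mask as bools only
    // requires reducing each element to 0 or 1; the AVX512BW branch does that
    // with a zero-masked broadcast of 1. Standalone sketch for up to 16
    // elements, assuming AVX512BW+VL (names are hypothetical):
    //
    //   #include <immintrin.h>
    //   void store_bools16(__mmask16 __k, bool* __mem, size_t __n) // __n <= 16
    //   {
    //     const __m128i __ones = _mm_maskz_set1_epi8(__k, 1); // byte i = bit i
    //     __builtin_memcpy(__mem, &__ones, __n);
    //   }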
4829 // _S_masked_store {{{2
4830 template <typename _Tp, size_t _Np>
4831 static inline void
4832 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
4833 const _SimdWrapper<_Tp, _Np> __k) noexcept
4834 {
4835 if constexpr (__is_avx512_abi<_Abi>())
4836 {
4837 static_assert(is_same_v<_Tp, bool>);
4838 if constexpr (_Np <= 16 && __have_avx512bw_vl)
4839 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
4840 else if constexpr (_Np <= 16)
4841 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
4842 _mm512_maskz_set1_epi32(__v, 1));
4843 else if constexpr (_Np <= 32 && __have_avx512bw_vl)
4844 _mm256_mask_storeu_epi8(__mem, __k,
4845 _mm256_maskz_set1_epi8(__v, 1));
4846 else if constexpr (_Np <= 32 && __have_avx512bw)
4847 _mm256_mask_storeu_epi8(__mem, __k,
4848 __lo256(_mm512_maskz_set1_epi8(__v, 1)));
4849 else if constexpr (_Np <= 64 && __have_avx512bw)
4850 _mm512_mask_storeu_epi8(__mem, __k,
4851 _mm512_maskz_set1_epi8(__v, 1));
4852 else
4853 __assert_unreachable<_Tp>();
4854 }
4855 else
4856 _Base::_S_masked_store(__v, __mem, __k);
4857 }
4858
4859 // logical and bitwise operators {{{2
4860 template <typename _Tp, size_t _Np>
4861 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4862 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4863 {
4864 if constexpr (is_same_v<_Tp, bool>)
4865 {
4866 if (__builtin_is_constant_evaluated())
4867 return __x._M_data & __y._M_data;
4868 else if constexpr (__have_avx512dq && _Np <= 8)
4869 return _kand_mask8(__x._M_data, __y._M_data);
4870 else if constexpr (_Np <= 16)
4871 return _kand_mask16(__x._M_data, __y._M_data);
4872 else if constexpr (__have_avx512bw && _Np <= 32)
4873 return _kand_mask32(__x._M_data, __y._M_data);
4874 else if constexpr (__have_avx512bw && _Np <= 64)
4875 return _kand_mask64(__x._M_data, __y._M_data);
4876 else
4877 __assert_unreachable<_Tp>();
4878 }
4879 else
4880 return _Base::_S_logical_and(__x, __y);
4881 }
4882
4883 template <typename _Tp, size_t _Np>
4884 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4885 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4886 {
4887 if constexpr (is_same_v<_Tp, bool>)
4888 {
4889 if (__builtin_is_constant_evaluated())
4890 return __x._M_data | __y._M_data;
4891 else if constexpr (__have_avx512dq && _Np <= 8)
4892 return _kor_mask8(__x._M_data, __y._M_data);
4893 else if constexpr (_Np <= 16)
4894 return _kor_mask16(__x._M_data, __y._M_data);
4895 else if constexpr (__have_avx512bw && _Np <= 32)
4896 return _kor_mask32(__x._M_data, __y._M_data);
4897 else if constexpr (__have_avx512bw && _Np <= 64)
4898 return _kor_mask64(__x._M_data, __y._M_data);
4899 else
4900 __assert_unreachable<_Tp>();
4901 }
4902 else
4903 return _Base::_S_logical_or(__x, __y);
4904 }
4905
4906 template <typename _Tp, size_t _Np>
4907 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4908 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
4909 {
4910 if constexpr (is_same_v<_Tp, bool>)
4911 {
4912 if (__builtin_is_constant_evaluated())
4913 return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>();
4914 else if constexpr (__have_avx512dq && _Np <= 8)
4915 return _kandn_mask8(__x._M_data,
4916 _Abi::template __implicit_mask_n<_Np>());
4917 else if constexpr (_Np <= 16)
4918 return _kandn_mask16(__x._M_data,
4919 _Abi::template __implicit_mask_n<_Np>());
4920 else if constexpr (__have_avx512bw && _Np <= 32)
4921 return _kandn_mask32(__x._M_data,
4922 _Abi::template __implicit_mask_n<_Np>());
4923 else if constexpr (__have_avx512bw && _Np <= 64)
4924 return _kandn_mask64(__x._M_data,
4925 _Abi::template __implicit_mask_n<_Np>());
4926 else
4927 __assert_unreachable<_Tp>();
4928 }
4929 else
4930 return _Base::_S_bit_not(__x);
4931 }
4932
4933 template <typename _Tp, size_t _Np>
4934 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4935 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4936 {
4937 if constexpr (is_same_v<_Tp, bool>)
4938 {
4939 if (__builtin_is_constant_evaluated())
4940 return __x._M_data & __y._M_data;
4941 else if constexpr (__have_avx512dq && _Np <= 8)
4942 return _kand_mask8(__x._M_data, __y._M_data);
4943 else if constexpr (_Np <= 16)
4944 return _kand_mask16(__x._M_data, __y._M_data);
4945 else if constexpr (__have_avx512bw && _Np <= 32)
4946 return _kand_mask32(__x._M_data, __y._M_data);
4947 else if constexpr (__have_avx512bw && _Np <= 64)
4948 return _kand_mask64(__x._M_data, __y._M_data);
4949 else
4950 __assert_unreachable<_Tp>();
4951 }
4952 else
4953 return _Base::_S_bit_and(__x, __y);
4954 }
4955
4956 template <typename _Tp, size_t _Np>
4957 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4958 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4959 {
4960 if constexpr (is_same_v<_Tp, bool>)
4961 {
4962 if (__builtin_is_constant_evaluated())
4963 return __x._M_data | __y._M_data;
4964 else if constexpr (__have_avx512dq && _Np <= 8)
4965 return _kor_mask8(__x._M_data, __y._M_data);
4966 else if constexpr (_Np <= 16)
4967 return _kor_mask16(__x._M_data, __y._M_data);
4968 else if constexpr (__have_avx512bw && _Np <= 32)
4969 return _kor_mask32(__x._M_data, __y._M_data);
4970 else if constexpr (__have_avx512bw && _Np <= 64)
4971 return _kor_mask64(__x._M_data, __y._M_data);
4972 else
4973 __assert_unreachable<_Tp>();
4974 }
4975 else
4976 return _Base::_S_bit_or(__x, __y);
4977 }
4978
4979 template <typename _Tp, size_t _Np>
4980 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4981 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4982 {
4983 if constexpr (is_same_v<_Tp, bool>)
4984 {
4985 if (__builtin_is_constant_evaluated())
4986 return __x._M_data ^ __y._M_data;
4987 else if constexpr (__have_avx512dq && _Np <= 8)
4988 return _kxor_mask8(__x._M_data, __y._M_data);
4989 else if constexpr (_Np <= 16)
4990 return _kxor_mask16(__x._M_data, __y._M_data);
4991 else if constexpr (__have_avx512bw && _Np <= 32)
4992 return _kxor_mask32(__x._M_data, __y._M_data);
4993 else if constexpr (__have_avx512bw && _Np <= 64)
4994 return _kxor_mask64(__x._M_data, __y._M_data);
4995 else
4996 __assert_unreachable<_Tp>();
4997 }
4998 else
4999 return _Base::_S_bit_xor(__x, __y);
5000 }
5001
5002 //}}}2
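    // Editorial note (illustrative sketch): with an AVX-512 ABI the mask is
    // already a k-register bitmask, so the logical and bitwise operations
    // reduce to the _k*_mask* intrinsics (or plain integer arithmetic when
    // constant evaluated). E.g., assuming AVX512F, for 16-element masks
    // (helper name is hypothetical):
    //
    //   #include <immintrin.h>
    //   __mmask16 mask_and16(__mmask16 __a, __mmask16 __b)
    //   { return _kand_mask16(__a, __b); } // same value as (__a & __b)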
5003 // _S_masked_assign{{{
5004 template <size_t _Np>
5005 _GLIBCXX_SIMD_INTRINSIC static void
5006 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
5007 _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs)
5008 {
5009 __lhs._M_data
5010 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
5011 }
5012
5013 template <size_t _Np>
5014 _GLIBCXX_SIMD_INTRINSIC static void
5015 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
5016 _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
5017 {
5018 if (__rhs)
5019 __lhs._M_data = __k._M_data | __lhs._M_data;
5020 else
5021 __lhs._M_data = ~__k._M_data & __lhs._M_data;
5022 }
5023
5024 using _MaskImplBuiltin<_Abi>::_S_masked_assign;
5025
5026 //}}}
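    // Editorial note (illustrative sketch): for bitmask masks the masked
    // assignment is a plain bit blend that takes __rhs bits where __k is set
    // (helper name is hypothetical):
    //
    //   unsigned long long blend_bits(unsigned long long __k,
    //                                 unsigned long long __lhs,
    //                                 unsigned long long __rhs)
    //   { return (~__k & __lhs) | (__k & __rhs); }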
5027 // _S_all_of {{{
5028 template <typename _Tp>
5029 _GLIBCXX_SIMD_INTRINSIC static bool
5030 _S_all_of(simd_mask<_Tp, _Abi> __k)
5031 {
5032 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5033 {
5034 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5035 using _TI = __intrinsic_type_t<_Tp, _Np>;
5036 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5037 if constexpr (__have_sse4_1)
5038 {
5039 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5040 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5041 return 0 != __testc(__a, __b);
5042 }
5043 else if constexpr (is_same_v<_Tp, float>)
5044 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
5045 == (1 << _Np) - 1;
5046 else if constexpr (is_same_v<_Tp, double>)
5047 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
5048 == (1 << _Np) - 1;
5049 else
5050 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
5051 == (1 << (_Np * sizeof(_Tp))) - 1;
5052 }
5053 else if constexpr (__is_avx512_abi<_Abi>())
5054 {
5055 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
5056 const auto __kk = __k._M_data._M_data;
5057 if constexpr (sizeof(__kk) == 1)
5058 {
5059 if constexpr (__have_avx512dq)
5060 return _kortestc_mask8_u8(__kk, _Mask == 0xff
5061 ? __kk
5062 : __mmask8(~_Mask));
5063 else
5064 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
5065 }
5066 else if constexpr (sizeof(__kk) == 2)
5067 return _kortestc_mask16_u8(__kk, _Mask == 0xffff
5068 ? __kk
5069 : __mmask16(~_Mask));
5070 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
5071 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
5072 ? __kk
5073 : __mmask32(~_Mask));
5074 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
5075 return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
5076 ? __kk
5077 : __mmask64(~_Mask));
5078 else
5079 __assert_unreachable<_Tp>();
5080 }
5081 }
5082
5083 // }}}
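    // Editorial note (illustrative sketch): PTEST's "testc" result is 1 iff
    // every bit set in the second operand is also set in the first; with the
    // second operand equal to the ABI's implicit mask (all-ones in each
    // participating lane) that is exactly all_of. Standalone sketch for a
    // full 16-byte mask, assuming SSE4.1 (helper name is hypothetical):
    //
    //   #include <smmintrin.h>
    //   bool all_of_epi8(__m128i __m)
    //   { return _mm_test_all_ones(__m); } // testc against an all-ones vector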
5084 // _S_any_of {{{
5085 template <typename _Tp>
5086 _GLIBCXX_SIMD_INTRINSIC static bool
5087 _S_any_of(simd_mask<_Tp, _Abi> __k)
5088 {
5089 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5090 {
5091 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5092 using _TI = __intrinsic_type_t<_Tp, _Np>;
5093 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5094 if constexpr (__have_sse4_1)
5095 {
5096 if constexpr (_Abi::template _S_is_partial<
5097 _Tp> || sizeof(__k) < 16)
5098 {
5099 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5100 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5101 return 0 == __testz(__a, __b);
5102 }
5103 else
5104 return 0 == __testz(__a, __a);
5105 }
5106 else if constexpr (is_same_v<_Tp, float>)
5107 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
5108 else if constexpr (is_same_v<_Tp, double>)
5109 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
5110 else
5111 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
5112 != 0;
5113 }
5114 else if constexpr (__is_avx512_abi<_Abi>())
5115 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5116 != 0;
5117 }
5118
5119 // }}}
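    // Editorial note (illustrative sketch): PTEST's "testz" result is 1 iff
    // the AND of its operands is zero, so testing the mask against itself
    // answers "are all elements false?". Standalone sketch for a full 16-byte
    // mask, assuming SSE4.1 (names are hypothetical):
    //
    //   #include <smmintrin.h>
    //   bool any_of_epi8(__m128i __m)  { return !_mm_testz_si128(__m, __m); }
    //   bool none_of_epi8(__m128i __m) { return _mm_testz_si128(__m, __m); }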
5120 // _S_none_of {{{
5121 template <typename _Tp>
5122 _GLIBCXX_SIMD_INTRINSIC static bool
5123 _S_none_of(simd_mask<_Tp, _Abi> __k)
5124 {
5125 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5126 {
5127 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5128 using _TI = __intrinsic_type_t<_Tp, _Np>;
5129 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5130 if constexpr (__have_sse4_1)
5131 {
5132 if constexpr (_Abi::template _S_is_partial<
5133 _Tp> || sizeof(__k) < 16)
5134 {
5135 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5136 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5137 return 0 != __testz(__a, __b);
5138 }
5139 else
5140 return 0 != __testz(__a, __a);
5141 }
5142 else if constexpr (is_same_v<_Tp, float>)
5143 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5144 else if constexpr (is_same_v<_Tp, double>)
5145 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5146 else
5147 return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
5148 == 0;
5149 }
5150 else if constexpr (__is_avx512_abi<_Abi>())
5151 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5152 == 0;
5153 }
5154
5155 // }}}
5156 // _S_some_of {{{
5157 template <typename _Tp>
5158 _GLIBCXX_SIMD_INTRINSIC static bool
5159 _S_some_of(simd_mask<_Tp, _Abi> __k)
5160 {
5161 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5162 {
5163 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5164 using _TI = __intrinsic_type_t<_Tp, _Np>;
5165 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5166 if constexpr (__have_sse4_1)
5167 {
5168 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5169 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5170 return 0 != __testnzc(__a, __b);
5171 }
5172 else if constexpr (is_same_v<_Tp, float>)
5173 {
5174 constexpr int __allbits = (1 << _Np) - 1;
5175 const auto __tmp = _mm_movemask_ps(__a) & __allbits;
5176 return __tmp > 0 && __tmp < __allbits;
5177 }
5178 else if constexpr (is_same_v<_Tp, double>)
5179 {
5180 constexpr int __allbits = (1 << _Np) - 1;
5181 const auto __tmp = _mm_movemask_pd(__a) & __allbits;
5182 return __tmp > 0 && __tmp < __allbits;
5183 }
5184 else
5185 {
5186 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
5187 const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
5188 return __tmp > 0 && __tmp < __allbits;
5189 }
5190 }
5191 else if constexpr (__is_avx512_abi<_Abi>())
5192 return _S_any_of(__k) && !_S_all_of(__k);
5193 else
5194 __assert_unreachable<_Tp>();
5195 }
5196
5197 // }}}
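    // Editorial note (illustrative sketch): PTEST's "testnzc" result is 1 iff
    // the mask shares some but not all bits with the implicit mask, i.e.
    // some_of in a single instruction. Standalone sketch for a full 16-byte
    // mask, assuming SSE4.1 (helper name is hypothetical):
    //
    //   #include <smmintrin.h>
    //   bool some_of_epi8(__m128i __m)
    //   { return _mm_test_mix_ones_zeros(__m, _mm_set1_epi8(-1)); }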
5198 // _S_popcount {{{
5199 template <typename _Tp>
5200 _GLIBCXX_SIMD_INTRINSIC static int
5201 _S_popcount(simd_mask<_Tp, _Abi> __k)
5202 {
5203 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5204 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
5205 if constexpr (__is_avx512_abi<_Abi>())
5206 {
5207 if constexpr (_Np > 32)
5208 return __builtin_popcountll(__kk);
5209 else
5210 return __builtin_popcount(__kk);
5211 }
5212 else
5213 {
5214 if constexpr (__have_popcnt)
5215 {
5216 int __bits
5217 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
5218 const int __count = __builtin_popcount(__bits);
5219 return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
5220 }
5221 else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
5222 {
5223 const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
5224 return __mask - (__mask >> 1);
5225 }
5226 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
5227 {
5228 auto __x = -(__lo128(__kk) + __hi128(__kk));
5229 return __x[0] + __x[1];
5230 }
5231 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
5232 {
5233 if constexpr (__have_sse2)
5234 {
5235 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
5236 __x = _mm_add_epi32(
5237 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5238 __x = _mm_add_epi32(
5239 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
5240 return -_mm_cvtsi128_si32(__x);
5241 }
5242 else
5243 return __builtin_popcount(
5244 _mm_movemask_ps(__auto_bitcast(__kk)));
5245 }
5246 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
5247 {
5248 auto __x = __to_intrin(__kk);
5249 __x = _mm_add_epi16(__x,
5250 _mm_shuffle_epi32(__x,
5251 _MM_SHUFFLE(0, 1, 2, 3)));
5252 __x = _mm_add_epi16(
5253 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5254 __x = _mm_add_epi16(
5255 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
5256 return -short(_mm_extract_epi16(__x, 0));
5257 }
5258 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
5259 {
5260 auto __x = __to_intrin(__kk);
5261 __x = _mm_add_epi8(__x,
5262 _mm_shuffle_epi32(__x,
5263 _MM_SHUFFLE(0, 1, 2, 3)));
5264 __x = _mm_add_epi8(__x,
5265 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2,
5266 3)));
5267 __x = _mm_add_epi8(__x,
5268 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0,
5269 1)));
5270 auto __y = -__vector_bitcast<_UChar>(__x);
5271 if constexpr (__have_sse4_1)
5272 return __y[0] + __y[1];
5273 else
5274 {
5275 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
5276 return (__z & 0xff) + (__z >> 8);
5277 }
5278 }
5279 else if constexpr (sizeof(__kk) == 32)
5280 {
5281 // The following works only as long as the implementations above
5282 // use a summation
5283 using _I = __int_for_sizeof_t<_Tp>;
5284 const auto __as_int = __vector_bitcast<_I>(__kk);
5285 return _MaskImplX86<simd_abi::__sse>::_S_popcount(
5286 simd_mask<_I, simd_abi::__sse>(__private_init,
5287 __lo128(__as_int)
5288 + __hi128(__as_int)));
5289 }
5290 else
5291 __assert_unreachable<_Tp>();
5292 }
5293 }
5294
5295 // }}}
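    // Editorial note (illustrative sketch): without AVX-512, counting the
    // movemask bits counts set elements; PMOVMSKB produces sizeof(_Tp) bits
    // per integral element, hence the division above. Standalone sketch for
    // 8 x int16, assuming SSE2 and POPCNT (helper name is hypothetical):
    //
    //   #include <emmintrin.h>
    //   int popcount_epi16(__m128i __mask_m1_or_0)
    //   { return __builtin_popcount(_mm_movemask_epi8(__mask_m1_or_0)) / 2; }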
5296 // _S_find_first_set {{{
5297 template <typename _Tp>
5298 _GLIBCXX_SIMD_INTRINSIC static int
5299 _S_find_first_set(simd_mask<_Tp, _Abi> __k)
5300 {
5301 if constexpr (__is_avx512_abi<_Abi>())
5302 return std::__countr_zero(__k._M_data._M_data);
5303 else
5304 return _Base::_S_find_first_set(__k);
5305 }
5306
5307 // }}}
5308 // _S_find_last_set {{{
5309 template <typename _Tp>
5310 _GLIBCXX_SIMD_INTRINSIC static int
5311 _S_find_last_set(simd_mask<_Tp, _Abi> __k)
5312 {
5313 if constexpr (__is_avx512_abi<_Abi>())
5314 return std::__bit_width(__k._M_data._M_data) - 1;
5315 else
5316 return _Base::_S_find_last_set(__k);
5317 }
5318
5319 // }}}
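    // Editorial note (illustrative sketch): on an AVX-512 bitmask these
    // reductions are plain bit scans; for a nonzero 16-bit mask __k:
    //
    //   int __first = __builtin_ctz(__k);      // what std::__countr_zero gives
    //   int __last  = 31 - __builtin_clz(__k); // == std::__bit_width(__k) - 1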
5320 };
5321
5322// }}}
5323
5324_GLIBCXX_SIMD_END_NAMESPACE
5325#endif // __cplusplus >= 201703L
5326#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
5327
5328// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
5329
