1 | // Simd x86 specific implementations -*- C++ -*- |
2 | |
3 | // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ |
26 | #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ |
27 | |
28 | #if __cplusplus >= 201703L |
29 | |
30 | #if !_GLIBCXX_SIMD_X86INTRIN |
31 | #error \ |
32 | "simd_x86.h may only be included when MMX or SSE on x86(_64) are available" |
33 | #endif |
34 | |
35 | _GLIBCXX_SIMD_BEGIN_NAMESPACE |
36 | |
37 | // __to_masktype {{{ |
38 | // Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and |
39 | // __vector_type_t. |
40 | template <typename _Tp, size_t _Np> |
41 | _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> |
42 | __to_masktype(_SimdWrapper<_Tp, _Np> __x) |
43 | { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); } |
44 | |
45 | template <typename _TV, |
46 | typename _TVT |
47 | = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>, |
48 | typename _Up = __int_for_sizeof_t<typename _TVT::value_type>> |
49 | _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size> |
50 | __to_masktype(_TV __x) |
51 | { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); } |
52 | |
53 | // }}} |
54 | // __interleave128_lo {{{ |
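// Interleaves the low half of every 128-bit lane of __a and __b: per lane
// the result is {a0, b0, a1, b1, ...}, taken from the lane's lower elements.
// E.g. with 4 x float per 128-bit lane:
//   __interleave128_lo({1, 2, 3, 4}, {5, 6, 7, 8}) == {1, 5, 2, 6}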
55 | template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>, |
56 | typename _Trait = _VectorTraits<_Tp>> |
57 | _GLIBCXX_SIMD_INTRINSIC constexpr _Tp |
58 | __interleave128_lo(const _Ap& __av, const _Bp& __bv) |
59 | { |
60 | const _Tp __a(__av); |
61 | const _Tp __b(__bv); |
62 | if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2) |
63 | return _Tp{__a[0], __b[0]}; |
64 | else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4) |
65 | return _Tp{__a[0], __b[0], __a[1], __b[1]}; |
66 | else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8) |
67 | return _Tp{__a[0], __b[0], __a[1], __b[1], |
68 | __a[2], __b[2], __a[3], __b[3]}; |
69 | else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16) |
70 | return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], |
71 | __a[3], __b[3], __a[4], __b[4], __a[5], __b[5], |
72 | __a[6], __b[6], __a[7], __b[7]}; |
73 | else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4) |
74 | return _Tp{__a[0], __b[0], __a[2], __b[2]}; |
75 | else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8) |
76 | return _Tp{__a[0], __b[0], __a[1], __b[1], |
77 | __a[4], __b[4], __a[5], __b[5]}; |
78 | else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16) |
79 | return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], |
80 | __a[3], __b[3], __a[8], __b[8], __a[9], __b[9], |
81 | __a[10], __b[10], __a[11], __b[11]}; |
82 | else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32) |
83 | return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], |
84 | __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], |
85 | __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18], |
86 | __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21], |
87 | __a[22], __b[22], __a[23], __b[23]}; |
88 | else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8) |
89 | return _Tp{__a[0], __b[0], __a[2], __b[2], |
90 | __a[4], __b[4], __a[6], __b[6]}; |
91 | else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16) |
92 | return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4], |
93 | __a[5], __b[5], __a[8], __b[8], __a[9], __b[9], |
94 | __a[12], __b[12], __a[13], __b[13]}; |
95 | else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32) |
96 | return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], |
97 | __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], |
98 | __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18], |
99 | __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25], |
100 | __a[26], __b[26], __a[27], __b[27]}; |
101 | else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64) |
102 | return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], |
103 | __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], |
104 | __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18], |
105 | __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21], |
106 | __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33], |
107 | __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36], |
108 | __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48], |
109 | __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51], |
110 | __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55], |
111 | __b[55]}; |
112 | else |
113 | __assert_unreachable<_Tp>(); |
114 | } |
115 | |
116 | // }}} |
117 | // __is_zero{{{ |
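// Returns true iff every bit of __a is zero. Uses the (v)ptest/vtestps/
// vtestpd intrinsics where AVX or SSE4.1 is available; during constant
// evaluation it OR-reduces the vector (via 64-bit chunks) and compares
// against 0.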
118 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
119 | _GLIBCXX_SIMD_INTRINSIC constexpr bool |
120 | __is_zero(_Tp __a) |
121 | { |
122 | if (!__builtin_is_constant_evaluated()) |
123 | { |
124 | if constexpr (__have_avx) |
125 | { |
126 | if constexpr (_TVT::template _S_is<float, 8>) |
127 | return _mm256_testz_ps(__a, __a); |
128 | else if constexpr (_TVT::template _S_is<double, 4>) |
129 | return _mm256_testz_pd(__a, __a); |
130 | else if constexpr (sizeof(_Tp) == 32) |
131 | return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a)); |
132 | else if constexpr (_TVT::template _S_is<float>) |
133 | return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a)); |
134 | else if constexpr (_TVT::template _S_is<double, 2>) |
135 | return _mm_testz_pd(__a, __a); |
136 | else |
137 | return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a)); |
138 | } |
139 | else if constexpr (__have_sse4_1) |
140 | return _mm_testz_si128(__intrin_bitcast<__m128i>(__a), |
141 | __intrin_bitcast<__m128i>(__a)); |
142 | } |
143 | else if constexpr (sizeof(_Tp) <= 8) |
144 | return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0; |
145 | else |
146 | { |
147 | const auto __b = __vector_bitcast<_LLong>(__a); |
148 | if constexpr (sizeof(__b) == 16) |
149 | return (__b[0] | __b[1]) == 0; |
150 | else if constexpr (sizeof(__b) == 32) |
151 | return __is_zero(__lo128(__b) | __hi128(__b)); |
152 | else if constexpr (sizeof(__b) == 64) |
153 | return __is_zero(__lo256(__b) | __hi256(__b)); |
154 | else |
155 | __assert_unreachable<_Tp>(); |
156 | } |
157 | } |
158 | |
159 | // }}} |
160 | // __movemask{{{ |
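// Wraps the SSE/AVX movemask intrinsics: one result bit per float/double
// element, or one bit per byte for integer vectors (pmovmskb).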
161 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
162 | _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int |
163 | __movemask(_Tp __a) |
164 | { |
165 | if constexpr (sizeof(_Tp) == 32) |
166 | { |
167 | if constexpr (_TVT::template _S_is<float>) |
168 | return _mm256_movemask_ps(__to_intrin(__a)); |
169 | else if constexpr (_TVT::template _S_is<double>) |
170 | return _mm256_movemask_pd(__to_intrin(__a)); |
171 | else |
172 | return _mm256_movemask_epi8(__to_intrin(__a)); |
173 | } |
174 | else if constexpr (_TVT::template _S_is<float>) |
175 | return _mm_movemask_ps(__to_intrin(__a)); |
176 | else if constexpr (_TVT::template _S_is<double>) |
177 | return _mm_movemask_pd(__to_intrin(__a)); |
178 | else |
179 | return _mm_movemask_epi8(__to_intrin(__a)); |
180 | } |
181 | |
182 | // }}} |
183 | // __testz{{{ |
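// Returns a nonzero value iff (__a & __b) has no bit set. Maps to
// (v)ptest/vtestps/vtestpd where available; without SSE4.1 it is emulated
// via __movemask, and in constant evaluation via __is_zero.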
184 | template <typename _TI, typename _TVT = _VectorTraits<_TI>> |
185 | _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int |
186 | __testz(_TI __a, _TI __b) |
187 | { |
188 | static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type, |
189 | _TVT::_S_full_size>>); |
190 | if (!__builtin_is_constant_evaluated()) |
191 | { |
192 | if constexpr (sizeof(_TI) == 32) |
193 | { |
194 | if constexpr (_TVT::template _S_is<float>) |
195 | return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b)); |
196 | else if constexpr (_TVT::template _S_is<double>) |
197 | return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b)); |
198 | else |
199 | return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b)); |
200 | } |
201 | else if constexpr (_TVT::template _S_is<float> && __have_avx) |
202 | return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b)); |
203 | else if constexpr (_TVT::template _S_is<double> && __have_avx) |
204 | return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b)); |
205 | else if constexpr (__have_sse4_1) |
206 | return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), |
207 | __intrin_bitcast<__m128i>(__to_intrin(__b))); |
208 | else |
209 | return __movemask(0 == __and(__a, __b)) != 0; |
210 | } |
211 | else |
212 | return __is_zero(__and(__a, __b)); |
213 | } |
214 | |
215 | // }}} |
216 | // __testc{{{ |
217 | // requires SSE4.1 or above |
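// Returns a nonzero value iff __andnot(__a, __b) == 0, i.e. every bit set
// in __b is also set in __a (the ptest "carry" condition).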
218 | template <typename _TI, typename _TVT = _VectorTraits<_TI>> |
219 | _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int |
220 | __testc(_TI __a, _TI __b) |
221 | { |
222 | static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type, |
223 | _TVT::_S_full_size>>); |
224 | if (__builtin_is_constant_evaluated()) |
225 | return __is_zero(__andnot(__a, __b)); |
226 | |
227 | if constexpr (sizeof(_TI) == 32) |
228 | { |
229 | if constexpr (_TVT::template _S_is<float>) |
230 | return _mm256_testc_ps(__a, __b); |
231 | else if constexpr (_TVT::template _S_is<double>) |
232 | return _mm256_testc_pd(__a, __b); |
233 | else |
234 | return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b)); |
235 | } |
236 | else if constexpr (_TVT::template _S_is<float> && __have_avx) |
237 | return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b)); |
238 | else if constexpr (_TVT::template _S_is<double> && __have_avx) |
239 | return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b)); |
240 | else |
241 | { |
242 | static_assert(is_same_v<_TI, _TI> && __have_sse4_1); |
243 | return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), |
244 | __intrin_bitcast<__m128i>(__to_intrin(__b))); |
245 | } |
246 | } |
247 | |
248 | // }}} |
249 | // __testnzc{{{ |
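// Returns a nonzero value iff neither (__a & __b) nor __andnot(__a, __b) is
// all-zero (the ptest ZF=0 && CF=0 condition). Without SSE4.1 it is emulated
// via __movemask on elementwise comparisons against 0.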
250 | template <typename _TI, typename _TVT = _VectorTraits<_TI>> |
251 | _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int |
252 | __testnzc(_TI __a, _TI __b) |
253 | { |
254 | static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type, |
255 | _TVT::_S_full_size>>); |
256 | if (!__builtin_is_constant_evaluated()) |
257 | { |
258 | if constexpr (sizeof(_TI) == 32) |
259 | { |
260 | if constexpr (_TVT::template _S_is<float>) |
261 | return _mm256_testnzc_ps(__a, __b); |
262 | else if constexpr (_TVT::template _S_is<double>) |
263 | return _mm256_testnzc_pd(__a, __b); |
264 | else |
265 | return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b)); |
266 | } |
267 | else if constexpr (_TVT::template _S_is<float> && __have_avx) |
268 | return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b)); |
269 | else if constexpr (_TVT::template _S_is<double> && __have_avx) |
270 | return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b)); |
271 | else if constexpr (__have_sse4_1) |
272 | return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), |
273 | __intrin_bitcast<__m128i>(__to_intrin(__b))); |
274 | else |
275 | return __movemask(0 == __and(__a, __b)) == 0 |
276 | && __movemask(0 == __andnot(__a, __b)) == 0; |
277 | } |
278 | else |
279 | return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b))); |
280 | } |
281 | |
282 | // }}} |
283 | // __xzyw{{{ |
284 | // shuffles the complete vector, swapping the inner two quarters. Often useful |
285 | // for AVX for fixing up a shuffle result. |
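// E.g. a 4-element vector {a, b, c, d} becomes {a, c, b, d}; an 8-element
// vector {a, b, c, d, e, f, g, h} becomes {a, b, e, f, c, d, g, h}.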
286 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
287 | _GLIBCXX_SIMD_INTRINSIC _Tp |
288 | __xzyw(_Tp __a) |
289 | { |
290 | if constexpr (sizeof(_Tp) == 16) |
291 | { |
292 | const auto __x = __vector_bitcast<conditional_t< |
293 | is_floating_point_v<typename _TVT::value_type>, float, int>>(__a); |
294 | return reinterpret_cast<_Tp>( |
295 | decltype(__x){__x[0], __x[2], __x[1], __x[3]}); |
296 | } |
297 | else if constexpr (sizeof(_Tp) == 32) |
298 | { |
299 | const auto __x = __vector_bitcast<conditional_t< |
300 | is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a); |
301 | return reinterpret_cast<_Tp>( |
302 | decltype(__x){__x[0], __x[2], __x[1], __x[3]}); |
303 | } |
304 | else if constexpr (sizeof(_Tp) == 64) |
305 | { |
306 | const auto __x = __vector_bitcast<conditional_t< |
307 | is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a); |
308 | return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4], |
309 | __x[5], __x[2], __x[3], |
310 | __x[6], __x[7]}); |
311 | } |
312 | else |
313 | __assert_unreachable<_Tp>(); |
314 | } |
315 | |
316 | // }}} |
317 | // __maskload_epi32{{{ |
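// The __maskload_* helpers below dispatch between the 128-bit and 256-bit
// AVX/AVX2 masked-load intrinsics based on the mask width; lanes whose mask
// element has the MSB clear are loaded as zero. The epi32/epi64 variants
// require AVX2, the ps/pd variants only AVX.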
318 | template <typename _Tp> |
319 | _GLIBCXX_SIMD_INTRINSIC auto |
320 | __maskload_epi32(const int* __ptr, _Tp __k) |
321 | { |
322 | if constexpr (sizeof(__k) == 16) |
323 | return _mm_maskload_epi32(__ptr, __k); |
324 | else |
325 | return _mm256_maskload_epi32(__ptr, __k); |
326 | } |
327 | |
328 | // }}} |
329 | // __maskload_epi64{{{ |
330 | template <typename _Tp> |
331 | _GLIBCXX_SIMD_INTRINSIC auto |
332 | __maskload_epi64(const _LLong* __ptr, _Tp __k) |
333 | { |
334 | if constexpr (sizeof(__k) == 16) |
335 | return _mm_maskload_epi64(__ptr, __k); |
336 | else |
337 | return _mm256_maskload_epi64(__ptr, __k); |
338 | } |
339 | |
340 | // }}} |
341 | // __maskload_ps{{{ |
342 | template <typename _Tp> |
343 | _GLIBCXX_SIMD_INTRINSIC auto |
344 | __maskload_ps(const float* __ptr, _Tp __k) |
345 | { |
346 | if constexpr (sizeof(__k) == 16) |
347 | return _mm_maskload_ps(__ptr, __k); |
348 | else |
349 | return _mm256_maskload_ps(__ptr, __k); |
350 | } |
351 | |
352 | // }}} |
353 | // __maskload_pd{{{ |
354 | template <typename _Tp> |
355 | _GLIBCXX_SIMD_INTRINSIC auto |
356 | __maskload_pd(const double* __ptr, _Tp __k) |
357 | { |
358 | if constexpr (sizeof(__k) == 16) |
359 | return _mm_maskload_pd(__ptr, __k); |
360 | else |
361 | return _mm256_maskload_pd(__ptr, __k); |
362 | } |
363 | |
364 | // }}} |
365 | |
366 | #ifdef __clang__ |
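// __movm<_Np, _Tp> (Clang only): expand the AVX512 bitmask __k into a vector
// mask with elements of sizeof(_Tp) bytes, all-ones where the corresponding
// bit is set, using the vpmovm2{b,w,d,q} builtins. _S_blend_avx512 combines
// this with Clang's vector ?: operator instead of GCC's blend builtins.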
367 | template <size_t _Np, typename _Tp, typename _Kp> |
368 | _GLIBCXX_SIMD_INTRINSIC constexpr auto |
369 | __movm(_Kp __k) noexcept |
370 | { |
371 | static_assert(is_unsigned_v<_Kp>); |
372 | if constexpr (sizeof(_Tp) == 1 && __have_avx512bw) |
373 | { |
374 | if constexpr (_Np <= 16 && __have_avx512vl) |
375 | return __builtin_ia32_cvtmask2b128(__k); |
376 | else if constexpr (_Np <= 32 && __have_avx512vl) |
377 | return __builtin_ia32_cvtmask2b256(__k); |
378 | else |
379 | return __builtin_ia32_cvtmask2b512(__k); |
380 | } |
381 | else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw) |
382 | { |
383 | if constexpr (_Np <= 8 && __have_avx512vl) |
384 | return __builtin_ia32_cvtmask2w128(__k); |
385 | else if constexpr (_Np <= 16 && __have_avx512vl) |
386 | return __builtin_ia32_cvtmask2w256(__k); |
387 | else |
388 | return __builtin_ia32_cvtmask2w512(__k); |
389 | } |
390 | else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq) |
391 | { |
392 | if constexpr (_Np <= 4 && __have_avx512vl) |
393 | return __builtin_ia32_cvtmask2d128(__k); |
394 | else if constexpr (_Np <= 8 && __have_avx512vl) |
395 | return __builtin_ia32_cvtmask2d256(__k); |
396 | else |
397 | return __builtin_ia32_cvtmask2d512(__k); |
398 | } |
399 | else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq) |
400 | { |
401 | if constexpr (_Np <= 2 && __have_avx512vl) |
402 | return __builtin_ia32_cvtmask2q128(__k); |
403 | else if constexpr (_Np <= 4 && __have_avx512vl) |
404 | return __builtin_ia32_cvtmask2q256(__k); |
405 | else |
406 | return __builtin_ia32_cvtmask2q512(__k); |
407 | } |
408 | else |
409 | __assert_unreachable<_Tp>(); |
410 | } |
411 | #endif // __clang__ |
412 | |
413 | #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 |
414 | #include "simd_x86_conversions.h" |
415 | #endif |
416 | |
417 | // ISA & type detection {{{ |
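// __is_sse_ps/__is_sse_pd/__is_avx_ps/...: true iff _Tp is float resp.
// double, <_Tp, _Np> maps to a 16-/32-/64-byte intrinsic vector, and the
// required ISA extension (SSE, SSE2, AVX, AVX512F) is enabled.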
418 | template <typename _Tp, size_t _Np> |
419 | constexpr bool |
420 | __is_sse_ps() |
421 | { |
422 | return __have_sse |
423 | && is_same_v<_Tp, |
424 | float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; |
425 | } |
426 | |
427 | template <typename _Tp, size_t _Np> |
428 | constexpr bool |
429 | __is_sse_pd() |
430 | { |
431 | return __have_sse2 |
432 | && is_same_v<_Tp, |
433 | double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; |
434 | } |
435 | |
436 | template <typename _Tp, size_t _Np> |
437 | constexpr bool |
438 | __is_avx_ps() |
439 | { |
440 | return __have_avx |
441 | && is_same_v<_Tp, |
442 | float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; |
443 | } |
444 | |
445 | template <typename _Tp, size_t _Np> |
446 | constexpr bool |
447 | __is_avx_pd() |
448 | { |
449 | return __have_avx |
450 | && is_same_v<_Tp, |
451 | double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; |
452 | } |
453 | |
454 | template <typename _Tp, size_t _Np> |
455 | constexpr bool |
456 | __is_avx512_ps() |
457 | { |
458 | return __have_avx512f |
459 | && is_same_v<_Tp, |
460 | float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; |
461 | } |
462 | |
463 | template <typename _Tp, size_t _Np> |
464 | constexpr bool |
465 | __is_avx512_pd() |
466 | { |
467 | return __have_avx512f |
468 | && is_same_v<_Tp, |
469 | double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; |
470 | } |
471 | |
472 | // }}} |
473 | struct _MaskImplX86Mixin; |
474 | |
475 | // _CommonImplX86 {{{ |
476 | struct _CommonImplX86 : _CommonImplBuiltin |
477 | { |
478 | #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 |
479 | // _S_converts_via_decomposition {{{ |
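// True iff converting _From to _To (into a _ToSize-byte destination) has no
// suitable vector instruction on the enabled ISA extensions and is therefore
// decomposed into multiple partial conversions; callers use this to prefer
// an element-wise fallback instead.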
480 | template <typename _From, typename _To, size_t _ToSize> |
481 | static constexpr bool |
482 | _S_converts_via_decomposition() |
483 | { |
484 | if constexpr (is_integral_v< |
485 | _From> && is_integral_v<_To> && sizeof(_From) == 8 |
486 | && _ToSize == 16) |
487 | return (sizeof(_To) == 2 && !__have_ssse3) |
488 | || (sizeof(_To) == 1 && !__have_avx512f); |
489 | else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>) |
490 | return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8 |
491 | && !__have_avx512dq) |
492 | || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1 |
493 | && _ToSize == 16); |
494 | else if constexpr ( |
495 | is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8 |
496 | && !__have_avx512dq) |
497 | return (sizeof(_To) == 4 && _ToSize == 16) |
498 | || (sizeof(_To) == 8 && _ToSize < 64); |
499 | else |
500 | return false; |
501 | } |
502 | |
503 | template <typename _From, typename _To, size_t _ToSize> |
504 | static inline constexpr bool __converts_via_decomposition_v |
505 | = _S_converts_via_decomposition<_From, _To, _ToSize>(); |
506 | |
507 | // }}} |
508 | #endif |
509 | // _S_store {{{ |
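// Overload of _S_store for _SimdWrapper: when the number of bytes to store
// is not a power of two and AVX512BW+VL is available, write exactly
// _Np * sizeof(_Tp) bytes with a single masked store; otherwise defer to the
// generic _CommonImplBuiltin implementation.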
510 | using _CommonImplBuiltin::_S_store; |
511 | |
512 | template <typename _Tp, size_t _Np> |
513 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
514 | _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) |
515 | { |
516 | constexpr size_t _Bytes = _Np * sizeof(_Tp); |
517 | |
518 | if (__builtin_is_constant_evaluated()) |
519 | _CommonImplBuiltin::_S_store(__x, __addr); |
520 | else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl) |
521 | { |
522 | const auto __v = __to_intrin(__x); |
523 | |
524 | if constexpr (_Bytes & 1) |
525 | { |
526 | if constexpr (_Bytes < 16) |
527 | _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes), |
528 | __intrin_bitcast<__m128i>(__v)); |
529 | else if constexpr (_Bytes < 32) |
530 | _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes), |
531 | __intrin_bitcast<__m256i>(__v)); |
532 | else |
533 | _mm512_mask_storeu_epi8(__addr, |
534 | 0xffffffffffffffffull >> (64 - _Bytes), |
535 | __intrin_bitcast<__m512i>(__v)); |
536 | } |
537 | else if constexpr (_Bytes & 2) |
538 | { |
539 | if constexpr (_Bytes < 16) |
540 | _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2), |
541 | __intrin_bitcast<__m128i>(__v)); |
542 | else if constexpr (_Bytes < 32) |
543 | _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2), |
544 | __intrin_bitcast<__m256i>(__v)); |
545 | else |
546 | _mm512_mask_storeu_epi16(__addr, |
547 | 0xffffffffull >> (32 - _Bytes / 2), |
548 | __intrin_bitcast<__m512i>(__v)); |
549 | } |
550 | else if constexpr (_Bytes & 4) |
551 | { |
552 | if constexpr (_Bytes < 16) |
553 | _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4), |
554 | __intrin_bitcast<__m128i>(__v)); |
555 | else if constexpr (_Bytes < 32) |
556 | _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4), |
557 | __intrin_bitcast<__m256i>(__v)); |
558 | else |
559 | _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4), |
560 | __intrin_bitcast<__m512i>(__v)); |
561 | } |
562 | else |
563 | { |
564 | static_assert( |
565 | _Bytes > 16, |
566 | "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes " |
567 | "- 1)) != 0 is impossible" ); |
568 | if constexpr (_Bytes < 32) |
569 | _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8), |
570 | __intrin_bitcast<__m256i>(__v)); |
571 | else |
572 | _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8), |
573 | __intrin_bitcast<__m512i>(__v)); |
574 | } |
575 | } |
576 | else |
577 | _CommonImplBuiltin::_S_store(__x, __addr); |
578 | } |
579 | |
580 | // }}} |
581 | // _S_store_bool_array(_BitMask) {{{ |
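// Expands the bitmask __x into an array of bool (one byte per bit, each 0 or
// 1). Prefers vpmovm2b (AVX512BW+VL), then pdep (BMI2), then an SSE2
// unpack+compare sequence; otherwise defers to the generic implementation.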
582 | template <size_t _Np, bool _Sanitized> |
583 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
584 | _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem) |
585 | { |
586 | if (__builtin_is_constant_evaluated()) |
587 | _CommonImplBuiltin::_S_store_bool_array(__x, __mem); |
588 | else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL |
589 | _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>( |
590 | [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
591 | if constexpr (_Np <= 16) |
592 | return _mm_movm_epi8(__x._M_to_bits()); |
593 | else if constexpr (_Np <= 32) |
594 | return _mm256_movm_epi8(__x._M_to_bits()); |
595 | else if constexpr (_Np <= 64) |
596 | return _mm512_movm_epi8(__x._M_to_bits()); |
597 | else |
598 | __assert_unreachable<_SizeConstant<_Np>>(); |
599 | }()), |
600 | __mem); |
601 | else if constexpr (__have_bmi2) |
602 | { |
603 | if constexpr (_Np <= 4) |
604 | _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem); |
605 | else |
__execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
constexpr size_t __offset = __i * sizeof(size_t);
constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
610 | if constexpr (__todo == 1) |
611 | __mem[__offset] = __x[__offset]; |
612 | else |
613 | { |
614 | const auto __bools = |
615 | #ifdef __x86_64__ |
616 | _pdep_u64(__x.template _M_extract<__offset>().to_ullong(), |
617 | 0x0101010101010101ULL); |
618 | #else // __x86_64__ |
619 | _pdep_u32( |
620 | __x.template _M_extract<__offset>()._M_to_bits(), |
621 | 0x01010101U); |
622 | #endif // __x86_64__ |
623 | _S_store<__todo>(__bools, __mem + __offset); |
624 | } |
625 | }); |
626 | } |
627 | else if constexpr (__have_sse2 && _Np > 7) |
__execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
constexpr int __offset = __i * 16;
constexpr int __todo = std::min(16, int(_Np) - __offset);
631 | const int __bits = __x.template _M_extract<__offset>()._M_to_bits(); |
632 | __vector_type16_t<_UChar> __bools; |
633 | if constexpr (__have_avx512f) |
634 | { |
635 | auto __as32bits |
= _mm512_maskz_mov_epi32(__bits, __to_intrin(
__vector_broadcast<16>(1)));
auto __as16bits
= __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
__todo > 8 ? __hi256(__as32bits)
: __m256i()));
__bools = __vector_bitcast<_UChar>(
_mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
644 | } |
645 | else |
646 | { |
647 | using _V = __vector_type_t<_UChar, 16>; |
auto __tmp = _mm_cvtsi32_si128(__bits);
__tmp = _mm_unpacklo_epi8(__tmp, __tmp);
__tmp = _mm_unpacklo_epi16(__tmp, __tmp);
__tmp = _mm_unpacklo_epi32(__tmp, __tmp);
652 | _V __tmp2 = reinterpret_cast<_V>(__tmp); |
653 | __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128, |
654 | 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index |
655 | __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01 |
656 | } |
657 | _S_store<__todo>(__bools, __mem + __offset); |
658 | }); |
659 | else |
660 | _CommonImplBuiltin::_S_store_bool_array(__x, __mem); |
661 | } |
662 | |
663 | // }}} |
664 | // _S_blend_avx512 {{{ |
665 | // Returns: __k ? __b : __a |
666 | // TODO: reverse __a and __b to match COND_EXPR |
// Requires: _TV to be a __vector_type_t whose value_type matches the
// bitmask __k
669 | template <typename _Kp, typename _TV> |
670 | _GLIBCXX_SIMD_INTRINSIC static _TV |
671 | _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept |
672 | { |
673 | static_assert(__is_vector_type_v<_TV>); |
674 | using _Tp = typename _VectorTraits<_TV>::value_type; |
675 | static_assert(sizeof(_TV) >= 16); |
676 | static_assert(sizeof(_Tp) <= 8); |
677 | #ifdef __clang__ |
678 | return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a; |
679 | #else |
680 | using _IntT |
681 | = conditional_t<(sizeof(_Tp) > 2), |
682 | conditional_t<sizeof(_Tp) == 4, int, long long>, |
683 | conditional_t<sizeof(_Tp) == 1, char, short>>; |
684 | [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a); |
685 | [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b); |
686 | if constexpr (sizeof(_TV) == 64) |
687 | { |
688 | if constexpr (sizeof(_Tp) == 1) |
689 | return reinterpret_cast<_TV>( |
690 | __builtin_ia32_blendmb_512_mask(__aa, __bb, __k)); |
691 | else if constexpr (sizeof(_Tp) == 2) |
692 | return reinterpret_cast<_TV>( |
693 | __builtin_ia32_blendmw_512_mask(__aa, __bb, __k)); |
694 | else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) |
695 | return __builtin_ia32_blendmps_512_mask(__a, __b, __k); |
696 | else if constexpr (sizeof(_Tp) == 4) |
697 | return reinterpret_cast<_TV>( |
698 | __builtin_ia32_blendmd_512_mask(__aa, __bb, __k)); |
699 | else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) |
700 | return __builtin_ia32_blendmpd_512_mask(__a, __b, __k); |
701 | else if constexpr (sizeof(_Tp) == 8) |
702 | return reinterpret_cast<_TV>( |
703 | __builtin_ia32_blendmq_512_mask(__aa, __bb, __k)); |
704 | } |
705 | else if constexpr (sizeof(_TV) == 32) |
706 | { |
707 | if constexpr (sizeof(_Tp) == 1) |
708 | return reinterpret_cast<_TV>( |
709 | __builtin_ia32_blendmb_256_mask(__aa, __bb, __k)); |
710 | else if constexpr (sizeof(_Tp) == 2) |
711 | return reinterpret_cast<_TV>( |
712 | __builtin_ia32_blendmw_256_mask(__aa, __bb, __k)); |
713 | else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) |
714 | return __builtin_ia32_blendmps_256_mask(__a, __b, __k); |
715 | else if constexpr (sizeof(_Tp) == 4) |
716 | return reinterpret_cast<_TV>( |
717 | __builtin_ia32_blendmd_256_mask(__aa, __bb, __k)); |
718 | else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) |
719 | return __builtin_ia32_blendmpd_256_mask(__a, __b, __k); |
720 | else if constexpr (sizeof(_Tp) == 8) |
721 | return reinterpret_cast<_TV>( |
722 | __builtin_ia32_blendmq_256_mask(__aa, __bb, __k)); |
723 | } |
724 | else if constexpr (sizeof(_TV) == 16) |
725 | { |
726 | if constexpr (sizeof(_Tp) == 1) |
727 | return reinterpret_cast<_TV>( |
728 | __builtin_ia32_blendmb_128_mask(__aa, __bb, __k)); |
729 | else if constexpr (sizeof(_Tp) == 2) |
730 | return reinterpret_cast<_TV>( |
731 | __builtin_ia32_blendmw_128_mask(__aa, __bb, __k)); |
732 | else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) |
733 | return __builtin_ia32_blendmps_128_mask(__a, __b, __k); |
734 | else if constexpr (sizeof(_Tp) == 4) |
735 | return reinterpret_cast<_TV>( |
736 | __builtin_ia32_blendmd_128_mask(__aa, __bb, __k)); |
737 | else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) |
738 | return __builtin_ia32_blendmpd_128_mask(__a, __b, __k); |
739 | else if constexpr (sizeof(_Tp) == 8) |
740 | return reinterpret_cast<_TV>( |
741 | __builtin_ia32_blendmq_128_mask(__aa, __bb, __k)); |
742 | } |
743 | #endif |
744 | } |
745 | |
746 | // }}} |
747 | // _S_blend_intrin {{{ |
748 | // Returns: __k ? __b : __a |
749 | // TODO: reverse __a and __b to match COND_EXPR |
750 | // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32 |
751 | // Bytes wide |
752 | template <typename _Tp> |
753 | _GLIBCXX_SIMD_INTRINSIC static _Tp |
754 | _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept |
755 | { |
756 | static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>); |
757 | constexpr struct |
758 | { |
759 | _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b, |
760 | __m128 __k) const noexcept |
761 | { |
762 | return __builtin_ia32_blendvps(__a, __b, __k); |
763 | } |
764 | _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b, |
765 | __m128d __k) const noexcept |
766 | { |
767 | return __builtin_ia32_blendvpd(__a, __b, __k); |
768 | } |
769 | _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b, |
770 | __m128i __k) const noexcept |
771 | { |
772 | return reinterpret_cast<__m128i>( |
773 | __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a), |
774 | reinterpret_cast<__v16qi>(__b), |
775 | reinterpret_cast<__v16qi>(__k))); |
776 | } |
777 | _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b, |
778 | __m256 __k) const noexcept |
779 | { |
780 | return __builtin_ia32_blendvps256(__a, __b, __k); |
781 | } |
782 | _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b, |
783 | __m256d __k) const noexcept |
784 | { |
785 | return __builtin_ia32_blendvpd256(__a, __b, __k); |
786 | } |
787 | _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b, |
788 | __m256i __k) const noexcept |
789 | { |
790 | if constexpr (__have_avx2) |
791 | return reinterpret_cast<__m256i>( |
792 | __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a), |
793 | reinterpret_cast<__v32qi>(__b), |
794 | reinterpret_cast<__v32qi>(__k))); |
795 | else |
796 | return reinterpret_cast<__m256i>( |
797 | __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a), |
798 | reinterpret_cast<__v8sf>(__b), |
799 | reinterpret_cast<__v8sf>(__k))); |
800 | } |
801 | } __eval; |
802 | return __eval(__a, __b, __k); |
803 | } |
804 | |
805 | // }}} |
806 | // _S_blend {{{ |
807 | // Returns: __k ? __at1 : __at0 |
808 | // TODO: reverse __at0 and __at1 to match COND_EXPR |
809 | template <typename _Tp, size_t _Np> |
810 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
811 | _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0, |
812 | _SimdWrapper<_Tp, _Np> __at1) |
813 | { |
814 | static_assert(is_same_v<_Tp, _Tp> && __have_avx512f); |
815 | if (__k._M_is_constprop() && __at0._M_is_constprop() |
816 | && __at1._M_is_constprop()) |
817 | return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( |
818 | [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
819 | return __k[__i] ? __at1[__i] : __at0[__i]; |
820 | }); |
821 | else if constexpr (sizeof(__at0) == 64 |
822 | || (__have_avx512vl && sizeof(__at0) >= 16)) |
823 | return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data); |
824 | else |
825 | { |
826 | static_assert((__have_avx512vl && sizeof(__at0) < 16) |
827 | || !__have_avx512vl); |
828 | constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp); |
829 | return __vector_bitcast<_Tp, _Np>( |
830 | _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0), |
831 | __vector_bitcast<_Tp, __size>(__at1))); |
832 | } |
833 | } |
834 | |
835 | template <typename _Tp, size_t _Np> |
836 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
837 | _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, |
838 | _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) |
839 | { |
840 | const auto __kk = __wrapper_bitcast<_Tp>(__k); |
841 | if (__builtin_is_constant_evaluated() |
842 | || (__kk._M_is_constprop() && __at0._M_is_constprop() |
843 | && __at1._M_is_constprop())) |
844 | { |
845 | auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1)); |
846 | if (__r._M_is_constprop()) |
847 | return __r; |
848 | } |
849 | if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl) |
850 | && (sizeof(_Tp) >= 4 || __have_avx512bw)) |
851 | // convert to bitmask and call overload above |
852 | return _S_blend( |
853 | _SimdWrapper<bool, _Np>( |
854 | __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k) |
855 | ._M_to_bits()), |
856 | __at0, __at1); |
857 | else |
858 | { |
859 | // Since GCC does not assume __k to be a mask, using the builtin |
860 | // conditional operator introduces an extra compare against 0 before |
861 | // blending. So we rather call the intrinsic here. |
862 | if constexpr (__have_sse4_1) |
863 | return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0), |
864 | __to_intrin(__at1)); |
865 | else |
866 | return __or(__andnot(__kk, __at0), __and(__kk, __at1)); |
867 | } |
868 | } |
869 | |
870 | // }}} |
871 | }; |
872 | |
873 | // }}} |
874 | // _SimdImplX86 {{{ |
875 | template <typename _Abi> |
876 | struct _SimdImplX86 : _SimdImplBuiltin<_Abi> |
877 | { |
878 | using _Base = _SimdImplBuiltin<_Abi>; |
879 | |
880 | template <typename _Tp> |
881 | using _MaskMember = typename _Base::template _MaskMember<_Tp>; |
882 | |
883 | template <typename _Tp> |
884 | static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; |
885 | |
886 | template <typename _Tp> |
887 | static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; |
888 | |
889 | template <typename _Tp> |
890 | static constexpr size_t _S_max_store_size |
891 | = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64 |
892 | : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32 |
893 | : 16; |
894 | |
895 | using _MaskImpl = typename _Abi::_MaskImpl; |
896 | |
897 | // _S_masked_load {{{ |
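// Masked load, optionally with conversion from _Up to _Tp: elements selected
// by __k are loaded from __mem into __merge, the rest keep their value.
// Same-size loads use AVX512 masked loads or AVX/AVX2 maskload where
// possible, with a bit-iteration fallback; converting loads defer to _Base.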
898 | template <typename _Tp, size_t _Np, typename _Up> |
899 | static inline _SimdWrapper<_Tp, _Np> |
900 | _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, |
901 | const _Up* __mem) noexcept |
902 | { |
903 | static_assert(_Np == _S_size<_Tp>); |
904 | if constexpr (is_same_v<_Tp, _Up> || // no conversion |
905 | (sizeof(_Tp) == sizeof(_Up) |
906 | && is_integral_v< |
907 | _Tp> == is_integral_v<_Up>) // conversion via bit |
908 | // reinterpretation |
909 | ) |
910 | { |
911 | [[maybe_unused]] const auto __intrin = __to_intrin(__merge); |
912 | if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) |
913 | && sizeof(_Tp) == 1) |
914 | { |
915 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
916 | if constexpr (sizeof(__intrin) == 16) |
917 | __merge = __vector_bitcast<_Tp, _Np>( |
918 | _mm_mask_loadu_epi8(__intrin, __kk, __mem)); |
919 | else if constexpr (sizeof(__merge) == 32) |
920 | __merge = __vector_bitcast<_Tp, _Np>( |
921 | _mm256_mask_loadu_epi8(__intrin, __kk, __mem)); |
922 | else if constexpr (sizeof(__merge) == 64) |
923 | __merge = __vector_bitcast<_Tp, _Np>( |
924 | _mm512_mask_loadu_epi8(__intrin, __kk, __mem)); |
925 | else |
926 | __assert_unreachable<_Tp>(); |
927 | } |
928 | else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) |
929 | && sizeof(_Tp) == 2) |
930 | { |
931 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
932 | if constexpr (sizeof(__intrin) == 16) |
933 | __merge = __vector_bitcast<_Tp, _Np>( |
934 | _mm_mask_loadu_epi16(__intrin, __kk, __mem)); |
935 | else if constexpr (sizeof(__intrin) == 32) |
936 | __merge = __vector_bitcast<_Tp, _Np>( |
937 | _mm256_mask_loadu_epi16(__intrin, __kk, __mem)); |
938 | else if constexpr (sizeof(__intrin) == 64) |
939 | __merge = __vector_bitcast<_Tp, _Np>( |
940 | _mm512_mask_loadu_epi16(__intrin, __kk, __mem)); |
941 | else |
942 | __assert_unreachable<_Tp>(); |
943 | } |
944 | else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) |
945 | && sizeof(_Tp) == 4 && is_integral_v<_Up>) |
946 | { |
947 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
948 | if constexpr (sizeof(__intrin) == 16) |
949 | __merge = __vector_bitcast<_Tp, _Np>( |
950 | _mm_mask_loadu_epi32(__intrin, __kk, __mem)); |
951 | else if constexpr (sizeof(__intrin) == 32) |
952 | __merge = __vector_bitcast<_Tp, _Np>( |
953 | _mm256_mask_loadu_epi32(__intrin, __kk, __mem)); |
954 | else if constexpr (sizeof(__intrin) == 64) |
955 | __merge = __vector_bitcast<_Tp, _Np>( |
956 | _mm512_mask_loadu_epi32(__intrin, __kk, __mem)); |
957 | else |
958 | __assert_unreachable<_Tp>(); |
959 | } |
960 | else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) |
961 | && sizeof(_Tp) == 4 && is_floating_point_v<_Up>) |
962 | { |
963 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
964 | if constexpr (sizeof(__intrin) == 16) |
965 | __merge = __vector_bitcast<_Tp, _Np>( |
966 | _mm_mask_loadu_ps(__intrin, __kk, __mem)); |
967 | else if constexpr (sizeof(__intrin) == 32) |
968 | __merge = __vector_bitcast<_Tp, _Np>( |
969 | _mm256_mask_loadu_ps(__intrin, __kk, __mem)); |
970 | else if constexpr (sizeof(__intrin) == 64) |
971 | __merge = __vector_bitcast<_Tp, _Np>( |
972 | _mm512_mask_loadu_ps(__intrin, __kk, __mem)); |
973 | else |
974 | __assert_unreachable<_Tp>(); |
975 | } |
976 | else if constexpr (__have_avx2 && sizeof(_Tp) == 4 |
977 | && is_integral_v<_Up>) |
978 | { |
979 | static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); |
980 | __merge |
981 | = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), |
982 | __vector_bitcast<_Tp, _Np>( |
983 | __maskload_epi32(reinterpret_cast<const int*>(__mem), |
984 | __to_intrin(__k)))); |
985 | } |
986 | else if constexpr (__have_avx && sizeof(_Tp) == 4) |
987 | { |
988 | static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); |
989 | __merge |
990 | = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), |
991 | __vector_bitcast<_Tp, _Np>( |
992 | __maskload_ps(reinterpret_cast<const float*>(__mem), |
993 | __to_intrin(__k)))); |
994 | } |
995 | else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) |
996 | && sizeof(_Tp) == 8 && is_integral_v<_Up>) |
997 | { |
998 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
999 | if constexpr (sizeof(__intrin) == 16) |
1000 | __merge = __vector_bitcast<_Tp, _Np>( |
1001 | _mm_mask_loadu_epi64(__intrin, __kk, __mem)); |
1002 | else if constexpr (sizeof(__intrin) == 32) |
1003 | __merge = __vector_bitcast<_Tp, _Np>( |
1004 | _mm256_mask_loadu_epi64(__intrin, __kk, __mem)); |
1005 | else if constexpr (sizeof(__intrin) == 64) |
1006 | __merge = __vector_bitcast<_Tp, _Np>( |
1007 | _mm512_mask_loadu_epi64(__intrin, __kk, __mem)); |
1008 | else |
1009 | __assert_unreachable<_Tp>(); |
1010 | } |
1011 | else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) |
1012 | && sizeof(_Tp) == 8 && is_floating_point_v<_Up>) |
1013 | { |
1014 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
1015 | if constexpr (sizeof(__intrin) == 16) |
1016 | __merge = __vector_bitcast<_Tp, _Np>( |
1017 | _mm_mask_loadu_pd(__intrin, __kk, __mem)); |
1018 | else if constexpr (sizeof(__intrin) == 32) |
1019 | __merge = __vector_bitcast<_Tp, _Np>( |
1020 | _mm256_mask_loadu_pd(__intrin, __kk, __mem)); |
1021 | else if constexpr (sizeof(__intrin) == 64) |
1022 | __merge = __vector_bitcast<_Tp, _Np>( |
1023 | _mm512_mask_loadu_pd(__intrin, __kk, __mem)); |
1024 | else |
1025 | __assert_unreachable<_Tp>(); |
1026 | } |
1027 | else if constexpr (__have_avx2 && sizeof(_Tp) == 8 |
1028 | && is_integral_v<_Up>) |
1029 | { |
1030 | static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); |
1031 | __merge |
1032 | = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), |
1033 | __vector_bitcast<_Tp, _Np>(__maskload_epi64( |
1034 | reinterpret_cast<const _LLong*>(__mem), |
1035 | __to_intrin(__k)))); |
1036 | } |
1037 | else if constexpr (__have_avx && sizeof(_Tp) == 8) |
1038 | { |
1039 | static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); |
1040 | __merge |
1041 | = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), |
1042 | __vector_bitcast<_Tp, _Np>( |
1043 | __maskload_pd(reinterpret_cast<const double*>(__mem), |
1044 | __to_intrin(__k)))); |
1045 | } |
1046 | else |
1047 | _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), |
1048 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1049 | __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); |
1050 | }); |
1051 | } |
/* Very uncertain that the following improves anything. Needs benchmarking
 * before it's activated.
1055 | else if constexpr (sizeof(_Up) <= 8 && // no long double |
1056 | !__converts_via_decomposition_v< |
1057 | _Up, _Tp, |
1058 | sizeof(__merge)> // conversion via decomposition |
1059 | // is better handled via the |
1060 | // bit_iteration fallback below |
1061 | ) |
1062 | { |
1063 | // TODO: copy pattern from _S_masked_store, which doesn't resort to |
1064 | // fixed_size |
1065 | using _Ap = simd_abi::deduce_t<_Up, _Np>; |
1066 | using _ATraits = _SimdTraits<_Up, _Ap>; |
1067 | using _AImpl = typename _ATraits::_SimdImpl; |
1068 | typename _ATraits::_SimdMember __uncvted{}; |
1069 | typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template |
1070 | _S_convert<_Up>(__k); |
1071 | __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem); |
1072 | _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter; |
1073 | _Base::_S_masked_assign(__k, __merge, __converter(__uncvted)); |
1074 | } |
1075 | */ |
1076 | else |
1077 | __merge = _Base::_S_masked_load(__merge, __k, __mem); |
1078 | return __merge; |
1079 | } |
1080 | |
1081 | // }}} |
1082 | // _S_masked_store_nocvt {{{ |
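// Masked store without conversion (_Tp elements into a _Tp array): only the
// elements selected by __k are written. The first overload takes an AVX512
// bitmask, the second a vector mask; both pick the widest available masked
// store (AVX512 masked stores, AVX maskstore, or SSE2 maskmovdqu).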
1083 | template <typename _Tp, size_t _Np> |
1084 | _GLIBCXX_SIMD_INTRINSIC static void |
1085 | _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper<bool, _Np> __k) |
1086 | { |
1087 | [[maybe_unused]] const auto __vi = __to_intrin(__v); |
1088 | if constexpr (sizeof(__vi) == 64) |
1089 | { |
1090 | static_assert(sizeof(__v) == 64 && __have_avx512f); |
1091 | if constexpr (__have_avx512bw && sizeof(_Tp) == 1) |
1092 | _mm512_mask_storeu_epi8(__mem, __k, __vi); |
1093 | else if constexpr (__have_avx512bw && sizeof(_Tp) == 2) |
1094 | _mm512_mask_storeu_epi16(__mem, __k, __vi); |
1095 | else if constexpr (__have_avx512f && sizeof(_Tp) == 4) |
1096 | { |
1097 | if constexpr (is_integral_v<_Tp>) |
1098 | _mm512_mask_storeu_epi32(__mem, __k, __vi); |
1099 | else |
1100 | _mm512_mask_storeu_ps(__mem, __k, __vi); |
1101 | } |
1102 | else if constexpr (__have_avx512f && sizeof(_Tp) == 8) |
1103 | { |
1104 | if constexpr (is_integral_v<_Tp>) |
1105 | _mm512_mask_storeu_epi64(__mem, __k, __vi); |
1106 | else |
1107 | _mm512_mask_storeu_pd(__mem, __k, __vi); |
1108 | } |
1109 | #if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32 |
1110 | // with Skylake-AVX512, __have_avx512bw is true |
1111 | else if constexpr (__have_sse2) |
1112 | { |
1113 | using _M = __vector_type_t<_Tp, _Np>; |
1114 | using _MVT = _VectorTraits<_M>; |
1115 | _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)), |
1116 | __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)), |
1117 | reinterpret_cast<char*>(__mem)); |
1118 | _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)), |
1119 | __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>( |
1120 | __k._M_data >> 1 * _MVT::_S_full_size)), |
1121 | reinterpret_cast<char*>(__mem) + 1 * 16); |
1122 | _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)), |
1123 | __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>( |
1124 | __k._M_data >> 2 * _MVT::_S_full_size)), |
1125 | reinterpret_cast<char*>(__mem) + 2 * 16); |
1126 | if constexpr (_Np > 48 / sizeof(_Tp)) |
1127 | _mm_maskmoveu_si128( |
1128 | __auto_bitcast(__extract<3, 4>(__v._M_data)), |
1129 | __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>( |
1130 | __k._M_data >> 3 * _MVT::_S_full_size)), |
1131 | reinterpret_cast<char*>(__mem) + 3 * 16); |
1132 | } |
1133 | #endif |
1134 | else |
1135 | __assert_unreachable<_Tp>(); |
1136 | } |
1137 | else if constexpr (sizeof(__vi) == 32) |
1138 | { |
1139 | if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) |
1140 | _mm256_mask_storeu_epi8(__mem, __k, __vi); |
1141 | else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) |
1142 | _mm256_mask_storeu_epi16(__mem, __k, __vi); |
1143 | else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) |
1144 | { |
1145 | if constexpr (is_integral_v<_Tp>) |
1146 | _mm256_mask_storeu_epi32(__mem, __k, __vi); |
1147 | else |
1148 | _mm256_mask_storeu_ps(__mem, __k, __vi); |
1149 | } |
1150 | else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) |
1151 | { |
1152 | if constexpr (is_integral_v<_Tp>) |
1153 | _mm256_mask_storeu_epi64(__mem, __k, __vi); |
1154 | else |
1155 | _mm256_mask_storeu_pd(__mem, __k, __vi); |
1156 | } |
1157 | else if constexpr (__have_avx512f |
1158 | && (sizeof(_Tp) >= 4 || __have_avx512bw)) |
1159 | { |
1160 | // use a 512-bit maskstore, using zero-extension of the bitmask |
1161 | _S_masked_store_nocvt( |
1162 | _SimdWrapper64<_Tp>( |
1163 | __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)), |
1164 | __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data)); |
1165 | } |
1166 | else |
1167 | _S_masked_store_nocvt(__v, __mem, |
1168 | _MaskImpl::template _S_to_maskvector< |
1169 | __int_for_sizeof_t<_Tp>, _Np>(__k)); |
1170 | } |
1171 | else if constexpr (sizeof(__vi) == 16) |
1172 | { |
1173 | if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) |
1174 | _mm_mask_storeu_epi8(__mem, __k, __vi); |
1175 | else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) |
1176 | _mm_mask_storeu_epi16(__mem, __k, __vi); |
1177 | else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) |
1178 | { |
1179 | if constexpr (is_integral_v<_Tp>) |
1180 | _mm_mask_storeu_epi32(__mem, __k, __vi); |
1181 | else |
1182 | _mm_mask_storeu_ps(__mem, __k, __vi); |
1183 | } |
1184 | else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) |
1185 | { |
1186 | if constexpr (is_integral_v<_Tp>) |
1187 | _mm_mask_storeu_epi64(__mem, __k, __vi); |
1188 | else |
1189 | _mm_mask_storeu_pd(__mem, __k, __vi); |
1190 | } |
1191 | else if constexpr (__have_avx512f |
1192 | && (sizeof(_Tp) >= 4 || __have_avx512bw)) |
1193 | { |
1194 | // use a 512-bit maskstore, using zero-extension of the bitmask |
1195 | _S_masked_store_nocvt( |
1196 | _SimdWrapper64<_Tp>( |
1197 | __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)), |
1198 | __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data)); |
1199 | } |
1200 | else |
1201 | _S_masked_store_nocvt(__v, __mem, |
1202 | _MaskImpl::template _S_to_maskvector< |
1203 | __int_for_sizeof_t<_Tp>, _Np>(__k)); |
1204 | } |
1205 | else |
1206 | __assert_unreachable<_Tp>(); |
1207 | } |
1208 | |
1209 | template <typename _Tp, size_t _Np> |
1210 | _GLIBCXX_SIMD_INTRINSIC static void |
1211 | _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, |
1212 | _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k) |
1213 | { |
1214 | if constexpr (sizeof(__v) <= 16) |
1215 | { |
1216 | [[maybe_unused]] const auto __vi |
1217 | = __intrin_bitcast<__m128i>(__as_vector(__v)); |
1218 | [[maybe_unused]] const auto __ki |
1219 | = __intrin_bitcast<__m128i>(__as_vector(__k)); |
1220 | if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) |
1221 | _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi); |
1222 | else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) |
1223 | _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi); |
1224 | else if constexpr (__have_avx2 && sizeof(_Tp) == 4 |
1225 | && is_integral_v<_Tp>) |
1226 | _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi); |
1227 | else if constexpr (__have_avx && sizeof(_Tp) == 4) |
1228 | _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki, |
1229 | __vector_bitcast<float>(__vi)); |
1230 | else if constexpr (__have_avx2 && sizeof(_Tp) == 8 |
1231 | && is_integral_v<_Tp>) |
1232 | _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi); |
1233 | else if constexpr (__have_avx && sizeof(_Tp) == 8) |
1234 | _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki, |
1235 | __vector_bitcast<double>(__vi)); |
1236 | else if constexpr (__have_sse2) |
1237 | _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem)); |
1238 | } |
1239 | else if constexpr (sizeof(__v) == 32) |
1240 | { |
1241 | [[maybe_unused]] const auto __vi |
1242 | = __intrin_bitcast<__m256i>(__as_vector(__v)); |
1243 | [[maybe_unused]] const auto __ki |
1244 | = __intrin_bitcast<__m256i>(__as_vector(__k)); |
1245 | if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) |
1246 | _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi); |
1247 | else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) |
1248 | _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi); |
1249 | else if constexpr (__have_avx2 && sizeof(_Tp) == 4 |
1250 | && is_integral_v<_Tp>) |
1251 | _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi); |
1252 | else if constexpr (sizeof(_Tp) == 4) |
1253 | _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki, |
1254 | __vector_bitcast<float>(__v)); |
1255 | else if constexpr (__have_avx2 && sizeof(_Tp) == 8 |
1256 | && is_integral_v<_Tp>) |
1257 | _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, |
1258 | __vi); |
1259 | else if constexpr (__have_avx && sizeof(_Tp) == 8) |
1260 | _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki, |
1261 | __vector_bitcast<double>(__v)); |
1262 | else if constexpr (__have_sse2) |
1263 | { |
1264 | _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki), |
1265 | reinterpret_cast<char*>(__mem)); |
1266 | _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki), |
1267 | reinterpret_cast<char*>(__mem) + 16); |
1268 | } |
1269 | } |
1270 | else |
1271 | __assert_unreachable<_Tp>(); |
1272 | } |
1273 | |
1274 | // }}} |
1275 | // _S_masked_store {{{ |
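// Masked store with conversion from _Tp to _Up. Integer narrowing stores use
// the AVX512 truncating masked-store intrinsics (vpmovq{d,w,b}, vpmovd{w,b},
// vpmovwb) where available; all other cases defer to _Base::_S_masked_store.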
1276 | template <typename _Tp, size_t _Np, typename _Up> |
1277 | _GLIBCXX_SIMD_INTRINSIC static void |
1278 | _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem, |
1279 | const _MaskMember<_Tp> __k) noexcept |
1280 | { |
1281 | if constexpr (is_integral_v< |
1282 | _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up) |
1283 | && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw) |
1284 | && (sizeof(__v) == 64 || __have_avx512vl)) |
1285 | { // truncating store |
1286 | const auto __vi = __to_intrin(__v); |
1287 | const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); |
1288 | if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 |
1289 | && sizeof(__vi) == 64) |
1290 | _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); |
1291 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 |
1292 | && sizeof(__vi) == 32) |
1293 | _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); |
1294 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 |
1295 | && sizeof(__vi) == 16) |
1296 | _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); |
1297 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 |
1298 | && sizeof(__vi) == 64) |
1299 | _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); |
1300 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 |
1301 | && sizeof(__vi) == 32) |
1302 | _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); |
1303 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 |
1304 | && sizeof(__vi) == 16) |
1305 | _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); |
1306 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 |
1307 | && sizeof(__vi) == 64) |
1308 | _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); |
1309 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 |
1310 | && sizeof(__vi) == 32) |
1311 | _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); |
1312 | else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 |
1313 | && sizeof(__vi) == 16) |
1314 | _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); |
1315 | else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 |
1316 | && sizeof(__vi) == 64) |
1317 | _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); |
1318 | else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 |
1319 | && sizeof(__vi) == 32) |
1320 | _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); |
1321 | else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 |
1322 | && sizeof(__vi) == 16) |
1323 | _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); |
1324 | else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 |
1325 | && sizeof(__vi) == 64) |
1326 | _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); |
1327 | else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 |
1328 | && sizeof(__vi) == 32) |
1329 | _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); |
1330 | else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 |
1331 | && sizeof(__vi) == 16) |
1332 | _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); |
1333 | else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 |
1334 | && sizeof(__vi) == 64) |
1335 | _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); |
1336 | else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 |
1337 | && sizeof(__vi) == 32) |
1338 | _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); |
1339 | else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 |
1340 | && sizeof(__vi) == 16) |
1341 | _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); |
1342 | else |
1343 | __assert_unreachable<_Tp>(); |
1344 | } |
1345 | else |
1346 | _Base::_S_masked_store(__v, __mem, __k); |
1347 | } |
1348 | |
1349 | // }}} |
1350 | // _S_multiplies {{{ |
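// Element-wise multiplication. The interesting case is sizeof(_Tp) == 1:
// x86 has no 8-bit vector multiply, so the products of the even and odd byte
// positions are computed as 16-bit multiplies and blended back together
// (tiny vectors are handled via scalar integer arithmetic instead).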
1351 | template <typename _V, typename _VVT = _VectorTraits<_V>> |
1352 | _GLIBCXX_SIMD_INTRINSIC static constexpr _V |
1353 | _S_multiplies(_V __x, _V __y) |
1354 | { |
1355 | using _Tp = typename _VVT::value_type; |
1356 | if (__builtin_is_constant_evaluated() || __x._M_is_constprop() |
1357 | || __y._M_is_constprop()) |
1358 | return __as_vector(__x) * __as_vector(__y); |
1359 | else if constexpr (sizeof(_Tp) == 1) |
1360 | { |
1361 | if constexpr (sizeof(_V) == 2) |
1362 | { |
1363 | const auto __xs = reinterpret_cast<short>(__x._M_data); |
1364 | const auto __ys = reinterpret_cast<short>(__y._M_data); |
1365 | return reinterpret_cast<__vector_type_t<_Tp, 2>>(short( |
1366 | ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00)))); |
1367 | } |
1368 | else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3) |
1369 | { |
1370 | const auto __xi = reinterpret_cast<int>(__x._M_data); |
1371 | const auto __yi = reinterpret_cast<int>(__y._M_data); |
1372 | return reinterpret_cast<__vector_type_t<_Tp, 3>>( |
1373 | ((__xi * __yi) & 0xff) |
1374 | | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) |
1375 | | ((__xi >> 16) * (__yi & 0xff0000))); |
1376 | } |
1377 | else if constexpr (sizeof(_V) == 4) |
1378 | { |
1379 | const auto __xi = reinterpret_cast<int>(__x._M_data); |
1380 | const auto __yi = reinterpret_cast<int>(__y._M_data); |
1381 | return reinterpret_cast<__vector_type_t<_Tp, 4>>( |
1382 | ((__xi * __yi) & 0xff) |
1383 | | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) |
1384 | | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000) |
1385 | | ((__xi >> 24) * (__yi & 0xff000000u))); |
1386 | } |
1387 | else if constexpr (sizeof(_V) == 8 && __have_avx2 |
1388 | && is_signed_v<_Tp>) |
1389 | return __convert<typename _VVT::type>( |
1390 | __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x))) |
1391 | * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y)))); |
1392 | else if constexpr (sizeof(_V) == 8 && __have_avx2 |
1393 | && is_unsigned_v<_Tp>) |
1394 | return __convert<typename _VVT::type>( |
1395 | __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x))) |
1396 | * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y)))); |
1397 | else |
1398 | { |
1399 | // codegen of `x*y` is suboptimal (as of GCC 9.0.1) |
1400 | constexpr size_t __full_size = _VVT::_S_full_size; |
1401 | constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8; |
1402 | using _ShortW = _SimdWrapper<short, _Np>; |
1403 | const _ShortW __even = __vector_bitcast<short, _Np>(__x) |
1404 | * __vector_bitcast<short, _Np>(__y); |
1405 | _ShortW __high_byte = _ShortW()._M_data - 256; |
1406 | //[&]() { asm("" : "+x"(__high_byte._M_data)); }(); |
1407 | const _ShortW __odd |
1408 | = (__vector_bitcast<short, _Np>(__x) >> 8) |
1409 | * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data); |
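		  // Illustration (one 16-bit lane, bytes {a0, a1} in __x and
		  // {b0, b1} in __y): __even holds a0*b0 (mod 256) in its low
		  // byte (its high byte is garbage), while __odd holds a1*b1
		  // (mod 256) in its high byte -- the & 0xff00 mask on __y
		  // keeps a1*b0 out of the low byte.  Blending the odd bytes
		  // of __odd over __even therefore yields the element-wise
		  // product.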
1410 | if constexpr (__have_avx512bw && sizeof(_V) > 2) |
1411 | return _CommonImplX86::_S_blend_avx512( |
1412 | 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even), |
1413 | __vector_bitcast<_Tp>(__odd)); |
1414 | else if constexpr (__have_sse4_1 && sizeof(_V) > 2) |
1415 | return _CommonImplX86::_S_blend_intrin(__to_intrin( |
1416 | __high_byte), |
1417 | __to_intrin(__even), |
1418 | __to_intrin(__odd)); |
1419 | else |
1420 | return __to_intrin( |
1421 | __or(__andnot(__high_byte, __even), __odd)); |
1422 | } |
1423 | } |
1424 | else |
1425 | return _Base::_S_multiplies(__x, __y); |
1426 | } |
1427 | |
1428 | // }}} |
1429 | // _S_divides {{{ |
1430 | #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 |
1431 | template <typename _Tp, size_t _Np> |
1432 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1433 | _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1434 | { |
1435 | if (!__builtin_is_constant_evaluated() |
1436 | && !__builtin_constant_p(__y._M_data)) |
1437 | if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4) |
	      { // use divp[sd] - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
1439 | // Note that using floating-point division is likely to raise the |
1440 | // *Inexact* exception flag and thus appears like an invalid |
1441 | // "as-if" transformation. However, C++ doesn't specify how the |
1442 | // fpenv can be observed and points to C. C says that function |
1443 | // calls are assumed to potentially raise fp exceptions, unless |
1444 | // documented otherwise. Consequently, operator/, which is a |
1445 | // function call, may raise fp exceptions. |
1446 | /*const struct _CsrGuard |
1447 | { |
1448 | const unsigned _M_data = _mm_getcsr(); |
1449 | _CsrGuard() |
1450 | { |
1451 | _mm_setcsr(0x9f80); // turn off FP exceptions and |
1452 | flush-to-zero |
1453 | } |
1454 | ~_CsrGuard() { _mm_setcsr(_M_data); } |
1455 | } __csr;*/ |
1456 | using _Float = conditional_t<sizeof(_Tp) == 4, double, float>; |
1457 | constexpr size_t __n_intermediate |
1458 | = std::min(_Np, (__have_avx512f ? 64 |
1459 | : __have_avx ? 32 |
1460 | : 16) |
1461 | / sizeof(_Float)); |
1462 | using _FloatV = __vector_type_t<_Float, __n_intermediate>; |
1463 | constexpr size_t __n_floatv |
1464 | = __div_roundup(_Np, __n_intermediate); |
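	    // Example of the sizing logic: for _Tp = int (4 bytes) the
	    // intermediate type is double; with AVX (32-byte vectors) this
	    // gives __n_intermediate = min(_Np, 32 / 8) = 4, so an 8-element
	    // int division is carried out as __n_floatv = 2 double divisions
	    // and converted back afterwards.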
1465 | using _R = __vector_type_t<_Tp, _Np>; |
1466 | const auto __xf = __convert_all<_FloatV, __n_floatv>(__x); |
1467 | const auto __yf = __convert_all<_FloatV, __n_floatv>( |
1468 | _Abi::__make_padding_nonzero(__as_vector(__y))); |
1469 | return __call_with_n_evaluations<__n_floatv>( |
1470 | [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1471 | return __vector_convert<_R>(__quotients...); |
1472 | }, |
1473 | [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA |
1474 | -> _SimdWrapper<_Float, __n_intermediate> |
1475 | { |
1476 | #if __RECIPROCAL_MATH__ |
1477 | // If -freciprocal-math is active, using the `/` operator is |
1478 | // incorrect because it may be translated to an imprecise |
1479 | // multiplication with reciprocal. We need to use inline |
1480 | // assembly to force a real division. |
1481 | _FloatV __r; |
1482 | if constexpr (__have_avx) // -mno-sse2avx is irrelevant |
1483 | // because once -mavx is given, GCC |
1484 | // emits VEX encoded vdivp[sd] |
1485 | { |
1486 | if constexpr (sizeof(_Tp) == 4) |
1487 | asm("vdivpd\t{%2, %1, %0|%0, %1, %2}" |
1488 | : "=x" (__r) |
1489 | : "x" (__xf[__i]), "x" (__yf[__i])); |
1490 | else |
1491 | asm("vdivps\t{%2, %1, %0|%0, %1, %2}" |
1492 | : "=x" (__r) |
1493 | : "x" (__xf[__i]), "x" (__yf[__i])); |
1494 | } |
1495 | else |
1496 | { |
1497 | __r = __xf[__i]; |
1498 | if constexpr (sizeof(_Tp) == 4) |
1499 | asm("divpd\t{%1, %0|%0, %1}" |
1500 | : "=x" (__r) |
1501 | : "x" (__yf[__i])); |
1502 | else |
1503 | asm("divps\t{%1, %0|%0, %1}" |
1504 | : "=x" (__r) |
1505 | : "x" (__yf[__i])); |
1506 | } |
1507 | return __r; |
1508 | #else |
1509 | return __xf[__i] / __yf[__i]; |
1510 | #endif |
1511 | }); |
1512 | } |
1513 | /* 64-bit int division is potentially optimizable via double division if |
1514 | * the value in __x is small enough and the conversion between |
1515 | * int<->double is efficient enough: |
1516 | else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && |
1517 | sizeof(_Tp) == 8) |
1518 | { |
1519 | if constexpr (__have_sse4_1 && sizeof(__x) == 16) |
1520 | { |
1521 | if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, |
1522 | 0xffe0'0000'0000'0000ull})) |
1523 | { |
1524 | __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) |
1525 | } |
1526 | } |
1527 | } |
1528 | */ |
1529 | return _Base::_S_divides(__x, __y); |
1530 | } |
1531 | #else |
1532 | using _Base::_S_divides; |
1533 | #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 |
1534 | |
1535 | // }}} |
1536 | // _S_modulus {{{ |
1537 | template <typename _Tp, size_t _Np> |
1538 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
1539 | _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
1540 | { |
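      // With the optimized _S_divides above available, x % y is computed as
      // x - y * (x / y), reusing the vectorized division.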
1541 | if (__builtin_is_constant_evaluated() |
1542 | || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) |
1543 | return _Base::_S_modulus(__x, __y); |
1544 | else |
1545 | return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); |
1546 | } |
1547 | |
1548 | // }}} |
1549 | // _S_bit_shift_left {{{ |
1550 | // Notes on UB. C++2a [expr.shift] says: |
1551 | // -1- [...] The operands shall be of integral or unscoped enumeration type |
1552 | // and integral promotions are performed. The type of the result is that |
1553 | // of the promoted left operand. The behavior is undefined if the right |
1554 | // operand is negative, or greater than or equal to the width of the |
1555 | // promoted left operand. |
1556 | // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo |
1557 | // 2^N, where N is the width of the type of the result. |
1558 | // |
1559 | // C++17 [expr.shift] says: |
1560 | // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated |
1561 | // bits are zero-filled. If E1 has an unsigned type, the value of the |
1562 | // result is E1 × 2^E2 , reduced modulo one more than the maximum value |
1563 | // representable in the result type. Otherwise, if E1 has a signed type |
1564 | // and non-negative value, and E1 × 2^E2 is representable in the |
1565 | // corresponding unsigned type of the result type, then that value, |
1566 | // converted to the result type, is the resulting value; otherwise, the |
1567 | // behavior is undefined. |
1568 | // |
1569 | // Consequences: |
1570 | // With C++2a signed and unsigned types have the same UB |
1571 | // characteristics: |
1572 | // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) |
1573 | // |
1574 | // With C++17 there's little room for optimizations because the standard |
1575 | // requires all shifts to happen on promoted integrals (i.e. int). Thus, |
  // short and char shifts must assume that bits shifted out of one element
  // land in the neighboring element and must mask them off accordingly.
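  // Example: for simd<unsigned char> the scalar expression __x[0] << __y is
  // evaluated on int, so only __y >= 32 is UB; on the packed representation,
  // however, bits shifted out of one byte would land in its neighbor, hence
  // the (0xff << __y) style masks used below.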
1578 | #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT |
1579 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
1580 | constexpr inline _GLIBCXX_CONST static typename _TVT::type |
1581 | _S_bit_shift_left(_Tp __xx, int __y) |
1582 | { |
1583 | using _V = typename _TVT::type; |
1584 | using _Up = typename _TVT::value_type; |
1585 | _V __x = __xx; |
1586 | [[maybe_unused]] const auto __ix = __to_intrin(__x); |
1587 | if (__builtin_is_constant_evaluated()) |
1588 | return __x << __y; |
1589 | #if __cplusplus > 201703 |
1590 | // after C++17, signed shifts have no UB, and behave just like unsigned |
1591 | // shifts |
1592 | else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) |
1593 | return __vector_bitcast<_Up>( |
1594 | _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), |
1595 | __y)); |
1596 | #endif |
1597 | else if constexpr (sizeof(_Up) == 1) |
1598 | { |
1599 | // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) |
1600 | if (__builtin_constant_p(__y)) |
1601 | { |
1602 | if (__y == 0) |
1603 | return __x; |
1604 | else if (__y == 1) |
1605 | return __x + __x; |
1606 | else if (__y == 2) |
1607 | { |
1608 | __x = __x + __x; |
1609 | return __x + __x; |
1610 | } |
1611 | else if (__y > 2 && __y < 8) |
1612 | { |
1613 | if constexpr (sizeof(__x) > sizeof(unsigned)) |
1614 | { |
1615 | const _UChar __mask = 0xff << __y; // precomputed vector |
1616 | return __vector_bitcast<_Up>( |
1617 | __vector_bitcast<_UChar>( |
1618 | __vector_bitcast<unsigned>(__x) << __y) |
1619 | & __mask); |
1620 | } |
1621 | else |
1622 | { |
1623 | const unsigned __mask |
1624 | = (0xff & (0xff << __y)) * 0x01010101u; |
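		      // e.g. __y == 3: (0xff & (0xff << 3)) == 0xf8, so the
		      // per-byte mask 0xf8f8'f8f8 clears the bits that the
		      // 32-bit shift moved across byte boundaries.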
1625 | return reinterpret_cast<_V>( |
1626 | static_cast<__int_for_sizeof_t<_V>>( |
1627 | unsigned( |
1628 | reinterpret_cast<__int_for_sizeof_t<_V>>(__x) |
1629 | << __y) |
1630 | & __mask)); |
1631 | } |
1632 | } |
1633 | else if (__y >= 8 && __y < 32) |
1634 | return _V(); |
1635 | else |
1636 | __builtin_unreachable(); |
1637 | } |
1638 | // general strategy in the following: use an sllv instead of sll |
1639 | // instruction, because it's 2 to 4 times faster: |
1640 | else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) |
1641 | return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( |
1642 | _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), |
				_mm256_set1_epi16(__y))));
1644 | else if constexpr (__have_avx512bw && sizeof(__x) == 32) |
1645 | return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( |
1646 | _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), |
				_mm512_set1_epi16(__y))));
1648 | else if constexpr (__have_avx512bw && sizeof(__x) == 64) |
1649 | { |
	      const auto __shift = _mm512_set1_epi16(__y);
1651 | return __vector_bitcast<_Up>( |
1652 | __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( |
1653 | _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), |
1654 | _mm512_cvtepi16_epi8(_mm512_sllv_epi16( |
1655 | _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); |
1656 | } |
1657 | else if constexpr (__have_avx2 && sizeof(__x) == 32) |
1658 | { |
1659 | #if 1 |
	      const auto __shift = _mm_cvtsi32_si128(__y);
	      auto __k
		= _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
	      __k |= _mm256_srli_epi16(__k, 8);
1664 | return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) |
1665 | & __k); |
1666 | #else |
1667 | const _Up __k = 0xff << __y; |
1668 | return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y) |
1669 | & __k; |
1670 | #endif |
1671 | } |
1672 | else |
1673 | { |
	      const auto __shift = _mm_cvtsi32_si128(__y);
	      auto __k
		= _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
	      __k |= _mm_srli_epi16(__k, 8);
1678 | return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); |
1679 | } |
1680 | } |
1681 | return __x << __y; |
1682 | } |
1683 | |
1684 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
1685 | constexpr inline _GLIBCXX_CONST static typename _TVT::type |
1686 | _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) |
1687 | { |
1688 | using _V = typename _TVT::type; |
1689 | using _Up = typename _TVT::value_type; |
1690 | _V __x = __xx; |
1691 | [[maybe_unused]] const auto __ix = __to_intrin(__x); |
1692 | [[maybe_unused]] const auto __iy = __to_intrin(__y); |
1693 | if (__builtin_is_constant_evaluated()) |
1694 | return __x << __y; |
1695 | #if __cplusplus > 201703 |
1696 | // after C++17, signed shifts have no UB, and behave just like unsigned |
1697 | // shifts |
1698 | else if constexpr (is_signed_v<_Up>) |
1699 | return __vector_bitcast<_Up>( |
1700 | _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), |
1701 | __vector_bitcast<make_unsigned_t<_Up>>(__y))); |
1702 | #endif |
1703 | else if constexpr (sizeof(_Up) == 1) |
1704 | { |
1705 | if constexpr (sizeof __ix == 64 && __have_avx512bw) |
1706 | return __vector_bitcast<_Up>(__concat( |
1707 | _mm512_cvtepi16_epi8( |
1708 | _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), |
1709 | _mm512_cvtepu8_epi16(__lo256(__iy)))), |
1710 | _mm512_cvtepi16_epi8( |
1711 | _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), |
1712 | _mm512_cvtepu8_epi16(__hi256(__iy)))))); |
1713 | else if constexpr (sizeof __ix == 32 && __have_avx512bw) |
1714 | return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( |
1715 | _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), |
1716 | _mm512_cvtepu8_epi16(__iy)))); |
1717 | else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) |
1718 | return __intrin_bitcast<_V>( |
1719 | _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), |
1720 | _mm_cvtepu8_epi16(__iy)))); |
1721 | else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) |
1722 | return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( |
1723 | _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), |
1724 | _mm256_cvtepu8_epi16(__iy)))); |
1725 | else if constexpr (sizeof __ix == 16 && __have_avx512bw) |
1726 | return __intrin_bitcast<_V>( |
1727 | __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( |
1728 | _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), |
1729 | _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); |
1730 | else if constexpr (__have_sse4_1 && sizeof(__x) == 16) |
1731 | { |
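	      // Approach: __y << 5 moves bit 2 of each shift count into the
	      // sign bit of its byte, so _mm_blendv_epi8 can conditionally
	      // apply a shift by 4, then 2, then 1; __mask += __mask exposes
	      // the next lower count bit before each following blend.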
1732 | auto __mask |
1733 | = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5); |
1734 | auto __x4 |
1735 | = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); |
1736 | __x4 &= char(0xf0); |
1737 | __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( |
1738 | __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); |
1739 | __mask += __mask; |
1740 | auto __x2 |
1741 | = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); |
1742 | __x2 &= char(0xfc); |
1743 | __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( |
1744 | __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); |
1745 | __mask += __mask; |
1746 | auto __x1 = __x + __x; |
1747 | __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( |
1748 | __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); |
1749 | return __x |
1750 | & ((__y & char(0xf8)) == 0); // y > 7 nulls the result |
1751 | } |
1752 | else if constexpr (sizeof(__x) == 16) |
1753 | { |
1754 | auto __mask |
1755 | = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5); |
1756 | auto __x4 |
1757 | = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); |
1758 | __x4 &= char(0xf0); |
1759 | __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; |
1760 | __mask += __mask; |
1761 | auto __x2 |
1762 | = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); |
1763 | __x2 &= char(0xfc); |
1764 | __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; |
1765 | __mask += __mask; |
1766 | auto __x1 = __x + __x; |
1767 | __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; |
1768 | return __x |
1769 | & ((__y & char(0xf8)) == 0); // y > 7 nulls the result |
1770 | } |
1771 | else |
1772 | return __x << __y; |
1773 | } |
1774 | else if constexpr (sizeof(_Up) == 2) |
1775 | { |
1776 | if constexpr (sizeof __ix == 64 && __have_avx512bw) |
1777 | return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); |
1778 | else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) |
1779 | return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); |
1780 | else if constexpr (sizeof __ix == 32 && __have_avx512bw) |
1781 | return __vector_bitcast<_Up>( |
1782 | __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), |
1783 | _mm512_castsi256_si512(__iy)))); |
1784 | else if constexpr (sizeof __ix == 32 && __have_avx2) |
1785 | { |
1786 | const auto __ux = __vector_bitcast<unsigned>(__x); |
1787 | const auto __uy = __vector_bitcast<unsigned>(__y); |
1788 | return __vector_bitcast<_Up>(_mm256_blend_epi16( |
1789 | __auto_bitcast(__ux << (__uy & 0x0000ffffu)), |
1790 | __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); |
1791 | } |
1792 | else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) |
1793 | return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); |
1794 | else if constexpr (sizeof __ix == 16 && __have_avx512bw) |
1795 | return __intrin_bitcast<_V>( |
1796 | __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), |
1797 | _mm512_castsi128_si512(__iy)))); |
1798 | else if constexpr (sizeof __ix == 16 && __have_avx2) |
1799 | { |
1800 | const auto __ux = __vector_bitcast<unsigned>(__ix); |
1801 | const auto __uy = __vector_bitcast<unsigned>(__iy); |
1802 | return __intrin_bitcast<_V>(_mm_blend_epi16( |
1803 | __auto_bitcast(__ux << (__uy & 0x0000ffffu)), |
1804 | __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); |
1805 | } |
1806 | else if constexpr (sizeof __ix == 16) |
1807 | { |
1808 | using _Float4 = __vector_type_t<float, 4>; |
1809 | using _Int4 = __vector_type_t<int, 4>; |
1810 | using _UInt4 = __vector_type_t<unsigned, 4>; |
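	      // The multiplier 2^__y is built through the float exponent
	      // field: 0x3f8 >> 3 == 127 is the exponent bias, so
	      // (__y + 127) << 23 is the bit pattern of the float 2^__y;
	      // converting that float back to int and multiplying __x by it
	      // performs the left shift for both 16-bit halves.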
1811 | const _UInt4 __yu |
1812 | = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3))); |
1813 | return __x |
1814 | * __intrin_bitcast<_V>( |
		       __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1816 | reinterpret_cast<_Float4>(__yu << 23))) |
		       | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1818 | reinterpret_cast<_Float4>((__yu >> 16) << 23))) |
1819 | << 16)); |
1820 | } |
1821 | else |
1822 | __assert_unreachable<_Tp>(); |
1823 | } |
1824 | else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 |
1825 | && !__have_avx2) |
1826 | // latency is suboptimal, but throughput is at full speedup |
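	  // Same exponent trick as for the 16-bit elements above:
	  // (__y << 23) + 0x3f80'0000 is the bit pattern of the float 2^__y,
	  // which is converted to int and used as a multiplier.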
1827 | return __intrin_bitcast<_V>( |
1828 | __vector_bitcast<unsigned>(__ix) |
1829 | * __vector_convert<__vector_type16_t<int>>( |
	      _SimdWrapper<float, 4>(__vector_bitcast<float>(
1831 | (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000)))); |
1832 | else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 |
1833 | && !__have_avx2) |
1834 | { |
1835 | const auto __lo = _mm_sll_epi64(__ix, __iy); |
1836 | const auto __hi |
1837 | = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); |
1838 | if constexpr (__have_sse4_1) |
1839 | return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); |
1840 | else |
1841 | return __vector_bitcast<_Up>( |
1842 | _mm_move_sd(__vector_bitcast<double>(__hi), |
1843 | __vector_bitcast<double>(__lo))); |
1844 | } |
1845 | else |
1846 | return __x << __y; |
1847 | } |
1848 | #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT |
1849 | |
1850 | // }}} |
1851 | // _S_bit_shift_right {{{ |
1852 | #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT |
1853 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
1854 | constexpr inline _GLIBCXX_CONST static typename _TVT::type |
1855 | _S_bit_shift_right(_Tp __xx, int __y) |
1856 | { |
1857 | using _V = typename _TVT::type; |
1858 | using _Up = typename _TVT::value_type; |
1859 | _V __x = __xx; |
1860 | [[maybe_unused]] const auto __ix = __to_intrin(__x); |
1861 | if (__builtin_is_constant_evaluated()) |
1862 | return __x >> __y; |
      else if (__builtin_constant_p(__y) && is_unsigned_v<_Up>
	       && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1866 | return _V(); |
1867 | else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ |
1868 | return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) |
1869 | & _Up(0xff >> __y); |
1870 | //}}} |
1871 | else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ |
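      // Each 16-bit lane holds two elements: the high byte is shifted
      // arithmetically by (__y + 8) and moved back up, while the low byte is
      // first moved into the high half (so the arithmetic shift sees its
      // sign bit), shifted by __y, and moved back down.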
1872 | return __intrin_bitcast<_V>( |
1873 | (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix) |
1874 | >> (__y + 8)) |
1875 | << 8) |
1876 | | (__vector_bitcast<_UShort>( |
1877 | __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8) |
1878 | >> __y) |
1879 | >> 8)); |
1880 | //}}} |
1881 | // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected |
1882 | else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ |
1883 | { |
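	  // SSE/AVX2 have no 64-bit arithmetic right shift, so the result is
	  // assembled from a 64-bit logical shift (the value bits) and a
	  // 32-bit arithmetic shift of the sign-carrying half (the sign
	  // extension).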
1884 | if (__y > 32) |
1885 | return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32) |
1886 | & _Up(0xffff'ffff'0000'0000ull)) |
1887 | | __vector_bitcast<_Up>( |
1888 | __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix) |
1889 | >> 32) |
1890 | >> (__y - 32)); |
1891 | else |
1892 | return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) |
1893 | >> __y) |
1894 | | __vector_bitcast<_Up>( |
1895 | __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll) |
1896 | >> __y); |
1897 | } |
1898 | //}}} |
1899 | else |
1900 | return __x >> __y; |
1901 | } |
1902 | |
1903 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
1904 | constexpr inline _GLIBCXX_CONST static typename _TVT::type |
1905 | _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) |
1906 | { |
1907 | using _V = typename _TVT::type; |
1908 | using _Up = typename _TVT::value_type; |
1909 | _V __x = __xx; |
1910 | [[maybe_unused]] const auto __ix = __to_intrin(__x); |
1911 | [[maybe_unused]] const auto __iy = __to_intrin(__y); |
1912 | if (__builtin_is_constant_evaluated() |
1913 | || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) |
1914 | return __x >> __y; |
1915 | else if constexpr (sizeof(_Up) == 1) //{{{ |
1916 | { |
1917 | if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) |
1918 | return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( |
1919 | is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), |
1920 | _mm_cvtepi8_epi16(__iy)) |
1921 | : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), |
1922 | _mm_cvtepu8_epi16(__iy)))); |
	    else if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1924 | return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( |
1925 | is_signed_v<_Up> |
1926 | ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), |
1927 | _mm256_cvtepi8_epi16(__iy)) |
1928 | : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), |
1929 | _mm256_cvtepu8_epi16(__iy)))); |
1930 | else if constexpr (sizeof(__x) == 32 && __have_avx512bw) |
1931 | return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( |
1932 | is_signed_v<_Up> |
1933 | ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), |
1934 | _mm512_cvtepi8_epi16(__iy)) |
1935 | : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), |
1936 | _mm512_cvtepu8_epi16(__iy)))); |
1937 | else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) |
1938 | return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( |
1939 | _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), |
1940 | 0x5555'5555'5555'5555ull, |
1941 | _mm512_srav_epi16( |
1942 | _mm512_slli_epi16(__ix, 8), |
1943 | _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, |
					  _mm512_set1_epi16(8)))));
1945 | else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) |
1946 | return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( |
1947 | _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), |
1948 | 0x5555'5555'5555'5555ull, |
1949 | _mm512_srlv_epi16( |
1950 | _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), |
1951 | _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); |
1952 | /* This has better throughput but higher latency than the impl below |
1953 | else if constexpr (__have_avx2 && sizeof(__x) == 16 && |
1954 | is_unsigned_v<_Up>) |
1955 | { |
1956 | const auto __shorts = __to_intrin(_S_bit_shift_right( |
1957 | __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), |
1958 | __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); |
1959 | return __vector_bitcast<_Up>( |
1960 | _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); |
1961 | } |
1962 | */ |
1963 | else if constexpr (__have_avx2 && sizeof(__x) > 8) |
1964 | // the following uses vpsr[al]vd, which requires AVX2 |
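	      // Each byte of a 32-bit lane is shifted with its own 32-bit
	      // variable shift: the byte is first moved to the top of the
	      // lane, its shift count is isolated via a matching shift of
	      // __y (>> 24 at the end), and the four per-byte results are
	      // recombined into their original positions.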
1965 | if constexpr (is_signed_v<_Up>) |
1966 | { |
1967 | const auto r3 = __vector_bitcast<_UInt>( |
1968 | (__vector_bitcast<int>(__x) |
1969 | >> (__vector_bitcast<_UInt>(__y) >> 24))) |
1970 | & 0xff000000u; |
1971 | const auto r2 |
1972 | = __vector_bitcast<_UInt>( |
1973 | ((__vector_bitcast<int>(__x) << 8) |
1974 | >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) |
1975 | & 0xff000000u; |
1976 | const auto r1 |
1977 | = __vector_bitcast<_UInt>( |
1978 | ((__vector_bitcast<int>(__x) << 16) |
1979 | >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) |
1980 | & 0xff000000u; |
1981 | const auto r0 = __vector_bitcast<_UInt>( |
1982 | (__vector_bitcast<int>(__x) << 24) |
1983 | >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); |
1984 | return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) |
1985 | | (r0 >> 24)); |
1986 | } |
1987 | else |
1988 | { |
1989 | const auto r3 = (__vector_bitcast<_UInt>(__x) |
1990 | >> (__vector_bitcast<_UInt>(__y) >> 24)) |
1991 | & 0xff000000u; |
1992 | const auto r2 |
1993 | = ((__vector_bitcast<_UInt>(__x) << 8) |
1994 | >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) |
1995 | & 0xff000000u; |
1996 | const auto r1 |
1997 | = ((__vector_bitcast<_UInt>(__x) << 16) |
1998 | >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) |
1999 | & 0xff000000u; |
2000 | const auto r0 |
2001 | = (__vector_bitcast<_UInt>(__x) << 24) |
2002 | >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); |
2003 | return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) |
2004 | | (r0 >> 24)); |
2005 | } |
2006 | else if constexpr (__have_sse4_1 |
2007 | && is_unsigned_v<_Up> && sizeof(__x) > 2) |
2008 | { |
2009 | auto __x128 = __vector_bitcast<_Up>(__ix); |
2010 | auto __mask |
2011 | = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); |
2012 | auto __x4 = __vector_bitcast<_Up>( |
2013 | (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); |
2014 | __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( |
2015 | __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); |
2016 | __mask += __mask; |
2017 | auto __x2 = __vector_bitcast<_Up>( |
2018 | (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); |
2019 | __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( |
2020 | __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); |
2021 | __mask += __mask; |
2022 | auto __x1 = __vector_bitcast<_Up>( |
2023 | (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); |
2024 | __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( |
2025 | __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); |
2026 | return __intrin_bitcast<_V>( |
2027 | __x128 |
2028 | & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) |
2029 | == 0)); // y > 7 nulls the result |
2030 | } |
2031 | else if constexpr (__have_sse4_1 |
2032 | && is_signed_v<_Up> && sizeof(__x) > 2) |
2033 | { |
2034 | auto __mask = __vector_bitcast<_UChar>( |
2035 | __vector_bitcast<_UShort>(__iy) << 5); |
2036 | auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2037 | return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); |
2038 | }; |
2039 | auto __xh = __vector_bitcast<short>(__ix); |
2040 | auto __xl = __vector_bitcast<short>(__ix) << 8; |
2041 | auto __xh4 = __xh >> 4; |
2042 | auto __xl4 = __xl >> 4; |
2043 | __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( |
2044 | __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); |
2045 | __xl = __vector_bitcast<short>( |
2046 | _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), |
2047 | __to_intrin(__xl4))); |
2048 | __mask += __mask; |
2049 | auto __xh2 = __xh >> 2; |
2050 | auto __xl2 = __xl >> 2; |
2051 | __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( |
2052 | __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); |
2053 | __xl = __vector_bitcast<short>( |
2054 | _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), |
2055 | __to_intrin(__xl2))); |
2056 | __mask += __mask; |
2057 | auto __xh1 = __xh >> 1; |
2058 | auto __xl1 = __xl >> 1; |
2059 | __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( |
2060 | __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); |
2061 | __xl = __vector_bitcast<short>( |
2062 | _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), |
2063 | __to_intrin(__xl1))); |
2064 | return __intrin_bitcast<_V>( |
2065 | (__vector_bitcast<_Up>((__xh & short(0xff00))) |
2066 | | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) |
2067 | >> 8)) |
2068 | & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) |
2069 | == 0)); // y > 7 nulls the result |
2070 | } |
2071 | else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 |
2072 | { |
2073 | auto __mask |
2074 | = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); |
2075 | auto __x4 = __vector_bitcast<_Up>( |
2076 | (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); |
2077 | __x = __mask > 0x7f ? __x4 : __x; |
2078 | __mask += __mask; |
2079 | auto __x2 = __vector_bitcast<_Up>( |
2080 | (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); |
2081 | __x = __mask > 0x7f ? __x2 : __x; |
2082 | __mask += __mask; |
2083 | auto __x1 = __vector_bitcast<_Up>( |
2084 | (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); |
2085 | __x = __mask > 0x7f ? __x1 : __x; |
2086 | return __x |
2087 | & ((__y & char(0xf8)) == 0); // y > 7 nulls the result |
2088 | } |
2089 | else if constexpr (sizeof(__x) > 2) // signed SSE2 |
2090 | { |
2091 | static_assert(is_signed_v<_Up>); |
2092 | auto __maskh = __vector_bitcast<_UShort>(__y) << 5; |
2093 | auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); |
2094 | auto __xh = __vector_bitcast<short>(__x); |
2095 | auto __xl = __vector_bitcast<short>(__x) << 8; |
2096 | auto __xh4 = __xh >> 4; |
2097 | auto __xl4 = __xl >> 4; |
2098 | __xh = __maskh > 0x7fff ? __xh4 : __xh; |
2099 | __xl = __maskl > 0x7fff ? __xl4 : __xl; |
2100 | __maskh += __maskh; |
2101 | __maskl += __maskl; |
2102 | auto __xh2 = __xh >> 2; |
2103 | auto __xl2 = __xl >> 2; |
2104 | __xh = __maskh > 0x7fff ? __xh2 : __xh; |
2105 | __xl = __maskl > 0x7fff ? __xl2 : __xl; |
2106 | __maskh += __maskh; |
2107 | __maskl += __maskl; |
2108 | auto __xh1 = __xh >> 1; |
2109 | auto __xl1 = __xl >> 1; |
2110 | __xh = __maskh > 0x7fff ? __xh1 : __xh; |
2111 | __xl = __maskl > 0x7fff ? __xl1 : __xl; |
2112 | __x = __vector_bitcast<_Up>((__xh & short(0xff00))) |
2113 | | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) |
2114 | >> 8); |
2115 | return __x |
2116 | & ((__y & char(0xf8)) == 0); // y > 7 nulls the result |
2117 | } |
2118 | else |
2119 | return __x >> __y; |
2120 | } //}}} |
2121 | else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ |
2122 | { |
2123 | [[maybe_unused]] auto __blend_0xaa |
2124 | = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2125 | if constexpr (sizeof(__a) == 16) |
2126 | return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), |
2127 | 0xaa); |
2128 | else if constexpr (sizeof(__a) == 32) |
2129 | return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), |
2130 | 0xaa); |
2131 | else if constexpr (sizeof(__a) == 64) |
2132 | return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), |
2133 | __to_intrin(__b)); |
2134 | else |
2135 | __assert_unreachable<decltype(__a)>(); |
2136 | }; |
2137 | if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) |
2138 | return __intrin_bitcast<_V>(is_signed_v<_Up> |
2139 | ? _mm_srav_epi16(__ix, __iy) |
2140 | : _mm_srlv_epi16(__ix, __iy)); |
2141 | else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) |
2142 | return __vector_bitcast<_Up>(is_signed_v<_Up> |
2143 | ? _mm256_srav_epi16(__ix, __iy) |
2144 | : _mm256_srlv_epi16(__ix, __iy)); |
2145 | else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) |
2146 | return __vector_bitcast<_Up>(is_signed_v<_Up> |
2147 | ? _mm512_srav_epi16(__ix, __iy) |
2148 | : _mm512_srlv_epi16(__ix, __iy)); |
2149 | else if constexpr (__have_avx2 && is_signed_v<_Up>) |
2150 | return __intrin_bitcast<_V>( |
2151 | __blend_0xaa(((__vector_bitcast<int>(__ix) << 16) |
2152 | >> (__vector_bitcast<int>(__iy) & 0xffffu)) |
2153 | >> 16, |
2154 | __vector_bitcast<int>(__ix) |
2155 | >> (__vector_bitcast<int>(__iy) >> 16))); |
2156 | else if constexpr (__have_avx2 && is_unsigned_v<_Up>) |
2157 | return __intrin_bitcast<_V>( |
2158 | __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) |
2159 | >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), |
2160 | __vector_bitcast<_UInt>(__ix) |
2161 | >> (__vector_bitcast<_UInt>(__iy) >> 16))); |
2162 | else if constexpr (__have_sse4_1) |
2163 | { |
2164 | auto __mask = __vector_bitcast<_UShort>(__iy); |
2165 | auto __x128 = __vector_bitcast<_Up>(__ix); |
2166 | //__mask *= 0x0808; |
2167 | __mask = (__mask << 3) | (__mask << 11); |
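	      // Equivalent to __mask *= 0x0808: bit 4 of each 16-bit shift
	      // count ends up in the sign bit of both bytes of its lane,
	      // where _mm_blendv_epi8 tests it; __mask += __mask then exposes
	      // bits 3, 2, 1 and 0 in turn.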
2168 | // do __x128 = 0 where __y[4] is set |
2169 | __x128 = __vector_bitcast<_Up>( |
2170 | _mm_blendv_epi8(__to_intrin(__x128), __m128i(), |
2171 | __to_intrin(__mask))); |
2172 | // do __x128 =>> 8 where __y[3] is set |
2173 | __x128 = __vector_bitcast<_Up>( |
2174 | _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), |
2175 | __to_intrin(__mask += __mask))); |
2176 | // do __x128 =>> 4 where __y[2] is set |
2177 | __x128 = __vector_bitcast<_Up>( |
2178 | _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), |
2179 | __to_intrin(__mask += __mask))); |
2180 | // do __x128 =>> 2 where __y[1] is set |
2181 | __x128 = __vector_bitcast<_Up>( |
2182 | _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), |
2183 | __to_intrin(__mask += __mask))); |
2184 | // do __x128 =>> 1 where __y[0] is set |
2185 | return __intrin_bitcast<_V>( |
2186 | _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), |
2187 | __to_intrin(__mask + __mask))); |
2188 | } |
2189 | else |
2190 | { |
2191 | auto __k = __vector_bitcast<_UShort>(__iy) << 11; |
2192 | auto __x128 = __vector_bitcast<_Up>(__ix); |
2193 | auto __mask |
2194 | = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
		  return __vector_bitcast<short>(__kk) < 0;
2196 | }; |
2197 | // do __x128 = 0 where __y[4] is set |
2198 | __x128 = __mask(__k) ? decltype(__x128)() : __x128; |
2199 | // do __x128 =>> 8 where __y[3] is set |
2200 | __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; |
2201 | // do __x128 =>> 4 where __y[2] is set |
2202 | __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128; |
2203 | // do __x128 =>> 2 where __y[1] is set |
2204 | __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; |
2205 | // do __x128 =>> 1 where __y[0] is set |
2206 | return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 |
2207 | : __x128); |
2208 | } |
2209 | } //}}} |
2210 | else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ |
2211 | { |
2212 | if constexpr (is_unsigned_v<_Up>) |
2213 | { |
2214 | // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 |
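	      // 0x4f00'0000 is the bit pattern of the float 2^31; subtracting
	      // (__y << 23) decrements the exponent by __y, yielding
	      // 2^(31-__y), which is converted to the integer multiplier used
	      // with _mm_mul_epu32 below.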
2215 | const __m128 __factor_f = reinterpret_cast<__m128>( |
2216 | 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23)); |
2217 | const __m128i __factor |
2218 | = __builtin_constant_p(__factor_f) |
2219 | ? __to_intrin( |
			__make_vector<unsigned>(__factor_f[0], __factor_f[1],
						__factor_f[2], __factor_f[3]))
		    : _mm_cvttps_epi32(__factor_f);
2223 | const auto __r02 |
2224 | = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); |
2225 | const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), |
2226 | _mm_srli_si128(__factor, 4)); |
2227 | if constexpr (__have_sse4_1) |
2228 | return __intrin_bitcast<_V>( |
2229 | _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); |
2230 | else |
2231 | return __intrin_bitcast<_V>( |
2232 | __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); |
2233 | } |
2234 | else |
2235 | { |
2236 | auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
2237 | if constexpr (is_signed_v<_Up>) |
2238 | return _mm_sra_epi32(__a, __b); |
2239 | else |
2240 | return _mm_srl_epi32(__a, __b); |
2241 | }; |
2242 | const auto __r0 |
2243 | = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); |
2244 | const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); |
2245 | const auto __r2 |
2246 | = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); |
2247 | const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); |
2248 | if constexpr (__have_sse4_1) |
2249 | return __intrin_bitcast<_V>( |
2250 | _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), |
2251 | _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); |
2252 | else |
2253 | return __intrin_bitcast<_V>(_mm_unpacklo_epi64( |
2254 | _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), |
2255 | _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); |
2256 | } |
2257 | } //}}} |
2258 | else |
2259 | return __x >> __y; |
2260 | } |
2261 | #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT |
2262 | |
2263 | // }}} |
2264 | // compares {{{ |
2265 | // _S_equal_to {{{ |
2266 | template <typename _Tp, size_t _Np> |
2267 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2268 | _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
2269 | { |
2270 | if constexpr (__is_avx512_abi<_Abi>()) // {{{ |
2271 | { |
2272 | if (__builtin_is_constant_evaluated() |
2273 | || (__x._M_is_constprop() && __y._M_is_constprop())) |
2274 | return _MaskImpl::_S_to_bits( |
2275 | __as_wrapper<_Np>(__x._M_data == __y._M_data)); |
2276 | |
2277 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
2278 | [[maybe_unused]] const auto __xi = __to_intrin(__x); |
2279 | [[maybe_unused]] const auto __yi = __to_intrin(__y); |
2280 | if constexpr (is_floating_point_v<_Tp>) |
2281 | { |
2282 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
2283 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); |
2284 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
2285 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); |
2286 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
2287 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); |
2288 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
2289 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); |
2290 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
2291 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); |
2292 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
2293 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); |
2294 | else |
2295 | __assert_unreachable<_Tp>(); |
2296 | } |
2297 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
2298 | return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); |
2299 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
2300 | return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); |
2301 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) |
2302 | return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); |
2303 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) |
2304 | return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); |
2305 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
2306 | return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); |
2307 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
2308 | return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); |
2309 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) |
2310 | return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); |
2311 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) |
2312 | return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); |
2313 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
2314 | return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); |
2315 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
2316 | return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); |
2317 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) |
2318 | return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); |
2319 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) |
2320 | return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); |
2321 | else |
2322 | __assert_unreachable<_Tp>(); |
2323 | } // }}} |
2324 | else if (__builtin_is_constant_evaluated()) |
2325 | return _Base::_S_equal_to(__x, __y); |
2326 | else if constexpr (sizeof(__x) == 8) |
2327 | { |
2328 | const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) |
2329 | == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); |
2330 | _MaskMember<_Tp> __r64{}; |
2331 | __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); |
2332 | return __r64; |
2333 | } |
2334 | else |
2335 | return _Base::_S_equal_to(__x, __y); |
2336 | } |
2337 | |
2338 | // }}} |
2339 | // _S_not_equal_to {{{ |
2340 | template <typename _Tp, size_t _Np> |
2341 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2342 | _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
2343 | { |
2344 | if constexpr (__is_avx512_abi<_Abi>()) // {{{ |
2345 | { |
2346 | if (__builtin_is_constant_evaluated() |
2347 | || (__x._M_is_constprop() && __y._M_is_constprop())) |
2348 | return _MaskImpl::_S_to_bits( |
2349 | __as_wrapper<_Np>(__x._M_data != __y._M_data)); |
2350 | |
2351 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
2352 | [[maybe_unused]] const auto __xi = __to_intrin(__x); |
2353 | [[maybe_unused]] const auto __yi = __to_intrin(__y); |
2354 | if constexpr (is_floating_point_v<_Tp>) |
2355 | { |
2356 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
2357 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); |
2358 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
2359 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); |
2360 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
2361 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); |
2362 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
2363 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); |
2364 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
2365 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); |
2366 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
2367 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); |
2368 | else |
2369 | __assert_unreachable<_Tp>(); |
2370 | } |
2371 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
2372 | return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); |
2373 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
2374 | return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); |
2375 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) |
2376 | return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); |
2377 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) |
2378 | return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); |
2379 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
2380 | return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); |
2381 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
2382 | return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); |
2383 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) |
2384 | return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); |
2385 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) |
2386 | return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); |
2387 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
2388 | return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); |
2389 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
2390 | return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); |
2391 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) |
2392 | return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); |
2393 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) |
2394 | return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); |
2395 | else |
2396 | __assert_unreachable<_Tp>(); |
2397 | } // }}} |
2398 | else if (__builtin_is_constant_evaluated()) |
2399 | return _Base::_S_not_equal_to(__x, __y); |
2400 | else if constexpr (sizeof(__x) == 8) |
2401 | { |
2402 | const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) |
2403 | != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); |
2404 | _MaskMember<_Tp> __r64{}; |
2405 | __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); |
2406 | return __r64; |
2407 | } |
2408 | else |
2409 | return _Base::_S_not_equal_to(__x, __y); |
2410 | } |
2411 | |
2412 | // }}} |
2413 | // _S_less {{{ |
2414 | template <typename _Tp, size_t _Np> |
2415 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2416 | _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
2417 | { |
2418 | if constexpr (__is_avx512_abi<_Abi>()) // {{{ |
2419 | { |
2420 | if (__builtin_is_constant_evaluated() |
2421 | || (__x._M_is_constprop() && __y._M_is_constprop())) |
2422 | return _MaskImpl::_S_to_bits( |
2423 | __as_wrapper<_Np>(__x._M_data < __y._M_data)); |
2424 | |
2425 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
2426 | [[maybe_unused]] const auto __xi = __to_intrin(__x); |
2427 | [[maybe_unused]] const auto __yi = __to_intrin(__y); |
2428 | if constexpr (sizeof(__xi) == 64) |
2429 | { |
2430 | if constexpr (is_same_v<_Tp, float>) |
2431 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); |
2432 | else if constexpr (is_same_v<_Tp, double>) |
2433 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); |
2434 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) |
2435 | return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); |
2436 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) |
2437 | return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); |
2438 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) |
2439 | return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); |
2440 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) |
2441 | return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); |
2442 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) |
2443 | return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); |
2444 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) |
2445 | return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); |
2446 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) |
2447 | return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); |
2448 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) |
2449 | return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); |
2450 | else |
2451 | __assert_unreachable<_Tp>(); |
2452 | } |
2453 | else if constexpr (sizeof(__xi) == 32) |
2454 | { |
2455 | if constexpr (is_same_v<_Tp, float>) |
2456 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); |
2457 | else if constexpr (is_same_v<_Tp, double>) |
2458 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); |
2459 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) |
2460 | return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); |
2461 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) |
2462 | return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); |
2463 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) |
2464 | return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); |
2465 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) |
2466 | return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); |
2467 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) |
2468 | return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); |
2469 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) |
2470 | return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); |
2471 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) |
2472 | return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); |
2473 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) |
2474 | return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); |
2475 | else |
2476 | __assert_unreachable<_Tp>(); |
2477 | } |
2478 | else if constexpr (sizeof(__xi) == 16) |
2479 | { |
2480 | if constexpr (is_same_v<_Tp, float>) |
2481 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); |
2482 | else if constexpr (is_same_v<_Tp, double>) |
2483 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); |
2484 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) |
2485 | return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); |
2486 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) |
2487 | return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); |
2488 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) |
2489 | return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); |
2490 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) |
2491 | return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); |
2492 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) |
2493 | return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); |
2494 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) |
2495 | return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); |
2496 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) |
2497 | return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); |
2498 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) |
2499 | return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); |
2500 | else |
2501 | __assert_unreachable<_Tp>(); |
2502 | } |
2503 | else |
2504 | __assert_unreachable<_Tp>(); |
2505 | } // }}} |
2506 | else if (__builtin_is_constant_evaluated()) |
2507 | return _Base::_S_less(__x, __y); |
2508 | else if constexpr (sizeof(__x) == 8) |
2509 | { |
2510 | const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) |
2511 | < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); |
2512 | _MaskMember<_Tp> __r64{}; |
2513 | __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); |
2514 | return __r64; |
2515 | } |
2516 | else |
2517 | return _Base::_S_less(__x, __y); |
2518 | } |
2519 | |
2520 | // }}} |
2521 | // _S_less_equal {{{ |
2522 | template <typename _Tp, size_t _Np> |
2523 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2524 | _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
2525 | { |
2526 | if constexpr (__is_avx512_abi<_Abi>()) // {{{ |
2527 | { |
2528 | if (__builtin_is_constant_evaluated() |
2529 | || (__x._M_is_constprop() && __y._M_is_constprop())) |
2530 | return _MaskImpl::_S_to_bits( |
2531 | __as_wrapper<_Np>(__x._M_data <= __y._M_data)); |
2532 | |
2533 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
2534 | [[maybe_unused]] const auto __xi = __to_intrin(__x); |
2535 | [[maybe_unused]] const auto __yi = __to_intrin(__y); |
2536 | if constexpr (sizeof(__xi) == 64) |
2537 | { |
2538 | if constexpr (is_same_v<_Tp, float>) |
2539 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); |
2540 | else if constexpr (is_same_v<_Tp, double>) |
2541 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); |
2542 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) |
2543 | return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); |
2544 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) |
2545 | return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); |
2546 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) |
2547 | return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); |
2548 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) |
2549 | return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); |
2550 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) |
2551 | return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); |
2552 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) |
2553 | return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); |
2554 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) |
2555 | return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); |
2556 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) |
2557 | return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); |
2558 | else |
2559 | __assert_unreachable<_Tp>(); |
2560 | } |
2561 | else if constexpr (sizeof(__xi) == 32) |
2562 | { |
2563 | if constexpr (is_same_v<_Tp, float>) |
2564 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); |
2565 | else if constexpr (is_same_v<_Tp, double>) |
2566 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); |
2567 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) |
2568 | return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); |
2569 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) |
2570 | return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); |
2571 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) |
2572 | return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); |
2573 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) |
2574 | return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); |
2575 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) |
2576 | return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); |
2577 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) |
2578 | return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); |
2579 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) |
2580 | return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); |
2581 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) |
2582 | return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); |
2583 | else |
2584 | __assert_unreachable<_Tp>(); |
2585 | } |
2586 | else if constexpr (sizeof(__xi) == 16) |
2587 | { |
2588 | if constexpr (is_same_v<_Tp, float>) |
2589 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); |
2590 | else if constexpr (is_same_v<_Tp, double>) |
2591 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); |
2592 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) |
2593 | return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi); |
2594 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) |
2595 | return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); |
2596 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) |
2597 | return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); |
2598 | else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) |
2599 | return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); |
2600 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) |
2601 | return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); |
2602 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) |
2603 | return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); |
2604 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) |
2605 | return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); |
2606 | else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) |
2607 | return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); |
2608 | else |
2609 | __assert_unreachable<_Tp>(); |
2610 | } |
2611 | else |
2612 | __assert_unreachable<_Tp>(); |
2613 | } // }}} |
2614 | else if (__builtin_is_constant_evaluated()) |
2615 | return _Base::_S_less_equal(__x, __y); |
2616 | else if constexpr (sizeof(__x) == 8) |
2617 | { |
2618 | const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) |
2619 | <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); |
2620 | _MaskMember<_Tp> __r64{}; |
2621 | __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); |
2622 | return __r64; |
2623 | } |
2624 | else |
2625 | return _Base::_S_less_equal(__x, __y); |
2626 | } |
2627 | |
2628 | // }}} }}} |
2629 | // negation {{{ |
2630 | template <typename _Tp, size_t _Np> |
2631 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
2632 | _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept |
2633 | { |
2634 | if constexpr (__is_avx512_abi<_Abi>()) |
2635 | return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); |
2636 | else |
2637 | return _Base::_S_negate(__x); |
2638 | } |
2639 | |
2640 | // }}} |
2641 | // math {{{ |
2642 | using _Base::_S_abs; |
2643 | |
2644 | // _S_sqrt {{{ |
2645 | template <typename _Tp, size_t _Np> |
2646 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2647 | _S_sqrt(_SimdWrapper<_Tp, _Np> __x) |
2648 | { |
2649 | if constexpr (__is_sse_ps<_Tp, _Np>()) |
2650 | return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); |
2651 | else if constexpr (__is_sse_pd<_Tp, _Np>()) |
2652 | return _mm_sqrt_pd(__x); |
2653 | else if constexpr (__is_avx_ps<_Tp, _Np>()) |
2654 | return _mm256_sqrt_ps(__x); |
2655 | else if constexpr (__is_avx_pd<_Tp, _Np>()) |
2656 | return _mm256_sqrt_pd(__x); |
2657 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
2658 | return _mm512_sqrt_ps(__x); |
2659 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
2660 | return _mm512_sqrt_pd(__x); |
2661 | else |
2662 | __assert_unreachable<_Tp>(); |
2663 | } |
2664 | |
2665 | // }}} |
2666 | // _S_ldexp {{{ |
2667 | template <typename _Tp, size_t _Np> |
2668 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2669 | _S_ldexp(_SimdWrapper<_Tp, _Np> __x, |
2670 | __fixed_size_storage_t<int, _Np> __exp) |
2671 | { |
2672 | if constexpr (__is_avx512_abi<_Abi>()) |
2673 | { |
2674 | const auto __xi = __to_intrin(__x); |
2675 | constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi> |
2676 | __cvt; |
2677 | const auto __expi = __to_intrin(__cvt(__exp)); |
2678 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
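	  // vscalef computes __x[i] * 2^floor(__exp[i]) once __exp has been
	  // converted to _Tp (done via __cvt above), which matches the ldexp
	  // semantics; the zero-masking form keeps padding elements at zero.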
2679 | if constexpr (sizeof(__xi) == 16) |
2680 | { |
2681 | if constexpr (sizeof(_Tp) == 8) |
2682 | return _mm_maskz_scalef_pd(__k1, __xi, __expi); |
2683 | else |
2684 | return _mm_maskz_scalef_ps(__k1, __xi, __expi); |
2685 | } |
2686 | else if constexpr (sizeof(__xi) == 32) |
2687 | { |
2688 | if constexpr (sizeof(_Tp) == 8) |
2689 | return _mm256_maskz_scalef_pd(__k1, __xi, __expi); |
2690 | else |
2691 | return _mm256_maskz_scalef_ps(__k1, __xi, __expi); |
2692 | } |
2693 | else |
2694 | { |
2695 | static_assert(sizeof(__xi) == 64); |
2696 | if constexpr (sizeof(_Tp) == 8) |
2697 | return _mm512_maskz_scalef_pd(__k1, __xi, __expi); |
2698 | else |
2699 | return _mm512_maskz_scalef_ps(__k1, __xi, __expi); |
2700 | } |
2701 | } |
2702 | else |
2703 | return _Base::_S_ldexp(__x, __exp); |
2704 | } |
2705 | |
2706 | // }}} |
2707 | // _S_trunc {{{ |
2708 | template <typename _Tp, size_t _Np> |
2709 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2710 | _S_trunc(_SimdWrapper<_Tp, _Np> __x) |
2711 | { |
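      // Note on the rounding immediates used below (per the ISA encoding, as a
      // hedged reminder): for vrndscale, imm8 bits [1:0] select the rounding
      // mode (0b11 = toward zero), bit 2 would defer to MXCSR instead, and
      // bit 3 suppresses the inexact exception, so 0x0b truncates without
      // raising FE_INEXACT; for the SSE4.1/AVX round instructions, 0x3 is
      // _MM_FROUND_TO_ZERO.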
2712 | if constexpr (__is_avx512_ps<_Tp, _Np>()) |
2713 | return _mm512_roundscale_ps(__x, 0x0b); |
2714 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
2715 | return _mm512_roundscale_pd(__x, 0x0b); |
2716 | else if constexpr (__is_avx_ps<_Tp, _Np>()) |
2717 | return _mm256_round_ps(__x, 0x3); |
2718 | else if constexpr (__is_avx_pd<_Tp, _Np>()) |
2719 | return _mm256_round_pd(__x, 0x3); |
2720 | else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) |
2721 | return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3)); |
2722 | else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) |
2723 | return _mm_round_pd(__x, 0x3); |
2724 | else if constexpr (__is_sse_ps<_Tp, _Np>()) |
2725 | { |
2726 | auto __truncated |
2727 | = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); |
2728 | const auto __no_fractional_values |
2729 | = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x)) |
2730 | & 0x7f800000u) |
2731 | < 0x4b000000; // the exponent is so large that no mantissa bits |
2732 | // signify fractional values (0x3f8 + 23*8 = |
2733 | // 0x4b0) |
2734 | return __no_fractional_values ? __truncated : __to_intrin(__x); |
2735 | } |
2736 | else |
2737 | return _Base::_S_trunc(__x); |
2738 | } |
2739 | |
2740 | // }}} |
2741 | // _S_round {{{ |
2742 | template <typename _Tp, size_t _Np> |
2743 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2744 | _S_round(_SimdWrapper<_Tp, _Np> __x) |
2745 | { |
2746 | // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away |
2747 | // from zero as required by std::round. Therefore this function is more |
2748 | // complicated. |
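      // A scalar sketch of the approach taken below (hypothetical helper,
      // illustration only):
      //   T __round(T __x)
      //   {
      //     T __t = std::trunc(__x);                 // toward zero
      //     if (std::fabs(__x - __t) >= T(.5))
      //       __t += std::copysign(T(1), __x);       // push away from zero
      //     return __t;
      //   }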
2749 | using _V = __vector_type_t<_Tp, _Np>; |
2750 | _V __truncated; |
2751 | if constexpr (__is_avx512_ps<_Tp, _Np>()) |
2752 | __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); |
2753 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
2754 | __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); |
2755 | else if constexpr (__is_avx_ps<_Tp, _Np>()) |
2756 | __truncated = _mm256_round_ps(__x._M_data, |
2757 | _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); |
2758 | else if constexpr (__is_avx_pd<_Tp, _Np>()) |
2759 | __truncated = _mm256_round_pd(__x._M_data, |
2760 | _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); |
2761 | else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) |
2762 | __truncated = __auto_bitcast( |
2763 | _mm_round_ps(__to_intrin(__x), |
2764 | _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); |
2765 | else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) |
2766 | __truncated |
2767 | = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); |
2768 | else if constexpr (__is_sse_ps<_Tp, _Np>()) |
2769 | __truncated = __auto_bitcast( |
2770 | _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); |
2771 | else |
2772 | return _Base::_S_round(__x); |
2773 | |
2774 | // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 |
2775 | // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 |
2776 | |
2777 | const _V __rounded |
2778 | = __truncated |
2779 | + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) |
2780 | ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) |
2781 | : _V()); |
2782 | if constexpr (__have_sse4_1) |
2783 | return __rounded; |
      else // w/o SSE4.1 the truncation above went through cvttps_epi32, which
	   // is invalid for |x| >= 2^31; |x| >= 2^23 is integral anyway, so
	   // keep __x unchanged in that range
2785 | return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded |
2786 | : __x._M_data; |
2787 | } |
2788 | |
2789 | // }}} |
2790 | // _S_nearbyint {{{ |
2791 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
2792 | _GLIBCXX_SIMD_INTRINSIC static _Tp |
2793 | _S_nearbyint(_Tp __x) noexcept |
2794 | { |
2795 | if constexpr (_TVT::template _S_is<float, 16>) |
2796 | return _mm512_roundscale_ps(__x, 0x0c); |
2797 | else if constexpr (_TVT::template _S_is<double, 8>) |
2798 | return _mm512_roundscale_pd(__x, 0x0c); |
2799 | else if constexpr (_TVT::template _S_is<float, 8>) |
2800 | return _mm256_round_ps(__x, |
2801 | _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); |
2802 | else if constexpr (_TVT::template _S_is<double, 4>) |
2803 | return _mm256_round_pd(__x, |
2804 | _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); |
2805 | else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) |
2806 | return _mm_round_ps(__x, |
2807 | _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); |
2808 | else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) |
2809 | return _mm_round_pd(__x, |
2810 | _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); |
2811 | else |
2812 | return _Base::_S_nearbyint(__x); |
2813 | } |
2814 | |
2815 | // }}} |
2816 | // _S_rint {{{ |
2817 | template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> |
2818 | _GLIBCXX_SIMD_INTRINSIC static _Tp |
2819 | _S_rint(_Tp __x) noexcept |
2820 | { |
2821 | if constexpr (_TVT::template _S_is<float, 16>) |
2822 | return _mm512_roundscale_ps(__x, 0x04); |
2823 | else if constexpr (_TVT::template _S_is<double, 8>) |
2824 | return _mm512_roundscale_pd(__x, 0x04); |
2825 | else if constexpr (_TVT::template _S_is<float, 8>) |
2826 | return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); |
2827 | else if constexpr (_TVT::template _S_is<double, 4>) |
2828 | return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); |
2829 | else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) |
2830 | return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); |
2831 | else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) |
2832 | return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); |
2833 | else |
2834 | return _Base::_S_rint(__x); |
2835 | } |
2836 | |
2837 | // }}} |
2838 | // _S_floor {{{ |
2839 | template <typename _Tp, size_t _Np> |
2840 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2841 | _S_floor(_SimdWrapper<_Tp, _Np> __x) |
2842 | { |
2843 | if constexpr (__is_avx512_ps<_Tp, _Np>()) |
2844 | return _mm512_roundscale_ps(__x, 0x09); |
2845 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
2846 | return _mm512_roundscale_pd(__x, 0x09); |
2847 | else if constexpr (__is_avx_ps<_Tp, _Np>()) |
2848 | return _mm256_round_ps(__x, 0x1); |
2849 | else if constexpr (__is_avx_pd<_Tp, _Np>()) |
2850 | return _mm256_round_pd(__x, 0x1); |
2851 | else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) |
2852 | return __auto_bitcast(_mm_floor_ps(__to_intrin(__x))); |
2853 | else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) |
2854 | return _mm_floor_pd(__x); |
2855 | else |
2856 | return _Base::_S_floor(__x); |
2857 | } |
2858 | |
2859 | // }}} |
2860 | // _S_ceil {{{ |
2861 | template <typename _Tp, size_t _Np> |
2862 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
2863 | _S_ceil(_SimdWrapper<_Tp, _Np> __x) |
2864 | { |
2865 | if constexpr (__is_avx512_ps<_Tp, _Np>()) |
2866 | return _mm512_roundscale_ps(__x, 0x0a); |
2867 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
2868 | return _mm512_roundscale_pd(__x, 0x0a); |
2869 | else if constexpr (__is_avx_ps<_Tp, _Np>()) |
2870 | return _mm256_round_ps(__x, 0x2); |
2871 | else if constexpr (__is_avx_pd<_Tp, _Np>()) |
2872 | return _mm256_round_pd(__x, 0x2); |
2873 | else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) |
2874 | return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x))); |
2875 | else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) |
2876 | return _mm_ceil_pd(__x); |
2877 | else |
2878 | return _Base::_S_ceil(__x); |
2879 | } |
2880 | |
2881 | // }}} |
2882 | // _S_signbit {{{ |
2883 | template <typename _Tp, size_t _Np> |
2884 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
2885 | _S_signbit(_SimdWrapper<_Tp, _Np> __x) |
2886 | { |
2887 | if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) |
2888 | { |
2889 | if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) |
2890 | return _mm512_movepi32_mask( |
2891 | __intrin_bitcast<__m512i>(__x._M_data)); |
2892 | else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) |
2893 | return _mm512_movepi64_mask( |
2894 | __intrin_bitcast<__m512i>(__x._M_data)); |
2895 | else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) |
2896 | return _mm256_movepi32_mask( |
2897 | __intrin_bitcast<__m256i>(__x._M_data)); |
2898 | else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) |
2899 | return _mm256_movepi64_mask( |
2900 | __intrin_bitcast<__m256i>(__x._M_data)); |
2901 | else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) |
2902 | return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); |
2903 | else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) |
2904 | return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); |
2905 | } |
2906 | else if constexpr (__is_avx512_abi<_Abi>()) |
2907 | { |
2908 | const auto __xi = __to_intrin(__x); |
2909 | [[maybe_unused]] constexpr auto __k1 |
2910 | = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
2911 | if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
2912 | return _mm_movemask_ps(__xi); |
2913 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
2914 | return _mm_movemask_pd(__xi); |
2915 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
2916 | return _mm256_movemask_ps(__xi); |
2917 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
2918 | return _mm256_movemask_pd(__xi); |
2919 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
2920 | return _mm512_mask_cmplt_epi32_mask( |
2921 | __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); |
2922 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
2923 | return _mm512_mask_cmplt_epi64_mask( |
2924 | __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); |
2925 | else |
2926 | __assert_unreachable<_Tp>(); |
2927 | } |
2928 | else |
2929 | return _Base::_S_signbit(__x); |
2930 | /*{ |
2931 | using _I = __int_for_sizeof_t<_Tp>; |
2932 | if constexpr (sizeof(__x) == 64) |
2933 | return _S_less(__vector_bitcast<_I>(__x), _I()); |
2934 | else |
2935 | { |
2936 | const auto __xx = __vector_bitcast<_I>(__x._M_data); |
2937 | [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; |
2938 | if constexpr ((sizeof(_Tp) == 4 && |
2939 | (__have_avx2 || sizeof(__x) == 16)) || |
2940 | __have_avx512vl) |
2941 | { |
2942 | return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); |
2943 | } |
2944 | else if constexpr ((__have_avx2 || |
2945 | (__have_ssse3 && sizeof(__x) == 16))) |
2946 | { |
2947 | return __vector_bitcast<_Tp>((__xx & __signmask) == |
2948 | __signmask); |
2949 | } |
2950 | else |
2951 | { // SSE2/3 or AVX (w/o AVX2) |
2952 | constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); |
2953 | return __vector_bitcast<_Tp>( |
2954 | __vector_bitcast<_Tp>( |
2955 | (__xx & __signmask) | |
2956 | __vector_bitcast<_I>(__one)) // -1 or 1 |
2957 | != __one); |
2958 | } |
2959 | } |
2960 | }*/ |
2961 | } |
2962 | |
2963 | // }}} |
2964 | // _S_isnonzerovalue_mask {{{ |
  // true where __x is normal or subnormal, i.e. !isinf && !isnan && __x != 0
2966 | template <typename _Tp> |
2967 | _GLIBCXX_SIMD_INTRINSIC static auto |
2968 | _S_isnonzerovalue_mask(_Tp __x) |
2969 | { |
2970 | using _Traits = _VectorTraits<_Tp>; |
2971 | if constexpr (__have_avx512dq_vl) |
2972 | { |
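	  // fpclass mask 0x9f selects NaNs, infinities and zeros (assumed
	  // VFPCLASS category encoding); negating it leaves exactly the
	  // nonzero finite values.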
2973 | if constexpr (_Traits::template _S_is< |
2974 | float, 2> || _Traits::template _S_is<float, 4>) |
2975 | return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); |
2976 | else if constexpr (_Traits::template _S_is<float, 8>) |
2977 | return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); |
2978 | else if constexpr (_Traits::template _S_is<float, 16>) |
2979 | return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); |
2980 | else if constexpr (_Traits::template _S_is<double, 2>) |
2981 | return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); |
2982 | else if constexpr (_Traits::template _S_is<double, 4>) |
2983 | return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); |
2984 | else if constexpr (_Traits::template _S_is<double, 8>) |
2985 | return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); |
2986 | else |
2987 | __assert_unreachable<_Tp>(); |
2988 | } |
2989 | else |
2990 | { |
2991 | using _Up = typename _Traits::value_type; |
2992 | constexpr size_t _Np = _Traits::_S_full_size; |
2993 | const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 |
2994 | const auto __b = __x * _Up(); // NaN if __x == inf |
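	  // Both products are NaN-free exactly when __x is a nonzero finite
	  // value: __x == 0 makes __a = 0 * inf = NaN, __x == ±inf makes
	  // __b = inf * 0 = NaN, and a NaN __x poisons both; the ordered
	  // compare (_CMP_ORD_Q) below therefore yields the wanted mask.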
2995 | if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) |
2996 | return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), |
2997 | _CMP_ORD_Q); |
2998 | else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) |
2999 | return __mmask8(0xf |
3000 | & _mm512_cmp_ps_mask(__auto_bitcast(__a), |
3001 | __auto_bitcast(__b), |
3002 | _CMP_ORD_Q)); |
3003 | else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) |
3004 | return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); |
3005 | else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) |
3006 | return __mmask8(0x3 |
3007 | & _mm512_cmp_pd_mask(__auto_bitcast(__a), |
3008 | __auto_bitcast(__b), |
3009 | _CMP_ORD_Q)); |
3010 | else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) |
3011 | return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); |
3012 | else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) |
3013 | return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), |
3014 | __auto_bitcast(__b), |
3015 | _CMP_ORD_Q)); |
3016 | else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) |
3017 | return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); |
3018 | else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) |
3019 | return __mmask8(0xf |
3020 | & _mm512_cmp_pd_mask(__auto_bitcast(__a), |
3021 | __auto_bitcast(__b), |
3022 | _CMP_ORD_Q)); |
3023 | else if constexpr (__is_avx512_ps<_Up, _Np>()) |
3024 | return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); |
3025 | else if constexpr (__is_avx512_pd<_Up, _Np>()) |
3026 | return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); |
3027 | else |
3028 | __assert_unreachable<_Tp>(); |
3029 | } |
3030 | } |
3031 | |
3032 | // }}} |
3033 | // _S_isfinite {{{ |
3034 | template <typename _Tp, size_t _Np> |
3035 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
3036 | _S_isfinite(_SimdWrapper<_Tp, _Np> __x) |
3037 | { |
3038 | static_assert(is_floating_point_v<_Tp>); |
3039 | #if !__FINITE_MATH_ONLY__ |
3040 | if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) |
3041 | { |
3042 | const auto __xi = __to_intrin(__x); |
3043 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
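	  // fpclass category mask 0x99 = QNaN (0x01) | +inf (0x08) |
	  // -inf (0x10) | SNaN (0x80), assuming the usual VFPCLASS encoding;
	  // XOR with the implicit mask __k1 thus selects the finite elements.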
3044 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3045 | return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); |
3046 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3047 | return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); |
3048 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3049 | return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); |
3050 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3051 | return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); |
3052 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3053 | return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); |
3054 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3055 | return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); |
3056 | } |
3057 | else if constexpr (__is_avx512_abi<_Abi>()) |
3058 | { |
3059 | // if all exponent bits are set, __x is either inf or NaN |
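	  // (__x & __inf) keeps only the exponent field; it compares less
	  // than the all-ones exponent pattern of infinity iff __x is finite.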
3060 | using _I = __int_for_sizeof_t<_Tp>; |
3061 | const auto __inf = __vector_bitcast<_I>( |
3062 | __vector_broadcast<_Np>(__infinity_v<_Tp>)); |
3063 | return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); |
3064 | } |
3065 | else |
3066 | #endif |
3067 | return _Base::_S_isfinite(__x); |
3068 | } |
3069 | |
3070 | // }}} |
3071 | // _S_isinf {{{ |
3072 | template <typename _Tp, size_t _Np> |
3073 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
3074 | _S_isinf(_SimdWrapper<_Tp, _Np> __x) |
3075 | { |
3076 | #if !__FINITE_MATH_ONLY__ |
3077 | if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) |
3078 | { |
3079 | const auto __xi = __to_intrin(__x); |
3080 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3081 | return _mm512_fpclass_ps_mask(__xi, 0x18); |
3082 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3083 | return _mm512_fpclass_pd_mask(__xi, 0x18); |
3084 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3085 | return _mm256_fpclass_ps_mask(__xi, 0x18); |
3086 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3087 | return _mm256_fpclass_pd_mask(__xi, 0x18); |
3088 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3089 | return _mm_fpclass_ps_mask(__xi, 0x18); |
3090 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3091 | return _mm_fpclass_pd_mask(__xi, 0x18); |
3092 | else |
3093 | __assert_unreachable<_Tp>(); |
3094 | } |
3095 | else if constexpr (__have_avx512dq_vl) |
3096 | { |
3097 | if constexpr (__is_sse_pd<_Tp, _Np>()) |
3098 | return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); |
3099 | else if constexpr (__is_avx_pd<_Tp, _Np>()) |
3100 | return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); |
3101 | else if constexpr (__is_sse_ps<_Tp, _Np>()) |
3102 | return _mm_movm_epi32( |
3103 | _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); |
3104 | else if constexpr (__is_avx_ps<_Tp, _Np>()) |
3105 | return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); |
3106 | else |
3107 | __assert_unreachable<_Tp>(); |
3108 | } |
3109 | else |
3110 | #endif |
3111 | return _Base::_S_isinf(__x); |
3112 | } |
3113 | |
3114 | // }}} |
3115 | // _S_isnormal {{{ |
3116 | template <typename _Tp, size_t _Np> |
3117 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
3118 | _S_isnormal(_SimdWrapper<_Tp, _Np> __x) |
3119 | { |
3120 | #if __FINITE_MATH_ONLY__ |
3121 | [[maybe_unused]] constexpr int __mode = 0x26; |
3122 | #else |
3123 | [[maybe_unused]] constexpr int __mode = 0xbf; |
3124 | #endif |
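      // __mode is a VFPCLASS category mask (assumed encoding: 0x01 QNaN,
      // 0x02 +0, 0x04 -0, 0x08 +inf, 0x10 -inf, 0x20 denormal, 0x80 SNaN):
      // 0xbf matches everything that is not a normal value, while under
      // __FINITE_MATH_ONLY__ 0x26 only needs to reject zeros and denormals.
      // The branches below therefore negate the fpclass result.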
3125 | if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) |
3126 | { |
3127 | const auto __xi = __to_intrin(__x); |
3128 | const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3129 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3130 | return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); |
3131 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3132 | return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); |
3133 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3134 | return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); |
3135 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3136 | return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); |
3137 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3138 | return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); |
3139 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3140 | return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); |
3141 | else |
3142 | __assert_unreachable<_Tp>(); |
3143 | } |
3144 | else if constexpr (__have_avx512dq) |
3145 | { |
3146 | if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
3147 | return _mm_movm_epi32( |
	      _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3149 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
3150 | return _mm256_movm_epi32( |
	      _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3152 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
3153 | return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); |
3154 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
3155 | return _mm_movm_epi64( |
	      _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3157 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
3158 | return _mm256_movm_epi64( |
	      _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3160 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
3161 | return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); |
3162 | else |
3163 | __assert_unreachable<_Tp>(); |
3164 | } |
3165 | else if constexpr (__is_avx512_abi<_Abi>()) |
3166 | { |
3167 | using _I = __int_for_sizeof_t<_Tp>; |
3168 | const auto absn = __vector_bitcast<_I>(_S_abs(__x)); |
3169 | const auto minn = __vector_bitcast<_I>( |
3170 | __vector_broadcast<_Np>(__norm_min_v<_Tp>)); |
3171 | #if __FINITE_MATH_ONLY__ |
3172 | return _S_less_equal<_I, _Np>(minn, absn); |
3173 | #else |
3174 | const auto infn |
3175 | = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); |
3176 | return __and(_S_less_equal<_I, _Np>(minn, absn), |
3177 | _S_less<_I, _Np>(absn, infn)); |
3178 | #endif |
3179 | } |
3180 | else |
3181 | return _Base::_S_isnormal(__x); |
3182 | } |
3183 | |
3184 | // }}} |
3185 | // _S_isnan {{{ |
3186 | template <typename _Tp, size_t _Np> |
3187 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
3188 | _S_isnan(_SimdWrapper<_Tp, _Np> __x) |
3189 | { return _S_isunordered(__x, __x); } |
3190 | |
3191 | // }}} |
3192 | // _S_isunordered {{{ |
3193 | template <typename _Tp, size_t _Np> |
3194 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
3195 | _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, |
3196 | [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) |
3197 | { |
3198 | #if __FINITE_MATH_ONLY__ |
3199 | return {}; // false |
3200 | #else |
3201 | const auto __xi = __to_intrin(__x); |
3202 | const auto __yi = __to_intrin(__y); |
3203 | if constexpr (__is_avx512_abi<_Abi>()) |
3204 | { |
3205 | constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3206 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3207 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); |
3208 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3209 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); |
3210 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3211 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); |
3212 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3213 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); |
3214 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3215 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); |
3216 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3217 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); |
3218 | } |
3219 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3220 | return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); |
3221 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3222 | return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); |
3223 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3224 | return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); |
3225 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3226 | return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); |
3227 | else |
3228 | __assert_unreachable<_Tp>(); |
3229 | #endif |
3230 | } |
3231 | |
3232 | // }}} |
3233 | // _S_isgreater {{{ |
3234 | template <typename _Tp, size_t _Np> |
3235 | static constexpr _MaskMember<_Tp> |
3236 | _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
3237 | { |
3238 | const auto __xi = __to_intrin(__x); |
3239 | const auto __yi = __to_intrin(__y); |
3240 | if constexpr (__is_avx512_abi<_Abi>()) |
3241 | { |
3242 | const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3243 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3244 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); |
3245 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3246 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); |
3247 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3248 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); |
3249 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3250 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); |
3251 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3252 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); |
3253 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3254 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); |
3255 | else |
3256 | __assert_unreachable<_Tp>(); |
3257 | } |
3258 | else if constexpr (__have_avx) |
3259 | { |
3260 | if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3261 | return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); |
3262 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3263 | return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); |
3264 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3265 | return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); |
3266 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3267 | return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); |
3268 | else |
3269 | __assert_unreachable<_Tp>(); |
3270 | } |
3271 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3272 | && sizeof(_Tp) == 4) |
3273 | { |
3274 | const auto __xn = __vector_bitcast<int>(__xi); |
3275 | const auto __yn = __vector_bitcast<int>(__yi); |
3276 | const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; |
3277 | const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; |
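	  // __xp/__yp map the IEEE-754 sign-magnitude bit patterns onto
	  // integers whose two's-complement order matches the floating-point
	  // order (e.g. the bits of -0.0f map to 0, those of -1.0f to
	  // -0x3f80'0000); the cmpord mask then discards NaN lanes.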
3278 | return __auto_bitcast( |
3279 | __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); |
3280 | } |
3281 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3282 | && sizeof(_Tp) == 8) |
3283 | return __vector_type_t<__int_with_sizeof_t<8>, 2>{ |
3284 | -_mm_ucomigt_sd(__xi, __yi), |
3285 | -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), |
3286 | _mm_unpackhi_pd(__yi, __yi))}; |
3287 | else |
3288 | return _Base::_S_isgreater(__x, __y); |
3289 | } |
3290 | |
3291 | // }}} |
3292 | // _S_isgreaterequal {{{ |
3293 | template <typename _Tp, size_t _Np> |
3294 | static constexpr _MaskMember<_Tp> |
3295 | _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
3296 | { |
3297 | const auto __xi = __to_intrin(__x); |
3298 | const auto __yi = __to_intrin(__y); |
3299 | if constexpr (__is_avx512_abi<_Abi>()) |
3300 | { |
3301 | const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3302 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3303 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); |
3304 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3305 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); |
3306 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3307 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); |
3308 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3309 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); |
3310 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3311 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); |
3312 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3313 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); |
3314 | else |
3315 | __assert_unreachable<_Tp>(); |
3316 | } |
3317 | else if constexpr (__have_avx) |
3318 | { |
3319 | if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3320 | return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); |
3321 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3322 | return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); |
3323 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3324 | return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); |
3325 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3326 | return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); |
3327 | else |
3328 | __assert_unreachable<_Tp>(); |
3329 | } |
3330 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3331 | && sizeof(_Tp) == 4) |
3332 | { |
3333 | const auto __xn = __vector_bitcast<int>(__xi); |
3334 | const auto __yn = __vector_bitcast<int>(__yi); |
3335 | const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; |
3336 | const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; |
3337 | return __auto_bitcast( |
3338 | __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); |
3339 | } |
3340 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3341 | && sizeof(_Tp) == 8) |
3342 | return __vector_type_t<__int_with_sizeof_t<8>, 2>{ |
3343 | -_mm_ucomige_sd(__xi, __yi), |
3344 | -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), |
3345 | _mm_unpackhi_pd(__yi, __yi))}; |
3346 | else |
3347 | return _Base::_S_isgreaterequal(__x, __y); |
3348 | } |
3349 | |
3350 | // }}} |
3351 | // _S_isless {{{ |
3352 | template <typename _Tp, size_t _Np> |
3353 | static constexpr _MaskMember<_Tp> |
3354 | _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
3355 | { |
3356 | const auto __xi = __to_intrin(__x); |
3357 | const auto __yi = __to_intrin(__y); |
3358 | if constexpr (__is_avx512_abi<_Abi>()) |
3359 | { |
3360 | const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3361 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3362 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); |
3363 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3364 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); |
3365 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3366 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); |
3367 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3368 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); |
3369 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3370 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); |
3371 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3372 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); |
3373 | else |
3374 | __assert_unreachable<_Tp>(); |
3375 | } |
3376 | else if constexpr (__have_avx) |
3377 | { |
3378 | if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3379 | return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); |
3380 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3381 | return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); |
3382 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3383 | return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); |
3384 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3385 | return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); |
3386 | else |
3387 | __assert_unreachable<_Tp>(); |
3388 | } |
3389 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3390 | && sizeof(_Tp) == 4) |
3391 | { |
3392 | const auto __xn = __vector_bitcast<int>(__xi); |
3393 | const auto __yn = __vector_bitcast<int>(__yi); |
3394 | const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; |
3395 | const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; |
3396 | return __auto_bitcast( |
3397 | __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); |
3398 | } |
3399 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3400 | && sizeof(_Tp) == 8) |
3401 | return __vector_type_t<__int_with_sizeof_t<8>, 2>{ |
3402 | -_mm_ucomigt_sd(__yi, __xi), |
3403 | -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), |
3404 | _mm_unpackhi_pd(__xi, __xi))}; |
3405 | else |
3406 | return _Base::_S_isless(__x, __y); |
3407 | } |
3408 | |
3409 | // }}} |
3410 | // _S_islessequal {{{ |
3411 | template <typename _Tp, size_t _Np> |
3412 | static constexpr _MaskMember<_Tp> |
3413 | _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
3414 | { |
3415 | const auto __xi = __to_intrin(__x); |
3416 | const auto __yi = __to_intrin(__y); |
3417 | if constexpr (__is_avx512_abi<_Abi>()) |
3418 | { |
3419 | const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3420 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3421 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); |
3422 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3423 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); |
3424 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3425 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); |
3426 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3427 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); |
3428 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3429 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); |
3430 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3431 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); |
3432 | else |
3433 | __assert_unreachable<_Tp>(); |
3434 | } |
3435 | else if constexpr (__have_avx) |
3436 | { |
3437 | if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3438 | return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); |
3439 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3440 | return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); |
3441 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3442 | return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); |
3443 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3444 | return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); |
3445 | else |
3446 | __assert_unreachable<_Tp>(); |
3447 | } |
3448 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3449 | && sizeof(_Tp) == 4) |
3450 | { |
3451 | const auto __xn = __vector_bitcast<int>(__xi); |
3452 | const auto __yn = __vector_bitcast<int>(__yi); |
3453 | const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; |
3454 | const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; |
3455 | return __auto_bitcast( |
3456 | __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); |
3457 | } |
3458 | else if constexpr (__have_sse2 && sizeof(__xi) == 16 |
3459 | && sizeof(_Tp) == 8) |
3460 | return __vector_type_t<__int_with_sizeof_t<8>, 2>{ |
3461 | -_mm_ucomige_sd(__yi, __xi), |
3462 | -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), |
3463 | _mm_unpackhi_pd(__xi, __xi))}; |
3464 | else |
3465 | return _Base::_S_islessequal(__x, __y); |
3466 | } |
3467 | |
3468 | // }}} |
3469 | // _S_islessgreater {{{ |
3470 | template <typename _Tp, size_t _Np> |
3471 | static constexpr _MaskMember<_Tp> |
3472 | _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) |
3473 | { |
3474 | const auto __xi = __to_intrin(__x); |
3475 | const auto __yi = __to_intrin(__y); |
3476 | if constexpr (__is_avx512_abi<_Abi>()) |
3477 | { |
3478 | const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
3479 | if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) |
3480 | return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); |
3481 | else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) |
3482 | return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); |
3483 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3484 | return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); |
3485 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3486 | return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); |
3487 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3488 | return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); |
3489 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3490 | return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); |
3491 | else |
3492 | __assert_unreachable<_Tp>(); |
3493 | } |
3494 | else if constexpr (__have_avx) |
3495 | { |
3496 | if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) |
3497 | return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); |
3498 | else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) |
3499 | return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); |
3500 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3501 | return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); |
3502 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3503 | return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); |
3504 | else |
3505 | __assert_unreachable<_Tp>(); |
3506 | } |
3507 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) |
3508 | return __auto_bitcast( |
3509 | __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); |
3510 | else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) |
3511 | return __to_masktype( |
3512 | __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); |
3513 | else |
3514 | __assert_unreachable<_Tp>(); |
3515 | } |
3516 | |
3517 | //}}} }}} |
3518 | template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np> |
3519 | _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> |
3520 | _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) |
3521 | { |
3522 | if (__k._M_is_constprop_none_of()) |
3523 | return __v; |
3524 | else if (__k._M_is_constprop_all_of()) |
3525 | { |
3526 | auto __vv = _Base::_M_make_simd(__v); |
3527 | _Op<decltype(__vv)> __op; |
3528 | return __data(__op(__vv)); |
3529 | } |
3530 | else if constexpr (__is_bitmask_v<decltype(__k)> |
3531 | && (is_same_v<_Op<void>, __increment<void>> |
3532 | || is_same_v<_Op<void>, __decrement<void>>)) |
3533 | { |
3534 | // optimize masked unary increment and decrement as masked sub +/-1 |
3535 | constexpr int __pm_one |
3536 | = is_same_v<_Op<void>, __increment<void>> ? -1 : 1; |
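	  // i.e. masked ++__v becomes __v - (-1) and masked --__v becomes
	  // __v - (+1), so one masked-subtract builtin covers both cases.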
3537 | #ifdef __clang__ |
3538 | return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data; |
3539 | #else // __clang__ |
3540 | if constexpr (is_integral_v<_Tp>) |
3541 | { |
3542 | constexpr bool __lp64 = sizeof(long) == sizeof(long long); |
3543 | using _Ip = std::make_signed_t<_Tp>; |
3544 | using _Up = std::conditional_t< |
3545 | std::is_same_v<_Ip, long>, |
3546 | std::conditional_t<__lp64, long long, int>, |
3547 | std::conditional_t< |
3548 | std::is_same_v<_Ip, signed char>, char, _Ip>>; |
3549 | const auto __value = __vector_bitcast<_Up>(__v._M_data); |
3550 | #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ |
3551 | if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ |
3552 | return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \ |
3553 | __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data)) |
3554 | _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); |
3555 | _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); |
3556 | _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); |
3557 | _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); |
3558 | _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); |
3559 | _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); |
3560 | _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); |
3561 | _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); |
3562 | _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); |
3563 | _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); |
3564 | _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); |
3565 | _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); |
3566 | #undef _GLIBCXX_SIMD_MASK_SUB |
3567 | } |
3568 | else |
3569 | { |
3570 | #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ |
3571 | if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ |
3572 | return __builtin_ia32_##_Instr##_mask( \ |
3573 | __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \ |
3574 | __k._M_data, _MM_FROUND_CUR_DIRECTION) |
3575 | _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512); |
3576 | _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); |
3577 | _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); |
3578 | _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512); |
3579 | _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); |
3580 | _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); |
3581 | #undef _GLIBCXX_SIMD_MASK_SUB |
3582 | } |
3583 | #endif // __clang__ |
3584 | } |
3585 | else |
3586 | return _Base::template _S_masked_unary<_Op>(__k, __v); |
3587 | } |
3588 | }; |
3589 | |
3590 | // }}} |
3591 | // _MaskImplX86Mixin {{{ |
3592 | struct _MaskImplX86Mixin |
3593 | { |
3594 | template <typename _Tp> |
3595 | using _TypeTag = _Tp*; |
3596 | |
3597 | using _Base = _MaskImplBuiltinMixin; |
3598 | |
3599 | // _S_to_maskvector(bool) {{{ |
3600 | template <typename _Up, size_t _ToN = 1, typename _Tp> |
3601 | _GLIBCXX_SIMD_INTRINSIC static constexpr |
3602 | enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>> |
3603 | _S_to_maskvector(_Tp __x) |
3604 | { |
3605 | static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); |
3606 | return __x ? __vector_type_t<_Up, _ToN>{~_Up()} |
3607 | : __vector_type_t<_Up, _ToN>(); |
3608 | } |
3609 | |
3610 | // }}} |
3611 | // _S_to_maskvector(_SanitizedBitMask) {{{ |
3612 | template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN> |
3613 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> |
3614 | _S_to_maskvector(_SanitizedBitMask<_Np> __x) |
3615 | { |
3616 | static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); |
3617 | using _UV = __vector_type_t<_Up, _ToN>; |
3618 | using _UI = __intrinsic_type_t<_Up, _ToN>; |
3619 | [[maybe_unused]] const auto __k = __x._M_to_bits(); |
3620 | if constexpr (_Np == 1) |
3621 | return _S_to_maskvector<_Up, _ToN>(__k); |
3622 | else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) |
	return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3624 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); |
3625 | else if constexpr (sizeof(_Up) == 1) |
3626 | { |
3627 | if constexpr (sizeof(_UI) == 16) |
3628 | { |
3629 | if constexpr (__have_avx512bw_vl) |
3630 | return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); |
3631 | else if constexpr (__have_avx512bw) |
3632 | return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); |
3633 | else if constexpr (__have_avx512f) |
3634 | { |
3635 | auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); |
3636 | auto __as16bits |
3637 | = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), |
3638 | __hi256(__as32bits))); |
3639 | return __intrin_bitcast<_UV>( |
3640 | _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); |
3641 | } |
3642 | else if constexpr (__have_ssse3) |
3643 | { |
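		// SSSE3 fallback: pshufb broadcasts byte 0 of the bit mask to
		// lanes 0..7 and byte 1 to lanes 8..15, each lane is ANDed with
		// its bit-select constant (1, 2, 4, ..., 128), and the compare
		// against zero widens every mask bit to a full byte.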
3644 | const auto __bitmask = __to_intrin( |
		  __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128,
					1, 2, 4, 8, 16, 32, 64, 128));
3647 | return __intrin_bitcast<_UV>( |
3648 | __vector_bitcast<_Up>( |
		    _mm_shuffle_epi8(__to_intrin(
				       __vector_type_t<_ULLong, 2>{__k}),
				     _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
						   1, 1, 1, 1, 1, 1, 1, 1))
3653 | & __bitmask) |
3654 | != 0); |
3655 | } |
3656 | // else fall through |
3657 | } |
3658 | else if constexpr (sizeof(_UI) == 32) |
3659 | { |
3660 | if constexpr (__have_avx512bw_vl) |
3661 | return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); |
3662 | else if constexpr (__have_avx512bw) |
3663 | return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); |
3664 | else if constexpr (__have_avx512f) |
3665 | { |
3666 | auto __as16bits = // 0 16 1 17 ... 15 31 |
3667 | _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), |
3668 | 16) |
3669 | | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, |
3670 | ~__m512i()), |
3671 | 16); |
3672 | auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( |
3673 | __lo256(__as16bits), |
3674 | __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... |
3675 | ); |
3676 | // deinterleave: |
3677 | return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( |
3678 | __0_16_1_17, // 0 16 1 17 2 ... |
		_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
				 1, 3, 5, 7, 9, 11, 13, 15,
				 0, 2, 4, 6, 8, 10, 12, 14,
				 1, 3, 5, 7, 9, 11, 13, 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3683 | // 0-3 8-11 16-19 24-27 |
3684 | // 4-7 12-15 20-23 28-31 |
3685 | } |
3686 | else if constexpr (__have_avx2) |
3687 | { |
3688 | const auto __bitmask |
		= _mm256_broadcastsi128_si256(__to_intrin(
		    __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128,
					  1, 2, 4, 8, 16, 32, 64, 128)));
3692 | return __vector_bitcast<_Up>( |
3693 | __vector_bitcast<_Up>( |
3694 | _mm256_shuffle_epi8( |
		      _mm256_broadcastsi128_si256(
			__to_intrin(__vector_type_t<_ULLong, 2>{__k})),
		      _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
				       1, 1, 1, 1, 1, 1, 1, 1,
				       2, 2, 2, 2, 2, 2, 2, 2,
				       3, 3, 3, 3, 3, 3, 3, 3))
3700 | & __bitmask) |
3701 | != 0); |
3702 | } |
3703 | // else fall through |
3704 | } |
3705 | else if constexpr (sizeof(_UI) == 64) |
3706 | return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); |
	    if constexpr (std::min(_ToN, _Np) <= 4)
3708 | { |
3709 | if constexpr (_Np > 7) // avoid overflow |
3710 | __x &= _SanitizedBitMask<_Np>(0x0f); |
3711 | const _UInt __char_mask |
3712 | = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) |
3713 | * 0xff; |
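		// Multiplying the (at most 4-bit) mask by 0x204081
		// (== 1 | 1 << 7 | 1 << 14 | 1 << 21) moves bit __i to bit
		// position 8 * __i, the AND keeps exactly those positions, and
		// the final * 0xff widens each kept bit to a 0xff/0x00 byte.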
3714 | _UV __r = {}; |
3715 | __builtin_memcpy(&__r, &__char_mask, |
				   std::min(sizeof(__r), sizeof(__char_mask)));
3717 | return __r; |
3718 | } |
	    else if constexpr (std::min(_ToN, _Np) <= 7)
3720 | { |
3721 | if constexpr (_Np > 7) // avoid overflow |
3722 | __x &= _SanitizedBitMask<_Np>(0x7f); |
3723 | const _ULLong __char_mask |
3724 | = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) |
3725 | * 0xff; |
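		// same bit-spreading trick as above; 0x40810204081 places bit
		// __i at bit position 8 * __i for up to seven mask bits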
3726 | _UV __r = {}; |
3727 | __builtin_memcpy(&__r, &__char_mask, |
				   std::min(sizeof(__r), sizeof(__char_mask)));
3729 | return __r; |
3730 | } |
3731 | } |
3732 | else if constexpr (sizeof(_Up) == 2) |
3733 | { |
3734 | if constexpr (sizeof(_UI) == 16) |
3735 | { |
3736 | if constexpr (__have_avx512bw_vl) |
3737 | return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); |
3738 | else if constexpr (__have_avx512bw) |
3739 | return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); |
3740 | else if constexpr (__have_avx512f) |
3741 | { |
3742 | __m256i __as32bits = {}; |
3743 | if constexpr (__have_avx512vl) |
3744 | __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); |
3745 | else |
3746 | __as32bits |
3747 | = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); |
3748 | return __intrin_bitcast<_UV>( |
		    _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3750 | } |
3751 | // else fall through |
3752 | } |
3753 | else if constexpr (sizeof(_UI) == 32) |
3754 | { |
3755 | if constexpr (__have_avx512bw_vl) |
3756 | return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); |
3757 | else if constexpr (__have_avx512bw) |
3758 | return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); |
3759 | else if constexpr (__have_avx512f) |
3760 | { |
3761 | auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); |
3762 | return __vector_bitcast<_Up>( |
3763 | __xzyw(_mm256_packs_epi32(__lo256(__as32bits), |
3764 | __hi256(__as32bits)))); |
3765 | } |
3766 | // else fall through |
3767 | } |
3768 | else if constexpr (sizeof(_UI) == 64) |
3769 | return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); |
3770 | } |
3771 | else if constexpr (sizeof(_Up) == 4) |
3772 | { |
3773 | if constexpr (sizeof(_UI) == 16) |
3774 | { |
3775 | if constexpr (__have_avx512dq_vl) |
3776 | return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); |
3777 | else if constexpr (__have_avx512dq) |
3778 | return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); |
3779 | else if constexpr (__have_avx512vl) |
3780 | return __intrin_bitcast<_UV>( |
3781 | _mm_maskz_mov_epi32(__k, ~__m128i())); |
3782 | else if constexpr (__have_avx512f) |
3783 | return __intrin_bitcast<_UV>( |
3784 | __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); |
3785 | // else fall through |
3786 | } |
3787 | else if constexpr (sizeof(_UI) == 32) |
3788 | { |
3789 | if constexpr (__have_avx512dq_vl) |
3790 | return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); |
3791 | else if constexpr (__have_avx512dq) |
3792 | return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); |
3793 | else if constexpr (__have_avx512vl) |
3794 | return __vector_bitcast<_Up>( |
3795 | _mm256_maskz_mov_epi32(__k, ~__m256i())); |
3796 | else if constexpr (__have_avx512f) |
3797 | return __vector_bitcast<_Up>( |
3798 | __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); |
3799 | // else fall through |
3800 | } |
3801 | else if constexpr (sizeof(_UI) == 64) |
3802 | return __vector_bitcast<_Up>( |
3803 | __have_avx512dq ? _mm512_movm_epi32(__k) |
3804 | : _mm512_maskz_mov_epi32(__k, ~__m512i())); |
3805 | } |
3806 | else if constexpr (sizeof(_Up) == 8) |
3807 | { |
3808 | if constexpr (sizeof(_UI) == 16) |
3809 | { |
3810 | if constexpr (__have_avx512dq_vl) |
3811 | return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); |
3812 | else if constexpr (__have_avx512dq) |
3813 | return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); |
3814 | else if constexpr (__have_avx512vl) |
3815 | return __vector_bitcast<_Up>( |
3816 | _mm_maskz_mov_epi64(__k, ~__m128i())); |
3817 | else if constexpr (__have_avx512f) |
3818 | return __vector_bitcast<_Up>( |
3819 | __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); |
3820 | // else fall through |
3821 | } |
3822 | else if constexpr (sizeof(_UI) == 32) |
3823 | { |
3824 | if constexpr (__have_avx512dq_vl) |
3825 | return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); |
3826 | else if constexpr (__have_avx512dq) |
3827 | return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); |
3828 | else if constexpr (__have_avx512vl) |
3829 | return __vector_bitcast<_Up>( |
3830 | _mm256_maskz_mov_epi64(__k, ~__m256i())); |
3831 | else if constexpr (__have_avx512f) |
3832 | return __vector_bitcast<_Up>( |
3833 | __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); |
3834 | // else fall through |
3835 | } |
3836 | else if constexpr (sizeof(_UI) == 64) |
3837 | return __vector_bitcast<_Up>( |
3838 | __have_avx512dq ? _mm512_movm_epi64(__k) |
3839 | : _mm512_maskz_mov_epi64(__k, ~__m512i())); |
3840 | } |
3841 | |
3842 | using _UpUInt = make_unsigned_t<_Up>; |
3843 | using _V = __vector_type_t<_UpUInt, _ToN>; |
3844 | constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; |
3845 | if constexpr (_ToN == 2) |
3846 | { |
3847 | return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); |
3848 | } |
3849 | else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) |
3850 | { |
3851 | if constexpr (sizeof(_Up) == 4) |
3852 | return __vector_bitcast<_Up>(_mm256_cmp_ps( |
3853 | _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), |
3854 | _mm256_castsi256_ps(_mm256_setr_epi32( |
3855 | 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), |
3856 | _mm256_setzero_ps(), _CMP_NEQ_UQ)); |
3857 | else if constexpr (sizeof(_Up) == 8) |
3858 | return __vector_bitcast<_Up>(_mm256_cmp_pd( |
3859 | _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), |
3860 | _mm256_castsi256_pd( |
3861 | _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), |
3862 | _mm256_setzero_pd(), _CMP_NEQ_UQ)); |
3863 | else |
3864 | __assert_unreachable<_Up>(); |
3865 | } |
3866 | else if constexpr (__bits_per_element >= _ToN) |
3867 | { |
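	  // generic path when all mask bits fit into a single element:
	  // broadcast __k into every element, AND element __i with
	  // (1 << __i), and turn the nonzero results into all-ones elements.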
3868 | constexpr auto __bitmask |
3869 | = __generate_vector<_V>([](auto __i) |
3870 | constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt |
3871 | { return __i < _ToN ? 1ull << __i : 0; }); |
3872 | const auto __bits |
3873 | = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; |
3874 | if constexpr (__bits_per_element > _ToN) |
3875 | return __vector_bitcast<_Up>(__bits) > 0; |
3876 | else |
3877 | return __vector_bitcast<_Up>(__bits != 0); |
3878 | } |
3879 | else |
3880 | { |
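	  // _ToN exceeds the bits per element: each element receives the mask
	  // shifted down in chunks of __bits_per_element so that its own bit
	  // is in range, then tests bit (__i % __bits_per_element).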
3881 | const _V __tmp |
3882 | = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
3883 | return static_cast<_UpUInt>( |
3884 | __k >> (__bits_per_element * (__i / __bits_per_element))); |
3885 | }) |
3886 | & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
3887 | return static_cast<_UpUInt>(1ull |
3888 | << (__i % __bits_per_element)); |
3889 | }); // mask bit index |
3890 | return __intrin_bitcast<_UV>(__tmp != _V()); |
3891 | } |
3892 | } |
3893 | |
3894 | // }}} |
3895 | // _S_to_maskvector(_SimdWrapper) {{{ |
3896 | template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, |
3897 | size_t _ToN = _UpN == 0 ? _Np : _UpN> |
3898 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> |
3899 | _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) |
3900 | { |
3901 | static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); |
3902 | using _TW = _SimdWrapper<_Tp, _Np>; |
3903 | using _UW = _SimdWrapper<_Up, _ToN>; |
3904 | using _UI = __intrinsic_type_t<_Up, _ToN>; |
3905 | if constexpr (is_same_v<_Tp, bool>) // bits -> vector |
3906 | return _S_to_maskvector<_Up, _ToN>( |
3907 | _BitMask<_Np>(__x._M_data)._M_sanitized()); |
3908 | // vector -> vector bitcast |
3909 | else if constexpr (sizeof(_Up) == sizeof(_Tp) |
3910 | && sizeof(_TW) == sizeof(_UW)) |
3911 | return __wrapper_bitcast<_Up, _ToN>( |
3912 | _ToN <= _Np |
3913 | ? __x |
3914 | : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x)); |
3915 | else // vector -> vector {{{ |
3916 | { |
3917 | if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) |
3918 | { |
3919 | const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); |
	    return __generate_from_n_evaluations<std::min(_ToN, _Np),
3921 | __vector_type_t<_Up, _ToN>>( |
3922 | [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); |
3923 | } |
3924 | using _To = __vector_type_t<_Up, _ToN>; |
3925 | [[maybe_unused]] constexpr size_t _FromN = _Np; |
3926 | constexpr int _FromBytes = sizeof(_Tp); |
3927 | constexpr int _ToBytes = sizeof(_Up); |
3928 | const auto __k = __x._M_data; |
3929 | |
3930 | if constexpr (_FromBytes == _ToBytes) |
3931 | return __intrin_bitcast<_To>(__k); |
3932 | else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) |
3933 | { // SSE -> SSE {{{ |
3934 | if constexpr (_FromBytes == 4 && _ToBytes == 8) |
3935 | return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); |
3936 | else if constexpr (_FromBytes == 2 && _ToBytes == 8) |
3937 | { |
3938 | const auto __y |
3939 | = __vector_bitcast<int>(__interleave128_lo(__k, __k)); |
3940 | return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); |
3941 | } |
3942 | else if constexpr (_FromBytes == 1 && _ToBytes == 8) |
3943 | { |
3944 | auto __y |
3945 | = __vector_bitcast<short>(__interleave128_lo(__k, __k)); |
3946 | auto __z |
3947 | = __vector_bitcast<int>(__interleave128_lo(__y, __y)); |
3948 | return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); |
3949 | } |
3950 | else if constexpr (_FromBytes == 8 && _ToBytes == 4 |
3951 | && __have_sse2) |
3952 | return __intrin_bitcast<_To>( |
3953 | _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); |
3954 | else if constexpr (_FromBytes == 8 && _ToBytes == 4) |
3955 | return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), |
3956 | _UI()); |
3957 | else if constexpr (_FromBytes == 2 && _ToBytes == 4) |
3958 | return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); |
3959 | else if constexpr (_FromBytes == 1 && _ToBytes == 4) |
3960 | { |
3961 | const auto __y |
3962 | = __vector_bitcast<short>(__interleave128_lo(__k, __k)); |
3963 | return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); |
3964 | } |
3965 | else if constexpr (_FromBytes == 8 && _ToBytes == 2) |
3966 | { |
3967 | if constexpr (__have_sse2 && !__have_ssse3) |
3968 | return __intrin_bitcast<_To>(_mm_packs_epi32( |
3969 | _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), |
3970 | __m128i())); |
3971 | else |
3972 | return __intrin_bitcast<_To>( |
3973 | __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( |
3974 | __vector_bitcast<_Up>(__k))); |
3975 | } |
3976 | else if constexpr (_FromBytes == 4 && _ToBytes == 2) |
3977 | return __intrin_bitcast<_To>( |
3978 | _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); |
3979 | else if constexpr (_FromBytes == 1 && _ToBytes == 2) |
3980 | return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); |
3981 | else if constexpr (_FromBytes == 8 && _ToBytes == 1 |
3982 | && __have_ssse3) |
3983 | return __intrin_bitcast<_To>( |
3984 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				       _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
						     -1, -1, -1, -1, -1, -1, -1,
						     -1)));
3988 | else if constexpr (_FromBytes == 8 && _ToBytes == 1) |
3989 | { |
3990 | auto __y |
3991 | = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); |
3992 | __y = _mm_packs_epi32(__y, __m128i()); |
3993 | return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); |
3994 | } |
3995 | else if constexpr (_FromBytes == 4 && _ToBytes == 1 |
3996 | && __have_ssse3) |
3997 | return __intrin_bitcast<_To>( |
3998 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				       _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
						     -1, -1, -1, -1, -1, -1, -1,
						     -1)));
4002 | else if constexpr (_FromBytes == 4 && _ToBytes == 1) |
4003 | { |
4004 | const auto __y |
4005 | = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); |
4006 | return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); |
4007 | } |
4008 | else if constexpr (_FromBytes == 2 && _ToBytes == 1) |
4009 | return __intrin_bitcast<_To>( |
4010 | _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); |
4011 | else |
4012 | __assert_unreachable<_Tp>(); |
4013 | } // }}} |
4014 | else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) |
4015 | { // AVX -> AVX {{{ |
4016 | if constexpr (_FromBytes == _ToBytes) |
4017 | __assert_unreachable<_Tp>(); |
4018 | else if constexpr (_FromBytes == _ToBytes * 2) |
4019 | { |
4020 | const auto __y = __vector_bitcast<_LLong>(__k); |
4021 | return __intrin_bitcast<_To>(_mm256_castsi128_si256( |
4022 | _mm_packs_epi16(__lo128(__y), __hi128(__y)))); |
4023 | } |
4024 | else if constexpr (_FromBytes == _ToBytes * 4) |
4025 | { |
4026 | const auto __y = __vector_bitcast<_LLong>(__k); |
4027 | return __intrin_bitcast<_To>(_mm256_castsi128_si256( |
4028 | _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), |
4029 | __m128i()))); |
4030 | } |
4031 | else if constexpr (_FromBytes == _ToBytes * 8) |
4032 | { |
4033 | const auto __y = __vector_bitcast<_LLong>(__k); |
4034 | return __intrin_bitcast<_To>( |
4035 | _mm256_castsi128_si256(_mm_shuffle_epi8( |
4036 | _mm_packs_epi16(__lo128(__y), __hi128(__y)), |
		      _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
				    -1, -1, -1, -1, -1))));
4039 | } |
4040 | else if constexpr (_FromBytes * 2 == _ToBytes) |
4041 | { |
4042 | auto __y = __xzyw(__to_intrin(__k)); |
4043 | if constexpr (is_floating_point_v< |
4044 | _Tp> || (!__have_avx2 && _FromBytes == 4)) |
4045 | { |
4046 | const auto __yy = __vector_bitcast<float>(__y); |
4047 | return __intrin_bitcast<_To>( |
4048 | _mm256_unpacklo_ps(__yy, __yy)); |
4049 | } |
4050 | else |
4051 | return __intrin_bitcast<_To>( |
4052 | _mm256_unpacklo_epi8(__y, __y)); |
4053 | } |
4054 | else if constexpr (_FromBytes * 4 == _ToBytes) |
4055 | { |
4056 | auto __y |
4057 | = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), |
4058 | __lo128(__vector_bitcast<_LLong>( |
4059 | __k))); // drops 3/4 of input |
4060 | return __intrin_bitcast<_To>( |
4061 | __concat(_mm_unpacklo_epi16(__y, __y), |
4062 | _mm_unpackhi_epi16(__y, __y))); |
4063 | } |
4064 | else if constexpr (_FromBytes == 1 && _ToBytes == 8) |
4065 | { |
4066 | auto __y |
4067 | = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), |
4068 | __lo128(__vector_bitcast<_LLong>( |
4069 | __k))); // drops 3/4 of input |
4070 | __y |
4071 | = _mm_unpacklo_epi16(__y, |
4072 | __y); // drops another 1/2 => 7/8 total |
4073 | return __intrin_bitcast<_To>( |
4074 | __concat(_mm_unpacklo_epi32(__y, __y), |
4075 | _mm_unpackhi_epi32(__y, __y))); |
4076 | } |
4077 | else |
4078 | __assert_unreachable<_Tp>(); |
4079 | } // }}} |
4080 | else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) |
4081 | { // SSE -> AVX {{{ |
4082 | if constexpr (_FromBytes == _ToBytes) |
4083 | return __intrin_bitcast<_To>( |
4084 | __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( |
4085 | __zero_extend(__to_intrin(__k)))); |
4086 | else if constexpr (_FromBytes * 2 == _ToBytes) |
4087 | { // keep all |
4088 | return __intrin_bitcast<_To>( |
4089 | __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), |
4090 | __vector_bitcast<_LLong>(__k)), |
4091 | _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), |
4092 | __vector_bitcast<_LLong>(__k)))); |
4093 | } |
4094 | else if constexpr (_FromBytes * 4 == _ToBytes) |
4095 | { |
4096 | if constexpr (__have_avx2) |
4097 | { |
4098 | return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
4099 | __concat(__vector_bitcast<_LLong>(__k), |
4100 | __vector_bitcast<_LLong>(__k)), |
		      _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
				       3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
				       6, 6, 7, 7, 7, 7)));
4104 | } |
4105 | else |
4106 | { |
4107 | return __intrin_bitcast<_To>(__concat( |
4108 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				       _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
						     2, 2, 2, 2, 3, 3, 3, 3)),
4111 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				       _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
						     6, 6, 6, 6, 7, 7, 7,
						     7))));
4115 | } |
4116 | } |
4117 | else if constexpr (_FromBytes * 8 == _ToBytes) |
4118 | { |
4119 | if constexpr (__have_avx2) |
4120 | { |
4121 | return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
4122 | __concat(__vector_bitcast<_LLong>(__k), |
4123 | __vector_bitcast<_LLong>(__k)), |
		      _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
				       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
				       3, 3, 3, 3, 3, 3)));
4127 | } |
4128 | else |
4129 | { |
4130 | return __intrin_bitcast<_To>(__concat( |
4131 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				       _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
						     1, 1, 1, 1, 1, 1, 1, 1)),
4134 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				       _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
						     3, 3, 3, 3, 3, 3, 3,
						     3))));
4138 | } |
4139 | } |
4140 | else if constexpr (_FromBytes == _ToBytes * 2) |
4141 | return __intrin_bitcast<_To>(__m256i(__zero_extend( |
4142 | _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); |
4143 | else if constexpr (_FromBytes == 8 && _ToBytes == 2) |
4144 | { |
4145 | return __intrin_bitcast<_To>(__m256i(__zero_extend( |
4146 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				     _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
						   -1, -1, -1, -1, -1, -1, -1,
						   -1)))));
4150 | } |
4151 | else if constexpr (_FromBytes == 4 && _ToBytes == 1) |
4152 | { |
4153 | return __intrin_bitcast<_To>(__m256i(__zero_extend( |
4154 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				     _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
						   -1, -1, -1, -1, -1, -1, -1,
						   -1)))));
4158 | } |
4159 | else if constexpr (_FromBytes == 8 && _ToBytes == 1) |
4160 | { |
4161 | return __intrin_bitcast<_To>(__m256i(__zero_extend( |
4162 | _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), |
				     _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
						   -1, -1, -1, -1, -1, -1, -1,
						   -1, -1)))));
4166 | } |
4167 | else |
	      static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4169 | } // }}} |
4170 | else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) |
4171 | { // AVX -> SSE {{{ |
4172 | if constexpr (_FromBytes == _ToBytes) |
4173 | { // keep low 1/2 |
4174 | return __intrin_bitcast<_To>(__lo128(__k)); |
4175 | } |
4176 | else if constexpr (_FromBytes == _ToBytes * 2) |
4177 | { // keep all |
4178 | auto __y = __vector_bitcast<_LLong>(__k); |
4179 | return __intrin_bitcast<_To>( |
4180 | _mm_packs_epi16(__lo128(__y), __hi128(__y))); |
4181 | } |
4182 | else if constexpr (_FromBytes == _ToBytes * 4) |
4183 | { // add 1/2 undef |
4184 | auto __y = __vector_bitcast<_LLong>(__k); |
4185 | return __intrin_bitcast<_To>( |
4186 | _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), |
4187 | __m128i())); |
4188 | } |
4189 | else if constexpr (_FromBytes == 8 && _ToBytes == 1) |
4190 | { // add 3/4 undef |
4191 | auto __y = __vector_bitcast<_LLong>(__k); |
4192 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
4193 | _mm_packs_epi16(__lo128(__y), __hi128(__y)), |
		  _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
				-1, -1, -1, -1)));
4196 | } |
4197 | else if constexpr (_FromBytes * 2 == _ToBytes) |
4198 | { // keep low 1/4 |
4199 | auto __y = __lo128(__vector_bitcast<_LLong>(__k)); |
4200 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); |
4201 | } |
4202 | else if constexpr (_FromBytes * 4 == _ToBytes) |
4203 | { // keep low 1/8 |
4204 | auto __y = __lo128(__vector_bitcast<_LLong>(__k)); |
4205 | __y = _mm_unpacklo_epi8(__y, __y); |
4206 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); |
4207 | } |
4208 | else if constexpr (_FromBytes * 8 == _ToBytes) |
4209 | { // keep low 1/16 |
4210 | auto __y = __lo128(__vector_bitcast<_LLong>(__k)); |
4211 | __y = _mm_unpacklo_epi8(__y, __y); |
4212 | __y = _mm_unpacklo_epi8(__y, __y); |
4213 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); |
4214 | } |
4215 | else |
	      static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4217 | } // }}} |
4218 | else |
4219 | return _Base::template _S_to_maskvector<_Up, _ToN>(__x); |
4220 | /* |
4221 | if constexpr (_FromBytes > _ToBytes) { |
4222 | const _To __y = __vector_bitcast<_Up>(__k); |
4223 | return [&] <size_t... _Is> (index_sequence<_Is...>) { |
4224 | constexpr int _Stride = _FromBytes / _ToBytes; |
4225 | return _To{__y[(_Is + 1) * _Stride - 1]...}; |
4226 | }(make_index_sequence<std::min(_ToN, _FromN)>()); |
4227 | } else { |
4228 | // {0, 0, 1, 1} (_Dups = 2, _Is<4>) |
4229 | // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) |
4230 | // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) |
4231 | // ... |
4232 | return [&] <size_t... _Is> (index_sequence<_Is...>) { |
4233 | constexpr int __dup = _ToBytes / _FromBytes; |
4234 | return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); |
4235 | }(make_index_sequence<_FromN>()); |
4236 | } |
4237 | */ |
4238 | } // }}} |
4239 | } |
4240 | |
4241 | // }}} |
4242 | // _S_to_bits {{{ |
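  // Convert a vector mask (or a bool bit-mask) into a _SanitizedBitMask<_Np>,
  // preferring vpmov*2m / movemask-style instructions where available.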
4243 | template <typename _Tp, size_t _Np> |
4244 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> |
4245 | _S_to_bits(_SimdWrapper<_Tp, _Np> __x) |
4246 | { |
4247 | if constexpr (is_same_v<_Tp, bool>) |
4248 | return _BitMask<_Np>(__x._M_data)._M_sanitized(); |
4249 | else |
4250 | { |
4251 | static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); |
4252 | if (__builtin_is_constant_evaluated() |
4253 | || __builtin_constant_p(__x._M_data)) |
4254 | { |
4255 | const auto __bools = -__x._M_data; |
4256 | const _ULLong __k = __call_with_n_evaluations<_Np>( |
4257 | [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
4258 | return (__bits | ...); |
4259 | }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
4260 | return _ULLong(__bools[+__i]) << __i; |
4261 | }); |
4262 | if (__builtin_is_constant_evaluated() |
4263 | || __builtin_constant_p(__k)) |
4264 | return __k; |
4265 | } |
4266 | const auto __xi = __to_intrin(__x); |
4267 | if constexpr (sizeof(_Tp) == 1) |
4268 | if constexpr (sizeof(__xi) == 16) |
4269 | if constexpr (__have_avx512bw_vl) |
4270 | return _BitMask<_Np>(_mm_movepi8_mask(__xi)); |
4271 | else // implies SSE2 |
4272 | return _BitMask<_Np>(_mm_movemask_epi8(__xi)); |
4273 | else if constexpr (sizeof(__xi) == 32) |
4274 | if constexpr (__have_avx512bw_vl) |
4275 | return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); |
4276 | else // implies AVX2 |
4277 | return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); |
4278 | else // implies AVX512BW |
4279 | return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); |
4280 | |
4281 | else if constexpr (sizeof(_Tp) == 2) |
4282 | if constexpr (sizeof(__xi) == 16) |
4283 | if constexpr (__have_avx512bw_vl) |
4284 | return _BitMask<_Np>(_mm_movepi16_mask(__xi)); |
4285 | else if constexpr (__have_avx512bw) |
4286 | return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); |
4287 | else // implies SSE2 |
4288 | return _BitMask<_Np>( |
4289 | _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); |
4290 | else if constexpr (sizeof(__xi) == 32) |
4291 | if constexpr (__have_avx512bw_vl) |
4292 | return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); |
4293 | else if constexpr (__have_avx512bw) |
4294 | return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); |
4295 | else // implies SSE2 |
4296 | return _BitMask<_Np>(_mm_movemask_epi8( |
4297 | _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); |
4298 | else // implies AVX512BW |
4299 | return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); |
4300 | |
4301 | else if constexpr (sizeof(_Tp) == 4) |
4302 | if constexpr (sizeof(__xi) == 16) |
4303 | if constexpr (__have_avx512dq_vl) |
4304 | return _BitMask<_Np>(_mm_movepi32_mask(__xi)); |
4305 | else if constexpr (__have_avx512vl) |
4306 | return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); |
4307 | else if constexpr (__have_avx512dq) |
4308 | return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); |
4309 | else if constexpr (__have_avx512f) |
4310 | return _BitMask<_Np>( |
4311 | _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); |
4312 | else // implies SSE |
4313 | return _BitMask<_Np>( |
		      _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4315 | else if constexpr (sizeof(__xi) == 32) |
4316 | if constexpr (__have_avx512dq_vl) |
4317 | return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); |
4318 | else if constexpr (__have_avx512dq) |
4319 | return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); |
4320 | else if constexpr (__have_avx512vl) |
4321 | return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); |
4322 | else if constexpr (__have_avx512f) |
4323 | return _BitMask<_Np>( |
4324 | _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); |
4325 | else // implies AVX |
4326 | return _BitMask<_Np>( |
		      _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
	      else // implies AVX512F
4329 | if constexpr (__have_avx512dq) |
4330 | return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); |
4331 | else // implies AVX512F |
4332 | return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); |
4333 | |
4334 | else if constexpr (sizeof(_Tp) == 8) |
4335 | if constexpr (sizeof(__xi) == 16) |
4336 | if constexpr (__have_avx512dq_vl) |
4337 | return _BitMask<_Np>(_mm_movepi64_mask(__xi)); |
4338 | else if constexpr (__have_avx512dq) |
4339 | return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); |
4340 | else if constexpr (__have_avx512vl) |
4341 | return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); |
4342 | else if constexpr (__have_avx512f) |
4343 | return _BitMask<_Np>( |
4344 | _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); |
4345 | else // implies SSE2 |
4346 | return _BitMask<_Np>( |
		      _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4348 | else if constexpr (sizeof(__xi) == 32) |
4349 | if constexpr (__have_avx512dq_vl) |
4350 | return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); |
4351 | else if constexpr (__have_avx512dq) |
4352 | return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); |
4353 | else if constexpr (__have_avx512vl) |
4354 | return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); |
4355 | else if constexpr (__have_avx512f) |
4356 | return _BitMask<_Np>( |
4357 | _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); |
4358 | else // implies AVX |
4359 | return _BitMask<_Np>( |
		      _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
	      else // implies AVX512F
4362 | if constexpr (__have_avx512dq) |
4363 | return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); |
4364 | else // implies AVX512F |
4365 | return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); |
4366 | |
4367 | else |
4368 | __assert_unreachable<_Tp>(); |
4369 | } |
4370 | } |
4371 | // }}} |
4372 | }; |
4373 | |
4374 | // }}} |
4375 | // _MaskImplX86 {{{ |
4376 | template <typename _Abi> |
4377 | struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> |
4378 | { |
4379 | using _MaskImplX86Mixin::_S_to_bits; |
4380 | using _MaskImplX86Mixin::_S_to_maskvector; |
4381 | using _MaskImplBuiltin<_Abi>::_S_convert; |
4382 | |
4383 | // member types {{{ |
4384 | template <typename _Tp> |
4385 | using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; |
4386 | |
4387 | template <typename _Tp> |
4388 | using _MaskMember = typename _Abi::template _MaskMember<_Tp>; |
4389 | |
4390 | template <typename _Tp> |
4391 | static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; |
4392 | |
4393 | using _Base = _MaskImplBuiltin<_Abi>; |
4394 | |
4395 | // }}} |
4396 | // _S_broadcast {{{ |
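    // Broadcast a single bool to all elements; for AVX-512 ABIs the result is
    // a bit-mask restricted to the ABI's implicit mask.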
4397 | template <typename _Tp> |
4398 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
4399 | _S_broadcast(bool __x) |
4400 | { |
4401 | if constexpr (__is_avx512_abi<_Abi>()) |
4402 | return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1)) |
4403 | : _MaskMember<_Tp>(); |
4404 | else |
4405 | return _Base::template _S_broadcast<_Tp>(__x); |
4406 | } |
4407 | |
4408 | // }}} |
4409 | // _S_load {{{ |
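    // Load _S_size<_Tp> bool values from __mem and convert them to the ABI's
    // mask representation (bit-mask for AVX-512 ABIs, vector mask otherwise).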
4410 | template <typename _Tp> |
4411 | _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> |
4412 | _S_load(const bool* __mem) |
4413 | { |
4414 | static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); |
4415 | if (__builtin_is_constant_evaluated()) |
4416 | { |
4417 | if constexpr (__is_avx512_abi<_Abi>()) |
4418 | { |
4419 | _MaskMember<_Tp> __r{}; |
4420 | for (size_t __i = 0; __i < _S_size<_Tp>; ++__i) |
4421 | __r._M_data |= _ULLong(__mem[__i]) << __i; |
4422 | return __r; |
4423 | } |
4424 | else |
4425 | return _Base::template _S_load<_Tp>(__mem); |
4426 | } |
4427 | else if constexpr (__have_avx512bw) |
4428 | { |
4429 | const auto __to_vec_or_bits |
4430 | = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { |
4431 | if constexpr (__is_avx512_abi<_Abi>()) |
4432 | return __bits; |
4433 | else |
4434 | return _S_to_maskvector<_Tp>( |
4435 | _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); |
4436 | }; |
4437 | |
4438 | if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) |
4439 | { |
4440 | __m128i __a = {}; |
4441 | __builtin_memcpy(&__a, __mem, _S_size<_Tp>); |
	      return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4443 | } |
4444 | else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) |
4445 | { |
4446 | __m256i __a = {}; |
4447 | __builtin_memcpy(&__a, __mem, _S_size<_Tp>); |
	      return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4449 | } |
4450 | else if constexpr (_S_size<_Tp> <= 64) |
4451 | { |
4452 | __m512i __a = {}; |
4453 | __builtin_memcpy(&__a, __mem, _S_size<_Tp>); |
	      return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4455 | } |
4456 | } |
4457 | else if constexpr (__is_avx512_abi<_Abi>()) |
4458 | { |
4459 | if constexpr (_S_size<_Tp> <= 8) |
4460 | { |
4461 | __m128i __a = {}; |
4462 | __builtin_memcpy(&__a, __mem, _S_size<_Tp>); |
	      const auto __b = _mm512_cvtepi8_epi64(__a);
	      return _mm512_test_epi64_mask(__b, __b);
4465 | } |
4466 | else if constexpr (_S_size<_Tp> <= 16) |
4467 | { |
4468 | __m128i __a = {}; |
4469 | __builtin_memcpy(&__a, __mem, _S_size<_Tp>); |
	      const auto __b = _mm512_cvtepi8_epi32(__a);
	      return _mm512_test_epi32_mask(__b, __b);
4472 | } |
4473 | else if constexpr (_S_size<_Tp> <= 32) |
4474 | { |
4475 | __m128i __a = {}; |
4476 | __builtin_memcpy(&__a, __mem, 16); |
	      const auto __b = _mm512_cvtepi8_epi32(__a);
4478 | __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); |
	      const auto __c = _mm512_cvtepi8_epi32(__a);
	      return _mm512_test_epi32_mask(__b, __b)
		       | (_mm512_test_epi32_mask(__c, __c) << 16);
4482 | } |
4483 | else if constexpr (_S_size<_Tp> <= 64) |
4484 | { |
4485 | __m128i __a = {}; |
4486 | __builtin_memcpy(&__a, __mem, 16); |
	      const auto __b = _mm512_cvtepi8_epi32(__a);
4488 | __builtin_memcpy(&__a, __mem + 16, 16); |
	      const auto __c = _mm512_cvtepi8_epi32(__a);
4490 | if constexpr (_S_size<_Tp> <= 48) |
4491 | { |
4492 | __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); |
		  const auto __d = _mm512_cvtepi8_epi32(__a);
		  return _mm512_test_epi32_mask(__b, __b)
			   | (_mm512_test_epi32_mask(__c, __c) << 16)
			   | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4497 | } |
4498 | else |
4499 | { |
		  __builtin_memcpy(&__a, __mem + 32, 16);
		  const auto __d = _mm512_cvtepi8_epi32(__a);
		  __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
		  const auto __e = _mm512_cvtepi8_epi32(__a);
		  return _mm512_test_epi32_mask(__b, __b)
			   | (_mm512_test_epi32_mask(__c, __c) << 16)
			   | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
			   | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4508 | } |
4509 | } |
4510 | else |
4511 | __assert_unreachable<_Tp>(); |
4512 | } |
4513 | else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) |
4514 | return __vector_bitcast<_Tp>( |
4515 | __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]), |
4516 | -int(__mem[1]), -int(__mem[1])}); |
4517 | else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) |
4518 | { |
4519 | int __bool4 = 0; |
4520 | __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); |
4521 | const auto __k = __to_intrin( |
	    (__vector_broadcast<4>(__bool4)
4523 | & __make_vector<int>(0x1, 0x100, 0x10000, |
4524 | _S_size<_Tp> == 4 ? 0x1000000 : 0)) |
4525 | != 0); |
4526 | return __vector_bitcast<_Tp>( |
4527 | __concat(_mm_unpacklo_epi32(__k, __k), |
4528 | _mm_unpackhi_epi32(__k, __k))); |
4529 | } |
4530 | else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4) |
4531 | { |
4532 | int __bools = 0; |
4533 | __builtin_memcpy(&__bools, __mem, _S_size<_Tp>); |
4534 | if constexpr (__have_sse2) |
4535 | { |
	      __m128i __k = _mm_cvtsi32_si128(__bools);
	      __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
	      return __vector_bitcast<_Tp, _S_size<_Tp>>(
		_mm_unpacklo_epi16(__k, __k));
4540 | } |
4541 | else |
4542 | { |
	      __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
	      _mm_empty();
	      return __vector_bitcast<_Tp, _S_size<_Tp>>(
		_mm_cmpgt_ps(__k, __m128()));
4547 | } |
4548 | } |
4549 | else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8) |
4550 | { |
4551 | __m128i __k = {}; |
4552 | __builtin_memcpy(&__k, __mem, _S_size<_Tp>); |
	    __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
	    return __vector_bitcast<_Tp>(
	      __concat(_mm_unpacklo_epi16(__k, __k),
		       _mm_unpackhi_epi16(__k, __k)));
4557 | } |
4558 | else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16) |
4559 | { |
4560 | __m128i __k = {}; |
4561 | __builtin_memcpy(&__k, __mem, _S_size<_Tp>); |
	    __k = _mm_cmpgt_epi8(__k, __m128i());
	    if constexpr (_S_size<_Tp> <= 8)
	      return __vector_bitcast<_Tp, _S_size<_Tp>>(
		_mm_unpacklo_epi8(__k, __k));
	    else
	      return __concat(_mm_unpacklo_epi8(__k, __k),
			      _mm_unpackhi_epi8(__k, __k));
4569 | } |
4570 | else |
4571 | return _Base::template _S_load<_Tp>(__mem); |
4572 | } |
4573 | |
4574 | // }}} |
4575 | // _S_from_bitmask{{{ |
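    // Convert a sanitized bit-mask into the ABI's mask representation.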
4576 | template <size_t _Np, typename _Tp> |
4577 | _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> |
4578 | _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) |
4579 | { |
4580 | static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); |
4581 | if constexpr (__is_avx512_abi<_Abi>()) |
4582 | return __bits._M_to_bits(); |
4583 | else |
4584 | return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); |
4585 | } |
4586 | |
4587 | // }}} |
4588 | // _S_masked_load {{{2 |
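    // Load bools only where __mask is set, merging them into __merge. Where
    // AVX512BW/VL is available, the 0/1 bytes are fetched with a masked load
    // and converted either via test_epi8_mask (bit-mask ABIs) or via mask_sub
    // (0 - 1 yields all-ones) for vector masks.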
4589 | template <typename _Tp, size_t _Np> |
4590 | static inline _SimdWrapper<_Tp, _Np> |
4591 | _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, |
4592 | _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept |
4593 | { |
4594 | if constexpr (__is_avx512_abi<_Abi>()) |
4595 | { |
4596 | if constexpr (__have_avx512bw_vl) |
4597 | { |
4598 | if constexpr (_Np <= 16) |
4599 | { |
4600 | const auto __a |
4601 | = _mm_mask_loadu_epi8(__m128i(), __mask, __mem); |
4602 | return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a); |
4603 | } |
4604 | else if constexpr (_Np <= 32) |
4605 | { |
4606 | const auto __a |
4607 | = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem); |
4608 | return (__merge & ~__mask) |
4609 | | _mm256_test_epi8_mask(__a, __a); |
4610 | } |
4611 | else if constexpr (_Np <= 64) |
4612 | { |
4613 | const auto __a |
4614 | = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem); |
4615 | return (__merge & ~__mask) |
4616 | | _mm512_test_epi8_mask(__a, __a); |
4617 | } |
4618 | else |
4619 | __assert_unreachable<_Tp>(); |
4620 | } |
4621 | else |
4622 | { |
4623 | _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
4624 | __merge._M_set(__i, __mem[__i]); |
4625 | }); |
4626 | return __merge; |
4627 | } |
4628 | } |
4629 | else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1) |
4630 | { |
4631 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4632 | __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(), |
4633 | _mm256_mask_loadu_epi8(__m256i(), |
4634 | __k, __mem)); |
4635 | } |
4636 | else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1) |
4637 | { |
4638 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4639 | __merge |
4640 | = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k, |
4641 | __m128i(), |
4642 | _mm_mask_loadu_epi8(__m128i(), __k, __mem)); |
4643 | } |
4644 | else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2) |
4645 | { |
4646 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4647 | __merge = _mm256_mask_sub_epi16( |
4648 | __vector_bitcast<_LLong>(__merge), __k, __m256i(), |
4649 | _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); |
4650 | } |
4651 | else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2) |
4652 | { |
4653 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4654 | __merge = _mm_mask_sub_epi16( |
4655 | __vector_bitcast<_LLong>(__merge), __k, __m128i(), |
4656 | _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); |
4657 | } |
4658 | else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4) |
4659 | { |
4660 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4661 | __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32( |
4662 | __vector_bitcast<_LLong>(__merge), __k, __m256i(), |
4663 | _mm256_cvtepi8_epi32( |
4664 | _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); |
4665 | } |
4666 | else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4) |
4667 | { |
4668 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4669 | __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32( |
4670 | __vector_bitcast<_LLong>(__merge), __k, __m128i(), |
4671 | _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); |
4672 | } |
4673 | else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8) |
4674 | { |
4675 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4676 | __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64( |
4677 | __vector_bitcast<_LLong>(__merge), __k, __m256i(), |
4678 | _mm256_cvtepi8_epi64( |
4679 | _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); |
4680 | } |
4681 | else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8) |
4682 | { |
4683 | const auto __k = _S_to_bits(__mask)._M_to_bits(); |
4684 | __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64( |
4685 | __vector_bitcast<_LLong>(__merge), __k, __m128i(), |
4686 | _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); |
4687 | } |
4688 | else |
4689 | return _Base::_S_masked_load(__merge, __mask, __mem); |
4690 | return __merge; |
4691 | } |
4692 | |
4693 | // _S_store {{{2 |
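    // Store the mask as an array of bool, i.e. one byte holding 0 or 1 per
    // element.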
4694 | template <typename _Tp, size_t _Np> |
4695 | _GLIBCXX_SIMD_INTRINSIC static constexpr void |
4696 | _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept |
4697 | { |
4698 | if (__builtin_is_constant_evaluated()) |
4699 | _Base::_S_store(__v, __mem); |
4700 | else if constexpr (__is_avx512_abi<_Abi>()) |
4701 | { |
4702 | if constexpr (__have_avx512bw_vl) |
4703 | _CommonImplX86::_S_store<_Np>( |
4704 | __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
4705 | if constexpr (_Np <= 16) |
4706 | return _mm_maskz_set1_epi8(__data, 1); |
4707 | else if constexpr (_Np <= 32) |
4708 | return _mm256_maskz_set1_epi8(__data, 1); |
4709 | else |
4710 | return _mm512_maskz_set1_epi8(__data, 1); |
4711 | }(__v._M_data)), |
4712 | __mem); |
4713 | else if constexpr (_Np <= 8) |
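	    // _pdep_u64 deposits one mask bit into the low bit of each byte
	    // (deposit mask 0x0101010101010101), producing the bool bytes
	    // directly.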
4714 | _CommonImplX86::_S_store<_Np>( |
4715 | __vector_bitcast<char>( |
4716 | #if defined __x86_64__ |
4717 | __make_wrapper<_ULLong>( |
4718 | _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull) |
4719 | #else |
4720 | __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U), |
4721 | _pdep_u32(__v._M_data >> 4, |
4722 | 0x01010101U)) |
4723 | #endif |
4724 | ), |
4725 | __mem); |
4726 | else if constexpr (_Np <= 16) |
4727 | _mm512_mask_cvtepi32_storeu_epi8( |
4728 | __mem, 0xffffu >> (16 - _Np), |
4729 | _mm512_maskz_set1_epi32(__v._M_data, 1)); |
4730 | else |
4731 | __assert_unreachable<_Tp>(); |
4732 | } |
4733 | else if constexpr (__is_sse_abi<_Abi>()) //{{{ |
4734 | { |
4735 | if constexpr (_Np == 2 && sizeof(_Tp) == 8) |
4736 | { |
4737 | const auto __k = __vector_bitcast<int>(__v); |
4738 | __mem[0] = -__k[1]; |
4739 | __mem[1] = -__k[3]; |
4740 | } |
4741 | else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) |
4742 | { |
4743 | if constexpr (__have_sse2) |
4744 | { |
4745 | const unsigned __bool4 |
4746 | = __vector_bitcast<_UInt>(_mm_packs_epi16( |
4747 | _mm_packs_epi32(__intrin_bitcast<__m128i>( |
4748 | __to_intrin(__v)), |
4749 | __m128i()), |
4750 | __m128i()))[0] |
4751 | & 0x01010101u; |
4752 | __builtin_memcpy(__mem, &__bool4, _Np); |
4753 | } |
4754 | else if constexpr (__have_mmx) |
4755 | { |
4756 | const __m64 __k = _mm_cvtps_pi8( |
		  __and(__to_intrin(__v), _mm_set1_ps(1.f)));
4758 | __builtin_memcpy(__mem, &__k, _Np); |
4759 | _mm_empty(); |
4760 | } |
4761 | else |
4762 | return _Base::_S_store(__v, __mem); |
4763 | } |
4764 | else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) |
4765 | { |
4766 | _CommonImplX86::_S_store<_Np>( |
4767 | __vector_bitcast<char>(_mm_packs_epi16( |
4768 | __to_intrin(__vector_bitcast<_UShort>(__v) >> 15), |
4769 | __m128i())), |
4770 | __mem); |
4771 | } |
4772 | else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) |
4773 | _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem); |
4774 | else |
4775 | __assert_unreachable<_Tp>(); |
4776 | } // }}} |
4777 | else if constexpr (__is_avx_abi<_Abi>()) // {{{ |
4778 | { |
4779 | if constexpr (_Np <= 4 && sizeof(_Tp) == 8) |
4780 | { |
4781 | auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); |
4782 | int __bool4{}; |
4783 | if constexpr (__have_avx2) |
4784 | __bool4 = _mm256_movemask_epi8(__k); |
4785 | else |
4786 | __bool4 = (_mm_movemask_epi8(__lo128(__k)) |
4787 | | (_mm_movemask_epi8(__hi128(__k)) << 16)); |
4788 | __bool4 &= 0x01010101; |
4789 | __builtin_memcpy(__mem, &__bool4, _Np); |
4790 | } |
4791 | else if constexpr (_Np <= 8 && sizeof(_Tp) == 4) |
4792 | { |
4793 | const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); |
4794 | const auto __k2 |
4795 | = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)), |
4796 | 15); |
4797 | const auto __k3 |
4798 | = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i())); |
4799 | _CommonImplX86::_S_store<_Np>(__k3, __mem); |
4800 | } |
4801 | else if constexpr (_Np <= 16 && sizeof(_Tp) == 2) |
4802 | { |
4803 | if constexpr (__have_avx2) |
4804 | { |
4805 | const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15); |
4806 | const auto __bools = __vector_bitcast<char>( |
4807 | _mm_packs_epi16(__lo128(__x), __hi128(__x))); |
4808 | _CommonImplX86::_S_store<_Np>(__bools, __mem); |
4809 | } |
4810 | else |
4811 | { |
4812 | const auto __bools |
4813 | = 1 |
4814 | & __vector_bitcast<_UChar>( |
4815 | _mm_packs_epi16(__lo128(__to_intrin(__v)), |
4816 | __hi128(__to_intrin(__v)))); |
4817 | _CommonImplX86::_S_store<_Np>(__bools, __mem); |
4818 | } |
4819 | } |
4820 | else if constexpr (_Np <= 32 && sizeof(_Tp) == 1) |
4821 | _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem); |
4822 | else |
4823 | __assert_unreachable<_Tp>(); |
4824 | } // }}} |
4825 | else |
4826 | __assert_unreachable<_Tp>(); |
4827 | } |
4828 | |
4829 | // _S_masked_store {{{2 |
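    // Store only the elements of __v selected by __k as bool values.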
4830 | template <typename _Tp, size_t _Np> |
4831 | static inline void |
4832 | _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, |
4833 | const _SimdWrapper<_Tp, _Np> __k) noexcept |
4834 | { |
4835 | if constexpr (__is_avx512_abi<_Abi>()) |
4836 | { |
4837 | static_assert(is_same_v<_Tp, bool>); |
4838 | if constexpr (_Np <= 16 && __have_avx512bw_vl) |
4839 | _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1)); |
4840 | else if constexpr (_Np <= 16) |
4841 | _mm512_mask_cvtepi32_storeu_epi8(__mem, __k, |
4842 | _mm512_maskz_set1_epi32(__v, 1)); |
4843 | else if constexpr (_Np <= 32 && __have_avx512bw_vl) |
4844 | _mm256_mask_storeu_epi8(__mem, __k, |
4845 | _mm256_maskz_set1_epi8(__v, 1)); |
4846 | else if constexpr (_Np <= 32 && __have_avx512bw) |
4847 | _mm256_mask_storeu_epi8(__mem, __k, |
4848 | __lo256(_mm512_maskz_set1_epi8(__v, 1))); |
4849 | else if constexpr (_Np <= 64 && __have_avx512bw) |
4850 | _mm512_mask_storeu_epi8(__mem, __k, |
4851 | _mm512_maskz_set1_epi8(__v, 1)); |
4852 | else |
4853 | __assert_unreachable<_Tp>(); |
4854 | } |
4855 | else |
4856 | _Base::_S_masked_store(__v, __mem, __k); |
4857 | } |
4858 | |
4859 | // logical and bitwise operators {{{2 |
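    // For bool (AVX-512 bit-mask) operands these map directly onto the
    // k-register instructions; vector masks defer to the generic
    // implementation in _Base.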
4860 | template <typename _Tp, size_t _Np> |
4861 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
4862 | _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
4863 | { |
4864 | if constexpr (is_same_v<_Tp, bool>) |
4865 | { |
4866 | if (__builtin_is_constant_evaluated()) |
4867 | return __x._M_data & __y._M_data; |
4868 | else if constexpr (__have_avx512dq && _Np <= 8) |
4869 | return _kand_mask8(__x._M_data, __y._M_data); |
4870 | else if constexpr (_Np <= 16) |
4871 | return _kand_mask16(__x._M_data, __y._M_data); |
4872 | else if constexpr (__have_avx512bw && _Np <= 32) |
4873 | return _kand_mask32(__x._M_data, __y._M_data); |
4874 | else if constexpr (__have_avx512bw && _Np <= 64) |
4875 | return _kand_mask64(__x._M_data, __y._M_data); |
4876 | else |
4877 | __assert_unreachable<_Tp>(); |
4878 | } |
4879 | else |
4880 | return _Base::_S_logical_and(__x, __y); |
4881 | } |
4882 | |
4883 | template <typename _Tp, size_t _Np> |
4884 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
4885 | _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
4886 | { |
4887 | if constexpr (is_same_v<_Tp, bool>) |
4888 | { |
4889 | if (__builtin_is_constant_evaluated()) |
4890 | return __x._M_data | __y._M_data; |
4891 | else if constexpr (__have_avx512dq && _Np <= 8) |
4892 | return _kor_mask8(__x._M_data, __y._M_data); |
4893 | else if constexpr (_Np <= 16) |
4894 | return _kor_mask16(__x._M_data, __y._M_data); |
4895 | else if constexpr (__have_avx512bw && _Np <= 32) |
4896 | return _kor_mask32(__x._M_data, __y._M_data); |
4897 | else if constexpr (__have_avx512bw && _Np <= 64) |
4898 | return _kor_mask64(__x._M_data, __y._M_data); |
4899 | else |
4900 | __assert_unreachable<_Tp>(); |
4901 | } |
4902 | else |
4903 | return _Base::_S_logical_or(__x, __y); |
4904 | } |
4905 | |
4906 | template <typename _Tp, size_t _Np> |
4907 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
4908 | _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) |
4909 | { |
4910 | if constexpr (is_same_v<_Tp, bool>) |
4911 | { |
4912 | if (__builtin_is_constant_evaluated()) |
4913 | return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>(); |
4914 | else if constexpr (__have_avx512dq && _Np <= 8) |
4915 | return _kandn_mask8(__x._M_data, |
4916 | _Abi::template __implicit_mask_n<_Np>()); |
4917 | else if constexpr (_Np <= 16) |
4918 | return _kandn_mask16(__x._M_data, |
4919 | _Abi::template __implicit_mask_n<_Np>()); |
4920 | else if constexpr (__have_avx512bw && _Np <= 32) |
4921 | return _kandn_mask32(__x._M_data, |
4922 | _Abi::template __implicit_mask_n<_Np>()); |
4923 | else if constexpr (__have_avx512bw && _Np <= 64) |
4924 | return _kandn_mask64(__x._M_data, |
4925 | _Abi::template __implicit_mask_n<_Np>()); |
4926 | else |
4927 | __assert_unreachable<_Tp>(); |
4928 | } |
4929 | else |
4930 | return _Base::_S_bit_not(__x); |
4931 | } |
4932 | |
4933 | template <typename _Tp, size_t _Np> |
4934 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
4935 | _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
4936 | { |
4937 | if constexpr (is_same_v<_Tp, bool>) |
4938 | { |
4939 | if (__builtin_is_constant_evaluated()) |
4940 | return __x._M_data & __y._M_data; |
4941 | else if constexpr (__have_avx512dq && _Np <= 8) |
4942 | return _kand_mask8(__x._M_data, __y._M_data); |
4943 | else if constexpr (_Np <= 16) |
4944 | return _kand_mask16(__x._M_data, __y._M_data); |
4945 | else if constexpr (__have_avx512bw && _Np <= 32) |
4946 | return _kand_mask32(__x._M_data, __y._M_data); |
4947 | else if constexpr (__have_avx512bw && _Np <= 64) |
4948 | return _kand_mask64(__x._M_data, __y._M_data); |
4949 | else |
4950 | __assert_unreachable<_Tp>(); |
4951 | } |
4952 | else |
4953 | return _Base::_S_bit_and(__x, __y); |
4954 | } |
4955 | |
4956 | template <typename _Tp, size_t _Np> |
4957 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
4958 | _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
4959 | { |
4960 | if constexpr (is_same_v<_Tp, bool>) |
4961 | { |
4962 | if (__builtin_is_constant_evaluated()) |
4963 | return __x._M_data | __y._M_data; |
4964 | else if constexpr (__have_avx512dq && _Np <= 8) |
4965 | return _kor_mask8(__x._M_data, __y._M_data); |
4966 | else if constexpr (_Np <= 16) |
4967 | return _kor_mask16(__x._M_data, __y._M_data); |
4968 | else if constexpr (__have_avx512bw && _Np <= 32) |
4969 | return _kor_mask32(__x._M_data, __y._M_data); |
4970 | else if constexpr (__have_avx512bw && _Np <= 64) |
4971 | return _kor_mask64(__x._M_data, __y._M_data); |
4972 | else |
4973 | __assert_unreachable<_Tp>(); |
4974 | } |
4975 | else |
4976 | return _Base::_S_bit_or(__x, __y); |
4977 | } |
4978 | |
4979 | template <typename _Tp, size_t _Np> |
4980 | _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> |
4981 | _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) |
4982 | { |
4983 | if constexpr (is_same_v<_Tp, bool>) |
4984 | { |
4985 | if (__builtin_is_constant_evaluated()) |
4986 | return __x._M_data ^ __y._M_data; |
4987 | else if constexpr (__have_avx512dq && _Np <= 8) |
4988 | return _kxor_mask8(__x._M_data, __y._M_data); |
4989 | else if constexpr (_Np <= 16) |
4990 | return _kxor_mask16(__x._M_data, __y._M_data); |
4991 | else if constexpr (__have_avx512bw && _Np <= 32) |
4992 | return _kxor_mask32(__x._M_data, __y._M_data); |
4993 | else if constexpr (__have_avx512bw && _Np <= 64) |
4994 | return _kxor_mask64(__x._M_data, __y._M_data); |
4995 | else |
4996 | __assert_unreachable<_Tp>(); |
4997 | } |
4998 | else |
4999 | return _Base::_S_bit_xor(__x, __y); |
5000 | } |
5001 | |
5002 | //}}}2 |
5003 | // _S_masked_assign{{{ |
5004 | template <size_t _Np> |
5005 | _GLIBCXX_SIMD_INTRINSIC static void |
5006 | _S_masked_assign(_SimdWrapper<bool, _Np> __k, |
5007 | _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs) |
5008 | { |
5009 | __lhs._M_data |
5010 | = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data); |
5011 | } |
5012 | |
5013 | template <size_t _Np> |
5014 | _GLIBCXX_SIMD_INTRINSIC static void |
5015 | _S_masked_assign(_SimdWrapper<bool, _Np> __k, |
5016 | _SimdWrapper<bool, _Np>& __lhs, bool __rhs) |
5017 | { |
5018 | if (__rhs) |
5019 | __lhs._M_data = __k._M_data | __lhs._M_data; |
5020 | else |
5021 | __lhs._M_data = ~__k._M_data & __lhs._M_data; |
5022 | } |
5023 | |
5024 | using _MaskImplBuiltin<_Abi>::_S_masked_assign; |
5025 | |
5026 | //}}} |
5027 | // _S_all_of {{{ |
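    // all_of: with SSE4.1/AVX, testc against the implicit mask checks that
    // every relevant bit is set; on AVX-512, kortestc of the mask with the
    // complement of the implicit mask is 1 iff their OR is all-ones.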
5028 | template <typename _Tp> |
5029 | _GLIBCXX_SIMD_INTRINSIC static bool |
5030 | _S_all_of(simd_mask<_Tp, _Abi> __k) |
5031 | { |
5032 | if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) |
5033 | { |
5034 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
5035 | using _TI = __intrinsic_type_t<_Tp, _Np>; |
5036 | const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); |
5037 | if constexpr (__have_sse4_1) |
5038 | { |
5039 | _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b |
5040 | = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
5041 | return 0 != __testc(__a, __b); |
5042 | } |
5043 | else if constexpr (is_same_v<_Tp, float>) |
5044 | return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) |
5045 | == (1 << _Np) - 1; |
5046 | else if constexpr (is_same_v<_Tp, double>) |
5047 | return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) |
5048 | == (1 << _Np) - 1; |
5049 | else |
5050 | return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) |
5051 | == (1 << (_Np * sizeof(_Tp))) - 1; |
5052 | } |
5053 | else if constexpr (__is_avx512_abi<_Abi>()) |
5054 | { |
5055 | constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>(); |
5056 | const auto __kk = __k._M_data._M_data; |
5057 | if constexpr (sizeof(__kk) == 1) |
5058 | { |
5059 | if constexpr (__have_avx512dq) |
5060 | return _kortestc_mask8_u8(__kk, _Mask == 0xff |
5061 | ? __kk |
5062 | : __mmask8(~_Mask)); |
5063 | else |
5064 | return _kortestc_mask16_u8(__kk, __mmask16(~_Mask)); |
5065 | } |
5066 | else if constexpr (sizeof(__kk) == 2) |
5067 | return _kortestc_mask16_u8(__kk, _Mask == 0xffff |
5068 | ? __kk |
5069 | : __mmask16(~_Mask)); |
5070 | else if constexpr (sizeof(__kk) == 4 && __have_avx512bw) |
5071 | return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU |
5072 | ? __kk |
5073 | : __mmask32(~_Mask)); |
5074 | else if constexpr (sizeof(__kk) == 8 && __have_avx512bw) |
5075 | return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL |
5076 | ? __kk |
5077 | : __mmask64(~_Mask)); |
5078 | else |
5079 | __assert_unreachable<_Tp>(); |
5080 | } |
5081 | } |
5082 | |
5083 | // }}} |
5084 | // _S_any_of {{{ |
5085 | template <typename _Tp> |
5086 | _GLIBCXX_SIMD_INTRINSIC static bool |
5087 | _S_any_of(simd_mask<_Tp, _Abi> __k) |
5088 | { |
5089 | if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) |
5090 | { |
5091 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
5092 | using _TI = __intrinsic_type_t<_Tp, _Np>; |
5093 | const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); |
5094 | if constexpr (__have_sse4_1) |
5095 | { |
5096 | if constexpr (_Abi::template _S_is_partial< |
5097 | _Tp> || sizeof(__k) < 16) |
5098 | { |
5099 | _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b |
5100 | = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
5101 | return 0 == __testz(__a, __b); |
5102 | } |
5103 | else |
5104 | return 0 == __testz(__a, __a); |
5105 | } |
5106 | else if constexpr (is_same_v<_Tp, float>) |
5107 | return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0; |
5108 | else if constexpr (is_same_v<_Tp, double>) |
5109 | return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0; |
5110 | else |
5111 | return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) |
5112 | != 0; |
5113 | } |
5114 | else if constexpr (__is_avx512_abi<_Abi>()) |
5115 | return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) |
5116 | != 0; |
5117 | } |
5118 | |
5119 | // }}} |
5120 | // _S_none_of {{{ |
5121 | template <typename _Tp> |
5122 | _GLIBCXX_SIMD_INTRINSIC static bool |
5123 | _S_none_of(simd_mask<_Tp, _Abi> __k) |
5124 | { |
5125 | if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) |
5126 | { |
5127 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
5128 | using _TI = __intrinsic_type_t<_Tp, _Np>; |
5129 | const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); |
5130 | if constexpr (__have_sse4_1) |
5131 | { |
5132 | if constexpr (_Abi::template _S_is_partial< |
5133 | _Tp> || sizeof(__k) < 16) |
5134 | { |
5135 | _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b |
5136 | = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
5137 | return 0 != __testz(__a, __b); |
5138 | } |
5139 | else |
5140 | return 0 != __testz(__a, __a); |
5141 | } |
5142 | else if constexpr (is_same_v<_Tp, float>) |
5143 | return (__movemask(__a) & ((1 << _Np) - 1)) == 0; |
5144 | else if constexpr (is_same_v<_Tp, double>) |
5145 | return (__movemask(__a) & ((1 << _Np) - 1)) == 0; |
5146 | else |
5147 | return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1)) |
5148 | == 0; |
5149 | } |
5150 | else if constexpr (__is_avx512_abi<_Abi>()) |
5151 | return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) |
5152 | == 0; |
5153 | } |
5154 | |
5155 | // }}} |
5156 | // _S_some_of {{{ |
5157 | template <typename _Tp> |
5158 | _GLIBCXX_SIMD_INTRINSIC static bool |
5159 | _S_some_of(simd_mask<_Tp, _Abi> __k) |
5160 | { |
5161 | if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) |
5162 | { |
5163 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
5164 | using _TI = __intrinsic_type_t<_Tp, _Np>; |
5165 | const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); |
5166 | if constexpr (__have_sse4_1) |
5167 | { |
5168 | _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b |
5169 | = _Abi::template _S_implicit_mask_intrin<_Tp>(); |
5170 | return 0 != __testnzc(__a, __b); |
5171 | } |
5172 | else if constexpr (is_same_v<_Tp, float>) |
5173 | { |
5174 | constexpr int __allbits = (1 << _Np) - 1; |
5175 | const auto __tmp = _mm_movemask_ps(__a) & __allbits; |
5176 | return __tmp > 0 && __tmp < __allbits; |
5177 | } |
5178 | else if constexpr (is_same_v<_Tp, double>) |
5179 | { |
5180 | constexpr int __allbits = (1 << _Np) - 1; |
5181 | const auto __tmp = _mm_movemask_pd(__a) & __allbits; |
5182 | return __tmp > 0 && __tmp < __allbits; |
5183 | } |
5184 | else |
5185 | { |
5186 | constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1; |
5187 | const auto __tmp = _mm_movemask_epi8(__a) & __allbits; |
5188 | return __tmp > 0 && __tmp < __allbits; |
5189 | } |
5190 | } |
5191 | else if constexpr (__is_avx512_abi<_Abi>()) |
5192 | return _S_any_of(__k) && !_S_all_of(__k); |
5193 | else |
5194 | __assert_unreachable<_Tp>(); |
5195 | } |
5196 | |
5197 | // }}} |
5198 | // _S_popcount {{{ |
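    // Count the true elements. Without popcnt, most vector-mask cases sum the
    // 0/-1 elements horizontally and negate the result.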
5199 | template <typename _Tp> |
5200 | _GLIBCXX_SIMD_INTRINSIC static int |
5201 | _S_popcount(simd_mask<_Tp, _Abi> __k) |
5202 | { |
5203 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
5204 | const auto __kk = _Abi::_S_masked(__k._M_data)._M_data; |
5205 | if constexpr (__is_avx512_abi<_Abi>()) |
5206 | { |
5207 | if constexpr (_Np > 32) |
5208 | return __builtin_popcountll(__kk); |
5209 | else |
5210 | return __builtin_popcount(__kk); |
5211 | } |
5212 | else |
5213 | { |
5214 | if constexpr (__have_popcnt) |
5215 | { |
5216 | int __bits |
5217 | = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk))); |
5218 | const int __count = __builtin_popcount(__bits); |
5219 | return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count; |
5220 | } |
5221 | else if constexpr (_Np == 2 && sizeof(_Tp) == 8) |
5222 | { |
	      const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
	      return __mask - (__mask >> 1);
5225 | } |
5226 | else if constexpr (_Np <= 4 && sizeof(_Tp) == 8) |
5227 | { |
5228 | auto __x = -(__lo128(__kk) + __hi128(__kk)); |
5229 | return __x[0] + __x[1]; |
5230 | } |
5231 | else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) |
5232 | { |
5233 | if constexpr (__have_sse2) |
5234 | { |
5235 | __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk)); |
		  __x = _mm_add_epi32(
		    __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
		  __x = _mm_add_epi32(
		    __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
		  return -_mm_cvtsi128_si32(__x);
5241 | } |
5242 | else |
5243 | return __builtin_popcount( |
5244 | _mm_movemask_ps(__auto_bitcast(__kk))); |
5245 | } |
5246 | else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) |
5247 | { |
5248 | auto __x = __to_intrin(__kk); |
5249 | __x = _mm_add_epi16(__x, |
5250 | _mm_shuffle_epi32(__x, |
5251 | _MM_SHUFFLE(0, 1, 2, 3))); |
5252 | __x = _mm_add_epi16( |
5253 | __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3))); |
5254 | __x = _mm_add_epi16( |
5255 | __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1))); |
5256 | return -short(_mm_extract_epi16(__x, 0)); |
5257 | } |
5258 | else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) |
5259 | { |
5260 | auto __x = __to_intrin(__kk); |
5261 | __x = _mm_add_epi8(__x, |
5262 | _mm_shuffle_epi32(__x, |
5263 | _MM_SHUFFLE(0, 1, 2, 3))); |
5264 | __x = _mm_add_epi8(__x, |
5265 | _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, |
5266 | 3))); |
5267 | __x = _mm_add_epi8(__x, |
5268 | _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, |
5269 | 1))); |
5270 | auto __y = -__vector_bitcast<_UChar>(__x); |
5271 | if constexpr (__have_sse4_1) |
5272 | return __y[0] + __y[1]; |
5273 | else |
5274 | { |
5275 | unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0); |
5276 | return (__z & 0xff) + (__z >> 8); |
5277 | } |
5278 | } |
5279 | else if constexpr (sizeof(__kk) == 32) |
5280 | { |
5281 | // The following works only as long as the implementations above |
5282 | // use a summation |
5283 | using _I = __int_for_sizeof_t<_Tp>; |
5284 | const auto __as_int = __vector_bitcast<_I>(__kk); |
	      return _MaskImplX86<simd_abi::__sse>::_S_popcount(
5286 | simd_mask<_I, simd_abi::__sse>(__private_init, |
5287 | __lo128(__as_int) |
5288 | + __hi128(__as_int))); |
5289 | } |
5290 | else |
5291 | __assert_unreachable<_Tp>(); |
5292 | } |
5293 | } |
5294 | |
5295 | // }}} |
5296 | // _S_find_first_set {{{ |
5297 | template <typename _Tp> |
5298 | _GLIBCXX_SIMD_INTRINSIC static int |
5299 | _S_find_first_set(simd_mask<_Tp, _Abi> __k) |
5300 | { |
5301 | if constexpr (__is_avx512_abi<_Abi>()) |
5302 | return std::__countr_zero(__k._M_data._M_data); |
5303 | else |
5304 | return _Base::_S_find_first_set(__k); |
5305 | } |
5306 | |
5307 | // }}} |
5308 | // _S_find_last_set {{{ |
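    // Index of the last true element; for a bit-mask this is bit_width - 1.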
5309 | template <typename _Tp> |
5310 | _GLIBCXX_SIMD_INTRINSIC static int |
5311 | _S_find_last_set(simd_mask<_Tp, _Abi> __k) |
5312 | { |
5313 | if constexpr (__is_avx512_abi<_Abi>()) |
5314 | return std::__bit_width(__k._M_data._M_data) - 1; |
5315 | else |
5316 | return _Base::_S_find_last_set(__k); |
5317 | } |
5318 | |
5319 | // }}} |
5320 | }; |
5321 | |
5322 | // }}} |
5323 | |
5324 | _GLIBCXX_SIMD_END_NAMESPACE |
5325 | #endif // __cplusplus >= 201703L |
5326 | #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ |
5327 | |
5328 | // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80 |
5329 | |