1 | // x86 specific conversion optimizations -*- C++ -*- |
2 | |
3 | // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
26 | #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
27 | |
28 | #if __cplusplus >= 201703L |
29 | |
30 | // work around PR85827 |
31 | // 1-arg __convert_x86 {{{1 |
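// Convert the _Np elements of __v (of value type _Tp) to a _To vector holding
// _M elements of value type _Up. Each branch below selects the cheapest
// instruction sequence the enabled ISA extensions allow; anything not handled
// explicitly drops through to the element-wise __vector_convert fallback at
// the end of the function.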
32 | template <typename _To, typename _V, typename _Traits> |
33 | _GLIBCXX_SIMD_INTRINSIC _To |
34 | __convert_x86(_V __v) |
35 | { |
36 | static_assert(__is_vector_type_v<_V>); |
37 | using _Tp = typename _Traits::value_type; |
38 | constexpr size_t _Np = _Traits::_S_full_size; |
39 | [[maybe_unused]] const auto __intrin = __to_intrin(__v); |
40 | using _Up = typename _VectorTraits<_To>::value_type; |
41 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
42 | |
43 | // [xyz]_to_[xyz] {{{2 |
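  // x, y, and z refer to the register size of input and output:
  // xmm (<= 16 bytes), ymm (32 bytes), and zmm (64 bytes).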
44 | [[maybe_unused]] constexpr bool __x_to_x |
45 | = sizeof(__v) <= 16 && sizeof(_To) <= 16; |
46 | [[maybe_unused]] constexpr bool __x_to_y |
47 | = sizeof(__v) <= 16 && sizeof(_To) == 32; |
48 | [[maybe_unused]] constexpr bool __x_to_z |
49 | = sizeof(__v) <= 16 && sizeof(_To) == 64; |
50 | [[maybe_unused]] constexpr bool __y_to_x |
51 | = sizeof(__v) == 32 && sizeof(_To) <= 16; |
52 | [[maybe_unused]] constexpr bool __y_to_y |
53 | = sizeof(__v) == 32 && sizeof(_To) == 32; |
54 | [[maybe_unused]] constexpr bool __y_to_z |
55 | = sizeof(__v) == 32 && sizeof(_To) == 64; |
56 | [[maybe_unused]] constexpr bool __z_to_x |
57 | = sizeof(__v) == 64 && sizeof(_To) <= 16; |
58 | [[maybe_unused]] constexpr bool __z_to_y |
59 | = sizeof(__v) == 64 && sizeof(_To) == 32; |
60 | [[maybe_unused]] constexpr bool __z_to_z |
61 | = sizeof(__v) == 64 && sizeof(_To) == 64; |
62 | |
63 | // iX_to_iX {{{2 |
64 | [[maybe_unused]] constexpr bool __i_to_i |
65 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
66 | [[maybe_unused]] constexpr bool __i8_to_i16 |
67 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
68 | [[maybe_unused]] constexpr bool __i8_to_i32 |
69 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
70 | [[maybe_unused]] constexpr bool __i8_to_i64 |
71 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
72 | [[maybe_unused]] constexpr bool __i16_to_i8 |
73 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
74 | [[maybe_unused]] constexpr bool __i16_to_i32 |
75 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
76 | [[maybe_unused]] constexpr bool __i16_to_i64 |
77 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
78 | [[maybe_unused]] constexpr bool __i32_to_i8 |
79 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
80 | [[maybe_unused]] constexpr bool __i32_to_i16 |
81 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
82 | [[maybe_unused]] constexpr bool __i32_to_i64 |
83 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
84 | [[maybe_unused]] constexpr bool __i64_to_i8 |
85 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
86 | [[maybe_unused]] constexpr bool __i64_to_i16 |
87 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
88 | [[maybe_unused]] constexpr bool __i64_to_i32 |
89 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
90 | |
91 | // [fsu]X_to_[fsu]X {{{2 |
92 | // ibw = integral && byte or word, i.e. char and short with any signedness |
93 | [[maybe_unused]] constexpr bool __s64_to_f32 |
94 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
95 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
96 | [[maybe_unused]] constexpr bool __s32_to_f32 |
97 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
98 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
99 | [[maybe_unused]] constexpr bool __s16_to_f32 |
100 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
101 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
102 | [[maybe_unused]] constexpr bool __s8_to_f32 |
103 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
104 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
105 | [[maybe_unused]] constexpr bool __u64_to_f32 |
106 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
107 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
108 | [[maybe_unused]] constexpr bool __u32_to_f32 |
109 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
110 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
111 | [[maybe_unused]] constexpr bool __u16_to_f32 |
112 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
113 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
114 | [[maybe_unused]] constexpr bool __u8_to_f32 |
115 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
116 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
117 | [[maybe_unused]] constexpr bool __s64_to_f64 |
118 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
119 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
120 | [[maybe_unused]] constexpr bool __s32_to_f64 |
121 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
122 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
123 | [[maybe_unused]] constexpr bool __u64_to_f64 |
124 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
125 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
126 | [[maybe_unused]] constexpr bool __u32_to_f64 |
127 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
128 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
129 | [[maybe_unused]] constexpr bool __f32_to_s64 |
130 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
131 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
132 | [[maybe_unused]] constexpr bool __f32_to_s32 |
133 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
134 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
135 | [[maybe_unused]] constexpr bool __f32_to_u64 |
136 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
137 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
138 | [[maybe_unused]] constexpr bool __f32_to_u32 |
139 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
140 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
141 | [[maybe_unused]] constexpr bool __f64_to_s64 |
142 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
143 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
144 | [[maybe_unused]] constexpr bool __f64_to_s32 |
145 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
146 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
147 | [[maybe_unused]] constexpr bool __f64_to_u64 |
148 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
149 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
150 | [[maybe_unused]] constexpr bool __f64_to_u32 |
151 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
152 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
153 | [[maybe_unused]] constexpr bool __ibw_to_f32 |
154 | = is_integral_v<_Tp> && sizeof(_Tp) <= 2 |
155 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
156 | [[maybe_unused]] constexpr bool __ibw_to_f64 |
157 | = is_integral_v<_Tp> && sizeof(_Tp) <= 2 |
158 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
159 | [[maybe_unused]] constexpr bool __f32_to_ibw |
160 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
161 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
162 | [[maybe_unused]] constexpr bool __f64_to_ibw |
163 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
164 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
165 | [[maybe_unused]] constexpr bool __f32_to_f64 |
166 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
167 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
168 | [[maybe_unused]] constexpr bool __f64_to_f32 |
169 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
170 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
171 | |
172 | if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
173 | return __convert_x86<_To>(__lo128(__v), __hi128(__v)); |
174 | else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2 |
175 | return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v), |
176 | __convert_x86<__vector_type_t<_Up, _M / 2>>( |
177 | __extract_part<1, _Np / _M * 2>(__v))); |
178 | else if constexpr (__i_to_i) //{{{2 |
179 | { |
180 | static_assert(__x_to_x || __have_avx2, |
181 | "integral conversions with ymm registers require AVX2" ); |
182 | static_assert(__have_avx512bw |
183 | || ((sizeof(_Tp) >= 4 || sizeof(__v) < 64) |
184 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
185 | "8/16-bit integers in zmm registers require AVX512BW" ); |
186 | static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) || __have_avx512f, |
187 | "integral conversions with ymm registers require AVX2" ); |
188 | } |
189 | if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2 |
190 | sizeof(_Tp) == sizeof(_Up)) |
191 | { |
192 | // conversion uses simple bit reinterpretation (or no conversion at all) |
193 | if constexpr (_Np >= _M) |
194 | return __intrin_bitcast<_To>(__v); |
195 | else |
196 | return __zero_extend(__vector_bitcast<_Up>(__v)); |
197 | } |
198 | else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2 |
199 | // zero extend (eg. xmm -> ymm) |
200 | return __zero_extend( |
201 | __convert_x86<__vector_type_t< |
202 | _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v)); |
203 | else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2 |
204 | // partial input (eg. ymm -> xmm) |
205 | return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v)); |
206 | else if constexpr (__i64_to_i32) //{{{2 |
207 | { |
208 | if constexpr (__x_to_x && __have_avx512vl) |
209 | return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin)); |
210 | else if constexpr (__x_to_x) |
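	// without AVX512VL: _mm_shuffle_ps with immediate 8 picks the low 32 bits
	// of both 64-bit elements (float lanes 0 and 2) and zeros the upper half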
211 | return __auto_bitcast( |
212 | _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8)); |
213 | else if constexpr (__y_to_x && __have_avx512vl) |
214 | return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin)); |
215 | else if constexpr (__y_to_x && __have_avx512f) |
216 | return __intrin_bitcast<_To>( |
217 | __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v)))); |
218 | else if constexpr (__y_to_x) |
219 | return __intrin_bitcast<_To>( |
220 | __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8), |
221 | 0 + 4 * 2))); |
222 | else if constexpr (__z_to_y) |
223 | return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin)); |
224 | } |
225 | else if constexpr (__i64_to_i16) //{{{2 |
226 | { |
227 | if constexpr (__x_to_x && __have_avx512vl) |
228 | return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin)); |
229 | else if constexpr (__x_to_x && __have_avx512f) |
230 | return __intrin_bitcast<_To>( |
231 | __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v)))); |
232 | else if constexpr (__x_to_x && __have_ssse3) |
233 | { |
234 | return __intrin_bitcast<_To>( |
235 | _mm_shuffle_epi8(__intrin, |
			       _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80)));
239 | // fallback without SSSE3 |
240 | } |
241 | else if constexpr (__y_to_x && __have_avx512vl) |
242 | return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin)); |
243 | else if constexpr (__y_to_x && __have_avx512f) |
244 | return __intrin_bitcast<_To>( |
245 | __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v)))); |
246 | else if constexpr (__y_to_x) |
247 | { |
248 | const auto __a = _mm256_shuffle_epi8( |
249 | __intrin, |
	    _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80));
255 | return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
256 | } |
257 | else if constexpr (__z_to_x) |
258 | return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin)); |
259 | } |
260 | else if constexpr (__i64_to_i8) //{{{2 |
261 | { |
262 | if constexpr (__x_to_x && __have_avx512vl) |
263 | return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin)); |
264 | else if constexpr (__x_to_x && __have_avx512f) |
265 | return __intrin_bitcast<_To>( |
266 | __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin)))); |
267 | else if constexpr (__y_to_x && __have_avx512vl) |
268 | return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin)); |
269 | else if constexpr (__y_to_x && __have_avx512f) |
270 | return __intrin_bitcast<_To>( |
271 | _mm512_cvtepi64_epi8(__zero_extend(__intrin))); |
272 | else if constexpr (__z_to_x) |
273 | return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin)); |
274 | } |
275 | else if constexpr (__i32_to_i64) //{{{2 |
276 | { |
277 | if constexpr (__have_sse4_1 && __x_to_x) |
278 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
279 | ? _mm_cvtepi32_epi64(__intrin) |
280 | : _mm_cvtepu32_epi64(__intrin)); |
281 | else if constexpr (__x_to_x) |
282 | { |
283 | return __intrin_bitcast<_To>( |
284 | _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp> |
285 | ? _mm_srai_epi32(__intrin, 31) |
286 | : __m128i())); |
287 | } |
288 | else if constexpr (__x_to_y) |
289 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
290 | ? _mm256_cvtepi32_epi64(__intrin) |
291 | : _mm256_cvtepu32_epi64(__intrin)); |
292 | else if constexpr (__y_to_z) |
293 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
294 | ? _mm512_cvtepi32_epi64(__intrin) |
295 | : _mm512_cvtepu32_epi64(__intrin)); |
296 | } |
297 | else if constexpr (__i32_to_i16) //{{{2 |
298 | { |
299 | if constexpr (__x_to_x && __have_avx512vl) |
300 | return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin)); |
301 | else if constexpr (__x_to_x && __have_avx512f) |
302 | return __intrin_bitcast<_To>( |
303 | __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v)))); |
304 | else if constexpr (__x_to_x && __have_ssse3) |
305 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
	  __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
308 | else if constexpr (__x_to_x) |
309 | { |
310 | auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o |
311 | auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o |
312 | auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo |
313 | auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo |
314 | return __intrin_bitcast<_To>( |
315 | _mm_unpacklo_epi16(__c, __d)); // 0123 oooo |
316 | } |
317 | else if constexpr (__y_to_x && __have_avx512vl) |
318 | return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin)); |
319 | else if constexpr (__y_to_x && __have_avx512f) |
320 | return __intrin_bitcast<_To>( |
321 | __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v)))); |
322 | else if constexpr (__y_to_x) |
323 | { |
324 | auto __a = _mm256_shuffle_epi8( |
325 | __intrin, |
	    _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
			     9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80));
330 | return __intrin_bitcast<_To>(__lo128( |
331 | _mm256_permute4x64_epi64(__a, |
332 | 0xf8))); // __a[0] __a[2] | __a[3] __a[3] |
333 | } |
334 | else if constexpr (__z_to_y) |
335 | return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin)); |
336 | } |
337 | else if constexpr (__i32_to_i8) //{{{2 |
338 | { |
339 | if constexpr (__x_to_x && __have_avx512vl) |
340 | return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin)); |
341 | else if constexpr (__x_to_x && __have_avx512f) |
342 | return __intrin_bitcast<_To>( |
343 | __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin)))); |
344 | else if constexpr (__x_to_x && __have_ssse3) |
345 | { |
346 | return __intrin_bitcast<_To>( |
347 | _mm_shuffle_epi8(__intrin, |
			       _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80)));
351 | } |
352 | else if constexpr (__x_to_x) |
353 | { |
354 | const auto __a |
355 | = _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... .... |
356 | const auto __b |
357 | = _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... .... |
358 | const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... .... |
359 | const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... .... |
360 | const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... .... |
	  return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
362 | } |
363 | else if constexpr (__y_to_x && __have_avx512vl) |
364 | return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin)); |
365 | else if constexpr (__y_to_x && __have_avx512f) |
366 | return __intrin_bitcast<_To>( |
367 | _mm512_cvtepi32_epi8(__zero_extend(__intrin))); |
368 | else if constexpr (__z_to_x) |
369 | return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin)); |
370 | } |
371 | else if constexpr (__i16_to_i64) //{{{2 |
372 | { |
373 | if constexpr (__x_to_x && __have_sse4_1) |
374 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
375 | ? _mm_cvtepi16_epi64(__intrin) |
376 | : _mm_cvtepu16_epi64(__intrin)); |
377 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
378 | { |
379 | auto __x = _mm_srai_epi16(__intrin, 15); |
380 | auto __y = _mm_unpacklo_epi16(__intrin, __x); |
381 | __x = _mm_unpacklo_epi16(__x, __x); |
382 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x)); |
383 | } |
384 | else if constexpr (__x_to_x) |
385 | return __intrin_bitcast<_To>( |
386 | _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()), |
387 | __m128i())); |
388 | else if constexpr (__x_to_y) |
389 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
390 | ? _mm256_cvtepi16_epi64(__intrin) |
391 | : _mm256_cvtepu16_epi64(__intrin)); |
392 | else if constexpr (__x_to_z) |
393 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
394 | ? _mm512_cvtepi16_epi64(__intrin) |
395 | : _mm512_cvtepu16_epi64(__intrin)); |
396 | } |
397 | else if constexpr (__i16_to_i32) //{{{2 |
398 | { |
399 | if constexpr (__x_to_x && __have_sse4_1) |
400 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
401 | ? _mm_cvtepi16_epi32(__intrin) |
402 | : _mm_cvtepu16_epi32(__intrin)); |
403 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
404 | return __intrin_bitcast<_To>( |
405 | _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16)); |
406 | else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
407 | return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i())); |
408 | else if constexpr (__x_to_y) |
409 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
410 | ? _mm256_cvtepi16_epi32(__intrin) |
411 | : _mm256_cvtepu16_epi32(__intrin)); |
412 | else if constexpr (__y_to_z) |
413 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
414 | ? _mm512_cvtepi16_epi32(__intrin) |
415 | : _mm512_cvtepu16_epi32(__intrin)); |
416 | } |
417 | else if constexpr (__i16_to_i8) //{{{2 |
418 | { |
419 | if constexpr (__x_to_x && __have_avx512bw_vl) |
420 | return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin)); |
421 | else if constexpr (__x_to_x && __have_avx512bw) |
422 | return __intrin_bitcast<_To>( |
423 | __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin)))); |
424 | else if constexpr (__x_to_x && __have_ssse3) |
425 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
	  __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
428 | else if constexpr (__x_to_x) |
429 | { |
430 | auto __a |
431 | = _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33.. |
432 | auto __b |
433 | = _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77.. |
434 | auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 .... |
435 | auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 .... |
436 | auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... .... |
437 | auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... .... |
438 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f)); |
439 | } |
440 | else if constexpr (__y_to_x && __have_avx512bw_vl) |
441 | return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin)); |
442 | else if constexpr (__y_to_x && __have_avx512bw) |
443 | return __intrin_bitcast<_To>( |
444 | __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin)))); |
445 | else if constexpr (__y_to_x) |
446 | { |
447 | auto __a = _mm256_shuffle_epi8( |
448 | __intrin, |
	    _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
			     4, 6, 8, 10, 12, 14));
453 | return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
454 | } |
455 | else if constexpr (__z_to_y && __have_avx512bw) |
456 | return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin)); |
457 | else if constexpr (__z_to_y) |
458 | __assert_unreachable<_Tp>(); |
459 | } |
460 | else if constexpr (__i8_to_i64) //{{{2 |
461 | { |
462 | if constexpr (__x_to_x && __have_sse4_1) |
463 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
464 | ? _mm_cvtepi8_epi64(__intrin) |
465 | : _mm_cvtepu8_epi64(__intrin)); |
466 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
467 | { |
468 | if constexpr (__have_ssse3) |
469 | { |
470 | auto __dup = _mm_unpacklo_epi8(__intrin, __intrin); |
471 | auto __epi16 = _mm_srai_epi16(__dup, 8); |
	      // broadcast the sign byte over the upper bytes of each element
	      return __intrin_bitcast<_To>(
		_mm_shuffle_epi8(__epi16,
				 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
					       3, 3, 3, 3, 3)));
475 | } |
476 | else |
477 | { |
478 | auto __x = _mm_unpacklo_epi8(__intrin, __intrin); |
479 | __x = _mm_unpacklo_epi16(__x, __x); |
480 | return __intrin_bitcast<_To>( |
481 | _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24), |
482 | _mm_srai_epi32(__x, 31))); |
483 | } |
484 | } |
485 | else if constexpr (__x_to_x) |
486 | { |
487 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32( |
488 | _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()), |
489 | __m128i()), |
490 | __m128i())); |
491 | } |
492 | else if constexpr (__x_to_y) |
493 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
494 | ? _mm256_cvtepi8_epi64(__intrin) |
495 | : _mm256_cvtepu8_epi64(__intrin)); |
496 | else if constexpr (__x_to_z) |
497 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
498 | ? _mm512_cvtepi8_epi64(__intrin) |
499 | : _mm512_cvtepu8_epi64(__intrin)); |
500 | } |
501 | else if constexpr (__i8_to_i32) //{{{2 |
502 | { |
503 | if constexpr (__x_to_x && __have_sse4_1) |
504 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
505 | ? _mm_cvtepi8_epi32(__intrin) |
506 | : _mm_cvtepu8_epi32(__intrin)); |
507 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
508 | { |
509 | const auto __x = _mm_unpacklo_epi8(__intrin, __intrin); |
510 | return __intrin_bitcast<_To>( |
511 | _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24)); |
512 | } |
513 | else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
514 | return __intrin_bitcast<_To>( |
515 | _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()), |
516 | __m128i())); |
517 | else if constexpr (__x_to_y) |
518 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
519 | ? _mm256_cvtepi8_epi32(__intrin) |
520 | : _mm256_cvtepu8_epi32(__intrin)); |
521 | else if constexpr (__x_to_z) |
522 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
523 | ? _mm512_cvtepi8_epi32(__intrin) |
524 | : _mm512_cvtepu8_epi32(__intrin)); |
525 | } |
526 | else if constexpr (__i8_to_i16) //{{{2 |
527 | { |
528 | if constexpr (__x_to_x && __have_sse4_1) |
529 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
530 | ? _mm_cvtepi8_epi16(__intrin) |
531 | : _mm_cvtepu8_epi16(__intrin)); |
532 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
533 | return __intrin_bitcast<_To>( |
534 | _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8)); |
535 | else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
536 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i())); |
537 | else if constexpr (__x_to_y) |
538 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
539 | ? _mm256_cvtepi8_epi16(__intrin) |
540 | : _mm256_cvtepu8_epi16(__intrin)); |
541 | else if constexpr (__y_to_z && __have_avx512bw) |
542 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
543 | ? _mm512_cvtepi8_epi16(__intrin) |
544 | : _mm512_cvtepu8_epi16(__intrin)); |
545 | else if constexpr (__y_to_z) |
546 | __assert_unreachable<_Tp>(); |
547 | } |
548 | else if constexpr (__f32_to_s64) //{{{2 |
549 | { |
550 | if constexpr (__have_avx512dq_vl && __x_to_x) |
551 | return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin)); |
552 | else if constexpr (__have_avx512dq_vl && __x_to_y) |
553 | return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin)); |
554 | else if constexpr (__have_avx512dq && __y_to_z) |
555 | return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin)); |
556 | // else use scalar fallback |
557 | } |
558 | else if constexpr (__f32_to_u64) //{{{2 |
559 | { |
560 | if constexpr (__have_avx512dq_vl && __x_to_x) |
561 | return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin)); |
562 | else if constexpr (__have_avx512dq_vl && __x_to_y) |
563 | return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin)); |
564 | else if constexpr (__have_avx512dq && __y_to_z) |
565 | return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin)); |
566 | // else use scalar fallback |
567 | } |
568 | else if constexpr (__f32_to_s32) //{{{2 |
569 | { |
570 | if constexpr (__x_to_x || __y_to_y || __z_to_z) |
571 | { |
572 | // go to fallback, it does the right thing |
573 | } |
574 | else |
575 | __assert_unreachable<_Tp>(); |
576 | } |
577 | else if constexpr (__f32_to_u32) //{{{2 |
578 | { |
579 | if constexpr (__have_avx512vl && __x_to_x) |
580 | return __auto_bitcast(_mm_cvttps_epu32(__intrin)); |
581 | else if constexpr (__have_avx512f && __x_to_x) |
582 | return __auto_bitcast( |
583 | __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v)))); |
584 | else if constexpr (__have_avx512vl && __y_to_y) |
585 | return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin)); |
586 | else if constexpr (__have_avx512f && __y_to_y) |
587 | return __vector_bitcast<_Up>( |
588 | __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v)))); |
589 | else if constexpr (__x_to_x || __y_to_y || __z_to_z) |
590 | { |
591 | // go to fallback, it does the right thing. We can't use the |
592 | // _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would |
593 | // discard small input values (only 24 mantissa bits) |
594 | } |
595 | else |
596 | __assert_unreachable<_Tp>(); |
597 | } |
598 | else if constexpr (__f32_to_ibw) //{{{2 |
599 | return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v)); |
600 | else if constexpr (__f64_to_s64) //{{{2 |
601 | { |
602 | if constexpr (__have_avx512dq_vl && __x_to_x) |
603 | return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin)); |
604 | else if constexpr (__have_avx512dq_vl && __y_to_y) |
605 | return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin)); |
606 | else if constexpr (__have_avx512dq && __z_to_z) |
607 | return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin)); |
608 | // else use scalar fallback |
609 | } |
610 | else if constexpr (__f64_to_u64) //{{{2 |
611 | { |
612 | if constexpr (__have_avx512dq_vl && __x_to_x) |
613 | return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin)); |
614 | else if constexpr (__have_avx512dq_vl && __y_to_y) |
615 | return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin)); |
616 | else if constexpr (__have_avx512dq && __z_to_z) |
617 | return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin)); |
618 | // else use scalar fallback |
619 | } |
620 | else if constexpr (__f64_to_s32) //{{{2 |
621 | { |
622 | if constexpr (__x_to_x) |
623 | return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin)); |
624 | else if constexpr (__y_to_x) |
625 | return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin)); |
626 | else if constexpr (__z_to_y) |
627 | return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin)); |
628 | } |
629 | else if constexpr (__f64_to_u32) //{{{2 |
630 | { |
631 | if constexpr (__have_avx512vl && __x_to_x) |
632 | return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin)); |
633 | else if constexpr (__have_sse4_1 && __x_to_x) |
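	// without AVX512VL: bias the (non-negative) input by -2^31 into the range
	// of cvttpd_epi32, then flip the sign bit to undo the bias afterwards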
634 | return __vector_bitcast<_Up, _M>( |
635 | _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u)) |
636 | ^ 0x8000'0000u; |
637 | else if constexpr (__x_to_x) |
638 | { |
639 | // use scalar fallback: it's only 2 values to convert, can't get |
640 | // much better than scalar decomposition |
641 | } |
642 | else if constexpr (__have_avx512vl && __y_to_x) |
643 | return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin)); |
644 | else if constexpr (__y_to_x) |
645 | { |
646 | return __intrin_bitcast<_To>( |
647 | __vector_bitcast<_Up>( |
648 | _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u)) |
649 | ^ 0x8000'0000u); |
650 | } |
651 | else if constexpr (__z_to_y) |
652 | return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin)); |
653 | } |
654 | else if constexpr (__f64_to_ibw) //{{{2 |
655 | { |
656 | return __convert_x86<_To>( |
657 | __convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v)); |
658 | } |
659 | else if constexpr (__s64_to_f32) //{{{2 |
660 | { |
661 | if constexpr (__x_to_x && __have_avx512dq_vl) |
662 | return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin)); |
663 | else if constexpr (__y_to_x && __have_avx512dq_vl) |
664 | return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin)); |
665 | else if constexpr (__z_to_y && __have_avx512dq) |
666 | return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin)); |
667 | else if constexpr (__z_to_y) |
668 | return __intrin_bitcast<_To>( |
669 | _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v))); |
670 | } |
671 | else if constexpr (__u64_to_f32) //{{{2 |
672 | { |
673 | if constexpr (__x_to_x && __have_avx512dq_vl) |
674 | return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin)); |
675 | else if constexpr (__y_to_x && __have_avx512dq_vl) |
676 | return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin)); |
677 | else if constexpr (__z_to_y && __have_avx512dq) |
678 | return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin)); |
679 | else if constexpr (__z_to_y) |
680 | { |
681 | return __intrin_bitcast<_To>( |
682 | __lo256(_mm512_cvtepu32_ps(__auto_bitcast( |
683 | _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32))))) |
684 | * 0x100000000LL |
685 | + __lo256(_mm512_cvtepu32_ps( |
686 | __auto_bitcast(_mm512_cvtepi64_epi32(__intrin))))); |
687 | } |
688 | } |
689 | else if constexpr (__s32_to_f32) //{{{2 |
690 | { |
691 | // use fallback (builtin conversion) |
692 | } |
693 | else if constexpr (__u32_to_f32) //{{{2 |
694 | { |
695 | if constexpr (__x_to_x && __have_avx512vl) |
696 | { |
697 | // use fallback |
698 | } |
699 | else if constexpr (__x_to_x && __have_avx512f) |
700 | return __intrin_bitcast<_To>( |
701 | __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v)))); |
702 | else if constexpr (__x_to_x && (__have_fma || __have_fma4)) |
703 | // work around PR85819 |
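	// by converting the 16-bit halves separately (each one is exactly
	// representable in float) and recombining them as 0x10000 * hi + lo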
704 | return __auto_bitcast(0x10000 |
705 | * _mm_cvtepi32_ps(__to_intrin(__v >> 16)) |
706 | + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff))); |
707 | else if constexpr (__y_to_y && __have_avx512vl) |
708 | { |
709 | // use fallback |
710 | } |
711 | else if constexpr (__y_to_y && __have_avx512f) |
712 | return __intrin_bitcast<_To>( |
713 | __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v)))); |
714 | else if constexpr (__y_to_y) |
715 | // work around PR85819 |
716 | return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16)) |
717 | + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff)); |
718 | // else use fallback (builtin conversion) |
719 | } |
720 | else if constexpr (__ibw_to_f32) //{{{2 |
721 | { |
722 | if constexpr (_M <= 4 || __have_avx2) |
723 | return __convert_x86<_To>( |
724 | __convert_x86<__vector_type_t<int, _M>>(__v)); |
725 | else |
726 | { |
727 | static_assert(__x_to_y); |
728 | __m128i __a, __b; |
729 | if constexpr (__have_sse4_1) |
730 | { |
731 | __a = sizeof(_Tp) == 2 |
732 | ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin) |
733 | : _mm_cvtepu16_epi32(__intrin)) |
734 | : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin) |
735 | : _mm_cvtepu8_epi32(__intrin)); |
736 | const auto __w |
737 | = _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9); |
738 | __b = sizeof(_Tp) == 2 |
		 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
				     : _mm_cvtepu16_epi32(__w))
		 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
				     : _mm_cvtepu8_epi32(__w));
743 | } |
744 | else |
745 | { |
746 | __m128i __tmp; |
747 | if constexpr (sizeof(_Tp) == 1) |
748 | { |
749 | __tmp = is_signed_v<_Tp> |
750 | ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, |
751 | __intrin), |
752 | 8) |
753 | : _mm_unpacklo_epi8(__intrin, __m128i()); |
754 | } |
755 | else |
756 | { |
757 | static_assert(sizeof(_Tp) == 2); |
758 | __tmp = __intrin; |
759 | } |
760 | __a = is_signed_v<_Tp> |
		 ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
		 : _mm_unpacklo_epi16(__tmp, __m128i());
	  __b = is_signed_v<_Tp>
		 ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
		 : _mm_unpackhi_epi16(__tmp, __m128i());
766 | } |
	  return __convert_x86<_To>(__vector_bitcast<int>(__a),
				    __vector_bitcast<int>(__b));
769 | } |
770 | } |
771 | else if constexpr (__s64_to_f64) //{{{2 |
772 | { |
773 | if constexpr (__x_to_x && __have_avx512dq_vl) |
774 | return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin)); |
775 | else if constexpr (__y_to_y && __have_avx512dq_vl) |
776 | return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin)); |
777 | else if constexpr (__z_to_z && __have_avx512dq) |
778 | return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin)); |
779 | else if constexpr (__z_to_z) |
780 | { |
781 | return __intrin_bitcast<_To>( |
782 | _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32))) |
783 | * 0x100000000LL |
784 | + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin))); |
785 | } |
786 | } |
787 | else if constexpr (__u64_to_f64) //{{{2 |
788 | { |
789 | if constexpr (__x_to_x && __have_avx512dq_vl) |
790 | return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin)); |
791 | else if constexpr (__y_to_y && __have_avx512dq_vl) |
792 | return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin)); |
793 | else if constexpr (__z_to_z && __have_avx512dq) |
794 | return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin)); |
795 | else if constexpr (__z_to_z) |
796 | { |
797 | return __intrin_bitcast<_To>( |
798 | _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32))) |
799 | * 0x100000000LL |
800 | + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin))); |
801 | } |
802 | } |
803 | else if constexpr (__s32_to_f64) //{{{2 |
804 | { |
805 | if constexpr (__x_to_x) |
806 | return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin)); |
807 | else if constexpr (__x_to_y) |
808 | return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin)); |
809 | else if constexpr (__y_to_z) |
810 | return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin)); |
811 | } |
812 | else if constexpr (__u32_to_f64) //{{{2 |
813 | { |
814 | if constexpr (__x_to_x && __have_avx512vl) |
815 | return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin)); |
816 | else if constexpr (__x_to_x && __have_avx512f) |
817 | return __intrin_bitcast<_To>( |
818 | __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v)))); |
819 | else if constexpr (__x_to_x) |
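	// without AVX512F/VL: XOR with 0x8000'0000 yields the signed value
	// v - 2^31, which converts to double exactly; adding 2^31 afterwards
	// restores the original value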
820 | return __intrin_bitcast<_To>( |
821 | _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u); |
822 | else if constexpr (__x_to_y && __have_avx512vl) |
823 | return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin)); |
824 | else if constexpr (__x_to_y && __have_avx512f) |
825 | return __intrin_bitcast<_To>( |
826 | __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v)))); |
827 | else if constexpr (__x_to_y) |
828 | return __intrin_bitcast<_To>( |
829 | _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u); |
830 | else if constexpr (__y_to_z) |
831 | return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin)); |
832 | } |
833 | else if constexpr (__ibw_to_f64) //{{{2 |
834 | { |
835 | return __convert_x86<_To>( |
	__convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
837 | } |
838 | else if constexpr (__f32_to_f64) //{{{2 |
839 | { |
840 | if constexpr (__x_to_x) |
841 | return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin)); |
842 | else if constexpr (__x_to_y) |
843 | return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin)); |
844 | else if constexpr (__y_to_z) |
845 | return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin)); |
846 | } |
847 | else if constexpr (__f64_to_f32) //{{{2 |
848 | { |
849 | if constexpr (__x_to_x) |
850 | return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin)); |
851 | else if constexpr (__y_to_x) |
852 | return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin)); |
853 | else if constexpr (__z_to_y) |
854 | return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin)); |
855 | } |
856 | else //{{{2 |
857 | __assert_unreachable<_Tp>(); |
858 | |
859 | // fallback:{{{2 |
  return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
861 | //}}} |
862 | } |
863 | |
864 | // }}} |
865 | // 2-arg __convert_x86 {{{1 |
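// Convert the 2 * _Np elements of __v0 and __v1 into a single _To vector of
// _M elements. Whenever the enabled ISA can operate on a register of twice
// the input width, the two inputs are concatenated and forwarded to the
// one-argument overload above; the remaining branches pack two narrower
// registers directly.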
866 | template <typename _To, typename _V, typename _Traits> |
867 | _GLIBCXX_SIMD_INTRINSIC _To |
868 | __convert_x86(_V __v0, _V __v1) |
869 | { |
870 | static_assert(__is_vector_type_v<_V>); |
871 | using _Tp = typename _Traits::value_type; |
872 | constexpr size_t _Np = _Traits::_S_full_size; |
873 | [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
874 | [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
875 | using _Up = typename _VectorTraits<_To>::value_type; |
876 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
877 | |
878 | static_assert(2 * _Np <= _M, |
879 | "__v1 would be discarded; use the one-argument " |
880 | "__convert_x86 overload instead" ); |
881 | |
882 | // [xyz]_to_[xyz] {{{2 |
883 | [[maybe_unused]] constexpr bool __x_to_x |
884 | = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
885 | [[maybe_unused]] constexpr bool __x_to_y |
886 | = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
887 | [[maybe_unused]] constexpr bool __x_to_z |
888 | = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
889 | [[maybe_unused]] constexpr bool __y_to_x |
890 | = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
891 | [[maybe_unused]] constexpr bool __y_to_y |
892 | = sizeof(__v0) == 32 && sizeof(_To) == 32; |
893 | [[maybe_unused]] constexpr bool __y_to_z |
894 | = sizeof(__v0) == 32 && sizeof(_To) == 64; |
895 | [[maybe_unused]] constexpr bool __z_to_x |
896 | = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
897 | [[maybe_unused]] constexpr bool __z_to_y |
898 | = sizeof(__v0) == 64 && sizeof(_To) == 32; |
899 | [[maybe_unused]] constexpr bool __z_to_z |
900 | = sizeof(__v0) == 64 && sizeof(_To) == 64; |
901 | |
902 | // iX_to_iX {{{2 |
903 | [[maybe_unused]] constexpr bool __i_to_i |
904 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
905 | [[maybe_unused]] constexpr bool __i8_to_i16 |
906 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
907 | [[maybe_unused]] constexpr bool __i8_to_i32 |
908 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
909 | [[maybe_unused]] constexpr bool __i8_to_i64 |
910 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
911 | [[maybe_unused]] constexpr bool __i16_to_i8 |
912 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
913 | [[maybe_unused]] constexpr bool __i16_to_i32 |
914 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
915 | [[maybe_unused]] constexpr bool __i16_to_i64 |
916 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
917 | [[maybe_unused]] constexpr bool __i32_to_i8 |
918 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
919 | [[maybe_unused]] constexpr bool __i32_to_i16 |
920 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
921 | [[maybe_unused]] constexpr bool __i32_to_i64 |
922 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
923 | [[maybe_unused]] constexpr bool __i64_to_i8 |
924 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
925 | [[maybe_unused]] constexpr bool __i64_to_i16 |
926 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
927 | [[maybe_unused]] constexpr bool __i64_to_i32 |
928 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
929 | |
930 | // [fsu]X_to_[fsu]X {{{2 |
931 | // ibw = integral && byte or word, i.e. char and short with any signedness |
932 | [[maybe_unused]] constexpr bool __i64_to_f32 |
933 | = is_integral_v<_Tp> && sizeof(_Tp) == 8 |
934 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
935 | [[maybe_unused]] constexpr bool __s32_to_f32 |
936 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
937 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
938 | [[maybe_unused]] constexpr bool __s16_to_f32 |
939 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
940 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
941 | [[maybe_unused]] constexpr bool __s8_to_f32 |
942 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
943 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
944 | [[maybe_unused]] constexpr bool __u32_to_f32 |
945 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
946 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
947 | [[maybe_unused]] constexpr bool __u16_to_f32 |
948 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
949 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
950 | [[maybe_unused]] constexpr bool __u8_to_f32 |
951 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
952 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
953 | [[maybe_unused]] constexpr bool __s64_to_f64 |
954 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
955 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
956 | [[maybe_unused]] constexpr bool __s32_to_f64 |
957 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
958 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
959 | [[maybe_unused]] constexpr bool __s16_to_f64 |
960 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
961 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
962 | [[maybe_unused]] constexpr bool __s8_to_f64 |
963 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
964 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
965 | [[maybe_unused]] constexpr bool __u64_to_f64 |
966 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
967 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
968 | [[maybe_unused]] constexpr bool __u32_to_f64 |
969 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
970 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
971 | [[maybe_unused]] constexpr bool __u16_to_f64 |
972 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
973 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
974 | [[maybe_unused]] constexpr bool __u8_to_f64 |
975 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
976 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
977 | [[maybe_unused]] constexpr bool __f32_to_s64 |
978 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
979 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
980 | [[maybe_unused]] constexpr bool __f32_to_s32 |
981 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
982 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
983 | [[maybe_unused]] constexpr bool __f32_to_u64 |
984 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
985 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
986 | [[maybe_unused]] constexpr bool __f32_to_u32 |
987 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
988 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
989 | [[maybe_unused]] constexpr bool __f64_to_s64 |
990 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
991 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
992 | [[maybe_unused]] constexpr bool __f64_to_s32 |
993 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
994 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
995 | [[maybe_unused]] constexpr bool __f64_to_u64 |
996 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
997 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
998 | [[maybe_unused]] constexpr bool __f64_to_u32 |
999 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
1000 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1001 | [[maybe_unused]] constexpr bool __f32_to_ibw |
1002 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1003 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1004 | [[maybe_unused]] constexpr bool __f64_to_ibw |
1005 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1006 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1007 | [[maybe_unused]] constexpr bool __f32_to_f64 |
1008 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
1009 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1010 | [[maybe_unused]] constexpr bool __f64_to_f32 |
1011 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
1012 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1013 | |
1014 | if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
1015 | // <double, 4>, <double, 4> => <short, 8> |
1016 | return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), |
1017 | __hi128(__v1)); |
1018 | else if constexpr (__i_to_i) // assert ISA {{{2 |
1019 | { |
1020 | static_assert(__x_to_x || __have_avx2, |
1021 | "integral conversions with ymm registers require AVX2" ); |
1022 | static_assert(__have_avx512bw |
1023 | || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
1024 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
1025 | "8/16-bit integers in zmm registers require AVX512BW" ); |
1026 | static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, |
1027 | "integral conversions with ymm registers require AVX2" ); |
1028 | } |
1029 | // concat => use 1-arg __convert_x86 {{{2 |
1030 | if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
1031 | || (sizeof(__v0) == 16 && __have_avx |
1032 | && is_floating_point_v<_Tp>) |
1033 | || (sizeof(__v0) == 32 && __have_avx512f |
1034 | && (sizeof(_Tp) >= 4 || __have_avx512bw))) |
1035 | { |
1036 | // The ISA can handle wider input registers, so concat and use one-arg |
1037 | // implementation. This reduces code duplication considerably. |
1038 | return __convert_x86<_To>(__concat(__v0, __v1)); |
1039 | } |
1040 | else //{{{2 |
1041 | { |
1042 | // conversion using bit reinterpretation (or no conversion at all) |
1043 | // should all go through the concat branch above: |
1044 | static_assert( |
1045 | !(is_floating_point_v< |
1046 | _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); |
1047 | // handle all zero extension{{{2 |
1048 | if constexpr (2 * _Np < _M && sizeof(_To) > 16) |
1049 | { |
1050 | constexpr size_t Min = 16 / sizeof(_Up); |
1051 | return __zero_extend( |
1052 | __convert_x86< |
1053 | __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0, |
1054 | __v1)); |
1055 | } |
1056 | else if constexpr (__i64_to_i32) //{{{2 |
1057 | { |
1058 | if constexpr (__x_to_x) |
1059 | return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0), |
1060 | __auto_bitcast(__v1), 0x88)); |
1061 | else if constexpr (__y_to_y) |
1062 | { |
1063 | // AVX512F is not available (would concat otherwise) |
1064 | return __auto_bitcast( |
	      __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
1066 | __auto_bitcast(__v1), 0x88))); |
1067 | // alternative: |
1068 | // const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8); |
1069 | // const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8); |
1070 | // const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx, |
1071 | // v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh, |
1072 | // 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh |
1073 | } |
1074 | else if constexpr (__z_to_z) |
1075 | return __intrin_bitcast<_To>( |
1076 | __concat(_mm512_cvtepi64_epi32(__i0), |
1077 | _mm512_cvtepi64_epi32(__i1))); |
1078 | } |
1079 | else if constexpr (__i64_to_i16) //{{{2 |
1080 | { |
1081 | if constexpr (__x_to_x) |
1082 | { |
1083 | // AVX2 is not available (would concat otherwise) |
1084 | if constexpr (__have_sse4_1) |
1085 | { |
1086 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1087 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44), |
		_mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
			      -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
1090 | } |
1091 | else |
1092 | { |
1093 | return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]), |
1094 | _Up(__v1[0]), _Up(__v1[1])}; |
1095 | } |
1096 | } |
1097 | else if constexpr (__y_to_x) |
1098 | { |
1099 | auto __a |
1100 | = _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. .... |
1101 | auto __b |
1102 | = _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. .... |
1103 | auto __c |
1104 | = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 .... |
1105 | return __intrin_bitcast<_To>( |
1106 | _mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567 |
1107 | } |
1108 | else if constexpr (__z_to_y) |
1109 | return __intrin_bitcast<_To>( |
1110 | __concat(_mm512_cvtepi64_epi16(__i0), |
1111 | _mm512_cvtepi64_epi16(__i1))); |
1112 | } |
1113 | else if constexpr (__i64_to_i8) //{{{2 |
1114 | { |
1115 | if constexpr (__x_to_x && __have_sse4_1) |
1116 | { |
1117 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1118 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44), |
		_mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
			      -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			      -0x80)));
1122 | } |
1123 | else if constexpr (__x_to_x && __have_ssse3) |
1124 | { |
1125 | return __intrin_bitcast<_To>(_mm_unpacklo_epi16( |
1126 | _mm_shuffle_epi8( |
	      __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80)),
	    _mm_shuffle_epi8(
	      __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80))));
1134 | } |
1135 | else if constexpr (__x_to_x) |
1136 | { |
1137 | return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]), |
1138 | _Up(__v1[0]), _Up(__v1[1])}; |
1139 | } |
1140 | else if constexpr (__y_to_x) |
1141 | { |
1142 | const auto __a = _mm256_shuffle_epi8( |
1143 | _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA), |
	      _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
			       -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			       -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
			       -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
			       -0x80, -0x80, -0x80, -0x80));
	    return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1150 | } // __z_to_x uses concat fallback |
1151 | } |
1152 | else if constexpr (__i32_to_i16) //{{{2 |
1153 | { |
1154 | if constexpr (__x_to_x) |
1155 | { |
1156 | // AVX2 is not available (would concat otherwise) |
1157 | if constexpr (__have_sse4_1) |
1158 | { |
1159 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1160 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa), |
		    _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
				  11, 14, 15)));
1163 | } |
1164 | else if constexpr (__have_ssse3) |
1165 | { |
1166 | return __intrin_bitcast<_To>( |
1167 | _mm_hadd_epi16(__to_intrin(__v0 << 16), |
1168 | __to_intrin(__v1 << 16))); |
1169 | /* |
1170 | return _mm_unpacklo_epi64( |
1171 | _mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9, |
1172 | 12, 13, 8, 9, 12, 13, 12, 13, 14, 15)), |
1173 | _mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, |
1174 | 13, 8, 9, 12, 13, 12, 13, 14, 15))); |
1175 | */ |
1176 | } |
1177 | else |
1178 | { |
1179 | auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15.. |
1180 | auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37.. |
1181 | auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 .... |
1182 | auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 .... |
1183 | return __intrin_bitcast<_To>( |
1184 | _mm_unpacklo_epi16(__c, __d)); // 0123 4567 |
1185 | } |
1186 | } |
1187 | else if constexpr (__y_to_y) |
1188 | { |
1189 | const auto __shuf |
	      = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
				 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
				 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
				 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
1194 | auto __a = _mm256_shuffle_epi8(__i0, __shuf); |
1195 | auto __b = _mm256_shuffle_epi8(__i1, __shuf); |
1196 | return __intrin_bitcast<_To>( |
1197 | __xzyw(_mm256_unpacklo_epi64(__a, __b))); |
1198 | } // __z_to_z uses concat fallback |
1199 | } |
1200 | else if constexpr (__i32_to_i8) //{{{2 |
1201 | { |
1202 | if constexpr (__x_to_x && __have_ssse3) |
1203 | { |
1204 | const auto shufmask |
	      = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
			      -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			      -0x80, -0x80);
1208 | return __intrin_bitcast<_To>( |
1209 | _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask), |
1210 | _mm_shuffle_epi8(__i1, shufmask))); |
1211 | } |
1212 | else if constexpr (__x_to_x) |
1213 | { |
1214 | auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. .... |
1215 | auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. .... |
1216 | auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... .... |
1217 | auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... .... |
1218 | auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... .... |
1219 | return __intrin_bitcast<_To>(__e & __m128i{-1, 0}); |
1220 | } |
1221 | else if constexpr (__y_to_x) |
1222 | { |
1223 | const auto __a = _mm256_shuffle_epi8( |
1224 | _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA), |
	      _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
			       6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
			       -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
			       -0x80, -0x80, -0x80, 2, 6, 10, 14));
return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1230 | } // __z_to_y uses concat fallback |
1231 | } |
1232 | else if constexpr (__i16_to_i8) //{{{2 |
1233 | { |
1234 | if constexpr (__x_to_x && __have_ssse3) |
1235 | { |
1236 | const auto __shuf = reinterpret_cast<__m128i>( |
1237 | __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80, |
1238 | 0x80, 0x80, 0x80, 0x80, 0x80, |
1239 | 0x80, 0x80}); |
1240 | return __intrin_bitcast<_To>( |
1241 | _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf), |
1242 | _mm_shuffle_epi8(__i1, __shuf))); |
1243 | } |
1244 | else if constexpr (__x_to_x) |
1245 | { |
1246 | auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B.. |
1247 | auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F.. |
1248 | auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D .... |
1249 | auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF .... |
1250 | auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... .... |
1251 | auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... .... |
1252 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f)); |
1253 | } |
1254 | else if constexpr (__y_to_y) |
1255 | { |
1256 | return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8( |
(__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
1258 | | _mm256_slli_epi16(__i1, 8), |
_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
7, 9, 11, 13, 15))));
1262 | } // __z_to_z uses concat fallback |
1263 | } |
1264 | else if constexpr (__i64_to_f32) //{{{2 |
1265 | { |
1266 | if constexpr (__x_to_x) |
1267 | return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]); |
1268 | else if constexpr (__y_to_y) |
1269 | { |
1270 | static_assert(__y_to_y && __have_avx2); |
1271 | const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG |
1272 | const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH |
1273 | const auto __lo32 |
1274 | = _mm256_unpacklo_epi32(__a, __b); // abef cdgh |
1275 | const auto __hi32 = __vector_bitcast< |
1276 | conditional_t<is_signed_v<_Tp>, int, _UInt>>( |
1277 | _mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH |
1278 | const auto __hi |
1279 | = 0x100000000LL |
1280 | * __convert_x86<__vector_type_t<float, 8>>(__hi32); |
1281 | const auto __mid |
1282 | = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16)); |
1283 | const auto __lo |
= _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
1285 | return __xzyw((__hi + __mid) + __lo); |
1286 | } |
1287 | else if constexpr (__z_to_z && __have_avx512dq) |
1288 | { |
1289 | return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0), |
1290 | _mm512_cvtepi64_ps(__i1)) |
1291 | : __concat(_mm512_cvtepu64_ps(__i0), |
1292 | _mm512_cvtepu64_ps(__i1)); |
1293 | } |
1294 | else if constexpr (__z_to_z && is_signed_v<_Tp>) |
1295 | { |
1296 | const __m512 __hi32 = _mm512_cvtepi32_ps( |
1297 | __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)), |
1298 | _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32)))); |
1299 | const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0), |
1300 | _mm512_cvtepi64_epi32(__i1)); |
1301 | // split low 32-bits, because if __hi32 is a small negative |
1302 | // number, the 24-bit mantissa may lose important information if |
1303 | // any of the high 8 bits of __lo32 is set, leading to |
// catastrophic cancellation in the FMA
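// Illustrative example (not from the original sources): for __v == -1
// the high 32 bits convert to -1.0f, so __hi32 * 0x100000000LL + __hi16
// gives -4294967296.0f + 4294901760.0f == -65536.0f, and adding
// __lo16 == 65535.0f yields the exact result -1.0f. Converting all of
// __lo32 at once would round 4294967295 up to 2^32 and the sum would
// collapse to 0.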
1305 | const __m512 __hi16 |
= _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
1307 | const __m512 __lo16 |
= _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
1309 | return (__hi32 * 0x100000000LL + __hi16) + __lo16; |
1310 | } |
1311 | else if constexpr (__z_to_z && is_unsigned_v<_Tp>) |
1312 | { |
1313 | return __intrin_bitcast<_To>( |
1314 | _mm512_cvtepu32_ps(__concat( |
1315 | _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)), |
1316 | _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32)))) |
1317 | * 0x100000000LL |
1318 | + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0), |
1319 | _mm512_cvtepi64_epi32(__i1)))); |
1320 | } |
1321 | } |
1322 | else if constexpr (__f64_to_s32) //{{{2 |
1323 | { |
1324 | // use concat fallback |
1325 | } |
1326 | else if constexpr (__f64_to_u32) //{{{2 |
1327 | { |
1328 | if constexpr (__x_to_x && __have_sse4_1) |
1329 | { |
1330 | return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64( |
_mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
_mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
1333 | ^ 0x8000'0000u; |
1334 | // without SSE4.1 just use the scalar fallback, it's only four |
1335 | // values |
1336 | } |
1337 | else if constexpr (__y_to_y) |
1338 | { |
1339 | return __vector_bitcast<_Up>( |
__concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
- 0x8000'0000u),
_mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
- 0x8000'0000u)))
1344 | ^ 0x8000'0000u; |
1345 | } // __z_to_z uses fallback |
1346 | } |
1347 | else if constexpr (__f64_to_ibw) //{{{2 |
1348 | { |
// one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback
// would go via two independent conversions to _SimdWrapper<_To> and
// subsequent interleaving. This is better, because f64->__i32 allows
// combining __v0 and __v1 into one register:
// if constexpr (__z_to_x || __y_to_x) {
1354 | return __convert_x86<_To>( |
1355 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1)); |
1356 | //} |
1357 | } |
1358 | else if constexpr (__f32_to_ibw) //{{{2 |
1359 | { |
1360 | return __convert_x86<_To>( |
1361 | __convert_x86<__vector_type_t<int, _Np>>(__v0), |
1362 | __convert_x86<__vector_type_t<int, _Np>>(__v1)); |
1363 | } //}}} |
1364 | |
1365 | // fallback: {{{2 |
1366 | if constexpr (sizeof(_To) >= 32) |
1367 | // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
1368 | return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0), |
1369 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1)); |
1370 | else if constexpr (sizeof(_To) == 16) |
1371 | { |
1372 | const auto __lo = __to_intrin(__convert_x86<_To>(__v0)); |
1373 | const auto __hi = __to_intrin(__convert_x86<_To>(__v1)); |
1374 | if constexpr (sizeof(_Up) * _Np == 8) |
1375 | { |
1376 | if constexpr (is_floating_point_v<_Up>) |
1377 | return __auto_bitcast( |
1378 | _mm_unpacklo_pd(__vector_bitcast<double>(__lo), |
1379 | __vector_bitcast<double>(__hi))); |
1380 | else |
1381 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
1382 | } |
1383 | else if constexpr (sizeof(_Up) * _Np == 4) |
1384 | { |
1385 | if constexpr (is_floating_point_v<_Up>) |
1386 | return __auto_bitcast( |
1387 | _mm_unpacklo_ps(__vector_bitcast<float>(__lo), |
1388 | __vector_bitcast<float>(__hi))); |
1389 | else |
1390 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); |
1391 | } |
1392 | else if constexpr (sizeof(_Up) * _Np == 2) |
1393 | return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi)); |
1394 | else |
1395 | __assert_unreachable<_Tp>(); |
1396 | } |
1397 | else |
1398 | return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>()); |
1399 | //}}} |
1400 | } |
1401 | } |
1402 | |
1403 | //}}}1 |
1404 | // 4-arg __convert_x86 {{{1 |
1405 | template <typename _To, typename _V, typename _Traits> |
1406 | _GLIBCXX_SIMD_INTRINSIC _To |
1407 | __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3) |
1408 | { |
1409 | static_assert(__is_vector_type_v<_V>); |
1410 | using _Tp = typename _Traits::value_type; |
1411 | constexpr size_t _Np = _Traits::_S_full_size; |
1412 | [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
1413 | [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
1414 | [[maybe_unused]] const auto __i2 = __to_intrin(__v2); |
1415 | [[maybe_unused]] const auto __i3 = __to_intrin(__v3); |
1416 | using _Up = typename _VectorTraits<_To>::value_type; |
1417 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
1418 | |
1419 | static_assert(4 * _Np <= _M, |
1420 | "__v2/__v3 would be discarded; use the two/one-argument " |
1421 | "__convert_x86 overload instead" ); |
1422 | |
1423 | // [xyz]_to_[xyz] {{{2 |
1424 | [[maybe_unused]] constexpr bool __x_to_x |
1425 | = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
1426 | [[maybe_unused]] constexpr bool __x_to_y |
1427 | = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
1428 | [[maybe_unused]] constexpr bool __x_to_z |
1429 | = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
1430 | [[maybe_unused]] constexpr bool __y_to_x |
1431 | = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
1432 | [[maybe_unused]] constexpr bool __y_to_y |
1433 | = sizeof(__v0) == 32 && sizeof(_To) == 32; |
1434 | [[maybe_unused]] constexpr bool __y_to_z |
1435 | = sizeof(__v0) == 32 && sizeof(_To) == 64; |
1436 | [[maybe_unused]] constexpr bool __z_to_x |
1437 | = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
1438 | [[maybe_unused]] constexpr bool __z_to_y |
1439 | = sizeof(__v0) == 64 && sizeof(_To) == 32; |
1440 | [[maybe_unused]] constexpr bool __z_to_z |
1441 | = sizeof(__v0) == 64 && sizeof(_To) == 64; |
1442 | |
1443 | // iX_to_iX {{{2 |
1444 | [[maybe_unused]] constexpr bool __i_to_i |
1445 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
1446 | [[maybe_unused]] constexpr bool __i8_to_i16 |
1447 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
1448 | [[maybe_unused]] constexpr bool __i8_to_i32 |
1449 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
1450 | [[maybe_unused]] constexpr bool __i8_to_i64 |
1451 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
1452 | [[maybe_unused]] constexpr bool __i16_to_i8 |
1453 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
1454 | [[maybe_unused]] constexpr bool __i16_to_i32 |
1455 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
1456 | [[maybe_unused]] constexpr bool __i16_to_i64 |
1457 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
1458 | [[maybe_unused]] constexpr bool __i32_to_i8 |
1459 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
1460 | [[maybe_unused]] constexpr bool __i32_to_i16 |
1461 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
1462 | [[maybe_unused]] constexpr bool __i32_to_i64 |
1463 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
1464 | [[maybe_unused]] constexpr bool __i64_to_i8 |
1465 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
1466 | [[maybe_unused]] constexpr bool __i64_to_i16 |
1467 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
1468 | [[maybe_unused]] constexpr bool __i64_to_i32 |
1469 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
1470 | |
1471 | // [fsu]X_to_[fsu]X {{{2 |
1472 | // ibw = integral && byte or word, i.e. char and short with any signedness |
1473 | [[maybe_unused]] constexpr bool __i64_to_f32 |
1474 | = is_integral_v<_Tp> && sizeof(_Tp) == 8 |
1475 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1476 | [[maybe_unused]] constexpr bool __s32_to_f32 |
1477 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
1478 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1479 | [[maybe_unused]] constexpr bool __s16_to_f32 |
1480 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
1481 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1482 | [[maybe_unused]] constexpr bool __s8_to_f32 |
1483 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
1484 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1485 | [[maybe_unused]] constexpr bool __u32_to_f32 |
1486 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
1487 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1488 | [[maybe_unused]] constexpr bool __u16_to_f32 |
1489 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
1490 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1491 | [[maybe_unused]] constexpr bool __u8_to_f32 |
1492 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
1493 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1494 | [[maybe_unused]] constexpr bool __s64_to_f64 |
1495 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
1496 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1497 | [[maybe_unused]] constexpr bool __s32_to_f64 |
1498 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
1499 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1500 | [[maybe_unused]] constexpr bool __s16_to_f64 |
1501 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
1502 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1503 | [[maybe_unused]] constexpr bool __s8_to_f64 |
1504 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
1505 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1506 | [[maybe_unused]] constexpr bool __u64_to_f64 |
1507 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
1508 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1509 | [[maybe_unused]] constexpr bool __u32_to_f64 |
1510 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
1511 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1512 | [[maybe_unused]] constexpr bool __u16_to_f64 |
1513 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
1514 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1515 | [[maybe_unused]] constexpr bool __u8_to_f64 |
1516 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
1517 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1518 | [[maybe_unused]] constexpr bool __f32_to_s64 |
1519 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
1520 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1521 | [[maybe_unused]] constexpr bool __f32_to_s32 |
1522 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
1523 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1524 | [[maybe_unused]] constexpr bool __f32_to_u64 |
1525 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
1526 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1527 | [[maybe_unused]] constexpr bool __f32_to_u32 |
1528 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
1529 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1530 | [[maybe_unused]] constexpr bool __f64_to_s64 |
1531 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
1532 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1533 | [[maybe_unused]] constexpr bool __f64_to_s32 |
1534 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
1535 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1536 | [[maybe_unused]] constexpr bool __f64_to_u64 |
1537 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
1538 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1539 | [[maybe_unused]] constexpr bool __f64_to_u32 |
1540 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
1541 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1542 | [[maybe_unused]] constexpr bool __f32_to_ibw |
1543 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1544 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1545 | [[maybe_unused]] constexpr bool __f64_to_ibw |
1546 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1547 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1548 | [[maybe_unused]] constexpr bool __f32_to_f64 |
1549 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
1550 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1551 | [[maybe_unused]] constexpr bool __f64_to_f32 |
1552 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
1553 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1554 | |
1555 | if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
1556 | { |
1557 | // <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16> |
1558 | return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), |
1559 | __hi128(__v1), __lo128(__v2), __hi128(__v2), |
1560 | __lo128(__v3), __hi128(__v3)); |
1561 | } |
1562 | else if constexpr (__i_to_i) // assert ISA {{{2 |
1563 | { |
1564 | static_assert(__x_to_x || __have_avx2, |
1565 | "integral conversions with ymm registers require AVX2" ); |
1566 | static_assert(__have_avx512bw |
1567 | || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
1568 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
1569 | "8/16-bit integers in zmm registers require AVX512BW" ); |
static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
"integral conversions with zmm registers require AVX512F");
1572 | } |
1573 | // concat => use 2-arg __convert_x86 {{{2 |
1574 | if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
1575 | || (sizeof(__v0) == 16 && __have_avx |
1576 | && is_floating_point_v<_Tp>) |
1577 | || (sizeof(__v0) == 32 && __have_avx512f)) |
1578 | { |
1579 | // The ISA can handle wider input registers, so concat and use two-arg |
1580 | // implementation. This reduces code duplication considerably. |
1581 | return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3)); |
1582 | } |
1583 | else //{{{2 |
1584 | { |
1585 | // conversion using bit reinterpretation (or no conversion at all) |
1586 | // should all go through the concat branch above: |
1587 | static_assert( |
1588 | !(is_floating_point_v< |
1589 | _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); |
1590 | // handle all zero extension{{{2 |
1591 | if constexpr (4 * _Np < _M && sizeof(_To) > 16) |
1592 | { |
1593 | constexpr size_t Min = 16 / sizeof(_Up); |
1594 | return __zero_extend( |
1595 | __convert_x86< |
1596 | __vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>( |
1597 | __v0, __v1, __v2, __v3)); |
1598 | } |
1599 | else if constexpr (__i64_to_i16) //{{{2 |
1600 | { |
1601 | if constexpr (__x_to_x && __have_sse4_1) |
1602 | { |
1603 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1604 | _mm_blend_epi16( |
1605 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22), |
1606 | _mm_blend_epi16(_mm_slli_si128(__i2, 4), |
1607 | _mm_slli_si128(__i3, 6), 0x88), |
1608 | 0xcc), |
_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
14, 15)));
1611 | } |
1612 | else if constexpr (__y_to_y && __have_avx2) |
1613 | { |
1614 | return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
__xzyw(_mm256_blend_epi16(
1616 | __auto_bitcast( |
1617 | _mm256_shuffle_ps(__vector_bitcast<float>(__v0), |
1618 | __vector_bitcast<float>(__v2), |
1619 | 0x88)), // 0.1. 8.9. 2.3. A.B. |
1620 | __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps( |
1621 | __vector_bitcast<float>(__v1), |
1622 | __vector_bitcast<float>(__v3), 0x88)) |
1623 | << 16), // .4.5 .C.D .6.7 .E.F |
1624 | 0xaa) // 0415 8C9D 2637 AEBF |
1625 | ), // 0415 2637 8C9D AEBF |
_mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
10, 11, 14, 15)));
1629 | /* |
auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26.. ....
auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15.. .... 37.. ....
auto __c = _mm256_unpacklo_epi16(__v2, __v3); // 8C.. .... AE.. ....
auto __d = _mm256_unpackhi_epi16(__v2, __v3); // 9D.. .... BF.. ....
auto __e = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
auto __f = _mm256_unpacklo_epi16(__c, __d); // 89CD .... ABEF ....
auto __g = _mm256_unpacklo_epi64(__e, __f); // 0145 89CD 2367 ABEF
return __concat(
_mm_unpacklo_epi32(__lo128(__g), __hi128(__g)),
_mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123 4567 89AB CDEF
1645 | */ |
1646 | } // else use fallback |
1647 | } |
1648 | else if constexpr (__i64_to_i8) //{{{2 |
1649 | { |
1650 | if constexpr (__x_to_x) |
1651 | { |
1652 | // TODO: use fallback for now |
1653 | } |
1654 | else if constexpr (__y_to_x) |
1655 | { |
1656 | auto __a |
1657 | = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24) |
1658 | | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16) |
1659 | | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8) |
1660 | | _mm256_slli_epi32( |
1661 | __i3, 24); // 048C .... 159D .... 26AE .... 37BF .... |
1662 | /*return _mm_shuffle_epi8( |
1663 | _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5), |
1664 | _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15, |
1665 | 3, 11));*/ |
1666 | auto __b = _mm256_unpackhi_epi64( |
1667 | __a, __a); // 159D .... 159D .... 37BF .... 37BF .... |
1668 | auto __c = _mm256_unpacklo_epi8( |
1669 | __a, __b); // 0145 89CD .... .... 2367 ABEF .... .... |
1670 | return __intrin_bitcast<_To>( |
1671 | _mm_unpacklo_epi16(__lo128(__c), |
1672 | __hi128(__c))); // 0123 4567 89AB CDEF |
1673 | } |
1674 | } |
1675 | else if constexpr (__i32_to_i8) //{{{2 |
1676 | { |
1677 | if constexpr (__x_to_x) |
1678 | { |
1679 | if constexpr (__have_ssse3) |
1680 | { |
1681 | const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff; |
1682 | const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff) |
1683 | << 8; |
1684 | const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff) |
1685 | << 16; |
1686 | const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24; |
1687 | return __intrin_bitcast<_To>( |
1688 | _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3), |
_mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11,
15)));
1692 | } |
1693 | else |
1694 | { |
1695 | auto __a |
1696 | = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. .... |
1697 | auto __b |
1698 | = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. .... |
1699 | auto __c |
1700 | = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. .... |
1701 | auto __d |
1702 | = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. .... |
1703 | auto __e |
1704 | = _mm_unpacklo_epi8(__a, __c); // 048C .... .... .... |
1705 | auto __f |
1706 | = _mm_unpackhi_epi8(__a, __c); // 159D .... .... .... |
1707 | auto __g |
1708 | = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... .... |
1709 | auto __h |
1710 | = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... .... |
1711 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8( |
1712 | _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... .... |
1713 | _mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... .... |
1714 | )); // 0123 4567 89AB CDEF |
1715 | } |
1716 | } |
1717 | else if constexpr (__y_to_y) |
1718 | { |
1719 | const auto __a = _mm256_shuffle_epi8( |
__to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
1721 | __i0, _mm256_slli_epi32(__i1, 16), 0xAA)) |
1722 | & 0xff) |
1723 | | (__vector_bitcast<_UShort>(_mm256_blend_epi16( |
1724 | __i2, _mm256_slli_epi32(__i3, 16), 0xAA)) |
1725 | << 8)), |
_mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
13, 3, 7, 11, 15));
1729 | return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32( |
__a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
1731 | } |
1732 | } |
1733 | else if constexpr (__i64_to_f32) //{{{2 |
1734 | { |
1735 | // this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm |
1736 | // integers) |
1737 | if constexpr (__x_to_y) |
1738 | { |
1739 | return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1], |
1740 | __v2[0], __v2[1], __v3[0], |
1741 | __v3[1]); |
1742 | |
1743 | const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC |
1744 | const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD |
1745 | const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG |
1746 | const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH |
1747 | const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd |
1748 | const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh |
1749 | const auto __hi32 = __vector_bitcast< |
1750 | conditional_t<is_signed_v<_Tp>, int, _UInt>>( |
1751 | __concat(_mm_unpackhi_epi32(__a, __b), |
1752 | _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH |
1753 | const auto __hi |
1754 | = 0x100000000LL |
1755 | * __convert_x86<__vector_type_t<float, 8>>(__hi32); |
1756 | const auto __mid |
1757 | = 0x10000 |
1758 | * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16), |
1759 | _mm_srli_epi32(__lo32b, 16))); |
1760 | const auto __lo = _mm256_cvtepi32_ps( |
__concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
_mm_set1_epi32(0x0000ffffu) & __lo32b));
1763 | return (__hi + __mid) + __lo; |
1764 | } |
1765 | } |
1766 | else if constexpr (__f64_to_ibw) //{{{2 |
1767 | { |
1768 | return __convert_x86<_To>( |
1769 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), |
1770 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3)); |
1771 | } |
1772 | else if constexpr (__f32_to_ibw) //{{{2 |
1773 | { |
1774 | return __convert_x86<_To>( |
1775 | __convert_x86<__vector_type_t<int, _Np>>(__v0), |
1776 | __convert_x86<__vector_type_t<int, _Np>>(__v1), |
1777 | __convert_x86<__vector_type_t<int, _Np>>(__v2), |
1778 | __convert_x86<__vector_type_t<int, _Np>>(__v3)); |
1779 | } //}}} |
1780 | |
1781 | // fallback: {{{2 |
1782 | if constexpr (sizeof(_To) >= 32) |
1783 | // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
1784 | return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, |
1785 | __v1), |
1786 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2, |
1787 | __v3)); |
1788 | else if constexpr (sizeof(_To) == 16) |
1789 | { |
1790 | const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1)); |
1791 | const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3)); |
1792 | if constexpr (sizeof(_Up) * _Np * 2 == 8) |
1793 | { |
1794 | if constexpr (is_floating_point_v<_Up>) |
1795 | return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi)); |
1796 | else |
1797 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
1798 | } |
1799 | else if constexpr (sizeof(_Up) * _Np * 2 == 4) |
1800 | { |
1801 | if constexpr (is_floating_point_v<_Up>) |
1802 | return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi)); |
1803 | else |
1804 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); |
1805 | } |
1806 | else |
1807 | __assert_unreachable<_Tp>(); |
1808 | } |
1809 | else |
1810 | return __vector_convert<_To>(__v0, __v1, __v2, __v3, |
1811 | make_index_sequence<_Np>()); |
1812 | //}}}2 |
1813 | } |
1814 | } |
1815 | |
1816 | //}}} |
1817 | // 8-arg __convert_x86 {{{1 |
1818 | template <typename _To, typename _V, typename _Traits> |
1819 | _GLIBCXX_SIMD_INTRINSIC _To |
1820 | __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, |
1821 | _V __v7) |
1822 | { |
1823 | static_assert(__is_vector_type_v<_V>); |
1824 | using _Tp = typename _Traits::value_type; |
1825 | constexpr size_t _Np = _Traits::_S_full_size; |
1826 | [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
1827 | [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
1828 | [[maybe_unused]] const auto __i2 = __to_intrin(__v2); |
1829 | [[maybe_unused]] const auto __i3 = __to_intrin(__v3); |
1830 | [[maybe_unused]] const auto __i4 = __to_intrin(__v4); |
1831 | [[maybe_unused]] const auto __i5 = __to_intrin(__v5); |
1832 | [[maybe_unused]] const auto __i6 = __to_intrin(__v6); |
1833 | [[maybe_unused]] const auto __i7 = __to_intrin(__v7); |
1834 | using _Up = typename _VectorTraits<_To>::value_type; |
1835 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
1836 | |
1837 | static_assert(8 * _Np <= _M, |
1838 | "__v4-__v7 would be discarded; use the four/two/one-argument " |
1839 | "__convert_x86 overload instead" ); |
1840 | |
1841 | // [xyz]_to_[xyz] {{{2 |
1842 | [[maybe_unused]] constexpr bool __x_to_x |
1843 | = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
1844 | [[maybe_unused]] constexpr bool __x_to_y |
1845 | = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
1846 | [[maybe_unused]] constexpr bool __x_to_z |
1847 | = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
1848 | [[maybe_unused]] constexpr bool __y_to_x |
1849 | = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
1850 | [[maybe_unused]] constexpr bool __y_to_y |
1851 | = sizeof(__v0) == 32 && sizeof(_To) == 32; |
1852 | [[maybe_unused]] constexpr bool __y_to_z |
1853 | = sizeof(__v0) == 32 && sizeof(_To) == 64; |
1854 | [[maybe_unused]] constexpr bool __z_to_x |
1855 | = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
1856 | [[maybe_unused]] constexpr bool __z_to_y |
1857 | = sizeof(__v0) == 64 && sizeof(_To) == 32; |
1858 | [[maybe_unused]] constexpr bool __z_to_z |
1859 | = sizeof(__v0) == 64 && sizeof(_To) == 64; |
1860 | |
1861 | // [if]X_to_i8 {{{2 |
1862 | [[maybe_unused]] constexpr bool __i_to_i |
1863 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
1864 | [[maybe_unused]] constexpr bool __i64_to_i8 |
1865 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
1866 | [[maybe_unused]] constexpr bool __f64_to_i8 |
1867 | = is_integral_v<_Up> && sizeof(_Up) == 1 |
1868 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1869 | |
1870 | if constexpr (__i_to_i) // assert ISA {{{2 |
1871 | { |
1872 | static_assert(__x_to_x || __have_avx2, |
1873 | "integral conversions with ymm registers require AVX2" ); |
1874 | static_assert(__have_avx512bw |
1875 | || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
1876 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
1877 | "8/16-bit integers in zmm registers require AVX512BW" ); |
static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
"integral conversions with zmm registers require AVX512F");
1880 | } |
1881 | // concat => use 4-arg __convert_x86 {{{2 |
1882 | if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
1883 | || (sizeof(__v0) == 16 && __have_avx |
1884 | && is_floating_point_v<_Tp>) |
1885 | || (sizeof(__v0) == 32 && __have_avx512f)) |
1886 | { |
1887 | // The ISA can handle wider input registers, so concat and use two-arg |
1888 | // implementation. This reduces code duplication considerably. |
1889 | return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), |
1890 | __concat(__v4, __v5), __concat(__v6, __v7)); |
1891 | } |
1892 | else //{{{2 |
1893 | { |
1894 | // conversion using bit reinterpretation (or no conversion at all) |
1895 | // should all go through the concat branch above: |
1896 | static_assert( |
1897 | !(is_floating_point_v< |
1898 | _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); |
1899 | static_assert(!(8 * _Np < _M && sizeof(_To) > 16), |
1900 | "zero extension should be impossible" ); |
1901 | if constexpr (__i64_to_i8) //{{{2 |
1902 | { |
1903 | if constexpr (__x_to_x && __have_ssse3) |
1904 | { |
1905 | // unsure whether this is better than the variant below |
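// (Roughly: the low byte of every input is OR-ed into a distinct byte
// of each 64-bit element, then pshufb interleaves the two 64-bit
// elements into the final 0..15 output order.)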
1906 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1907 | __to_intrin( |
1908 | (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) |
1909 | | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) |
1910 | | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) |
1911 | | (((__v6 & 0xff) << 48) | (__v7 << 56)))), |
_mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
7, 15)));
1914 | } |
1915 | else if constexpr (__x_to_x) |
1916 | { |
1917 | const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac |
1918 | const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd |
1919 | const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg |
1920 | const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh |
1921 | const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik |
1922 | const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl |
1923 | const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo |
1924 | const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np |
1925 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64( |
1926 | _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd |
1927 | _mm_unpacklo_epi8(__c, __d)), // efgh |
1928 | _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl |
1929 | _mm_unpacklo_epi8(__g, __h)) // mnop |
1930 | )); |
1931 | } |
1932 | else if constexpr (__y_to_y) |
1933 | { |
1934 | auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV |
1935 | __to_intrin( |
1936 | (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) |
1937 | | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) |
1938 | | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) |
1939 | | (((__v6 & 0xff) << 48) | ((__v7 << 56))))); |
1940 | /* |
auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D HLPT 37BF JNRV 37BF JNRV
auto __c = _mm256_unpacklo_epi8(__a, __b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV
auto __d = __xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV
return _mm256_shuffle_epi8(
__d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12,
13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
14, 15));
1949 | */ |
1950 | auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF |
1951 | // IJMN QRUV |
__a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
4, 12, 5, 13, 6, 14, 7, 15));
1955 | auto __c |
1956 | = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV |
1957 | return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
__c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15)));
1961 | } |
1962 | else if constexpr (__z_to_z) |
1963 | { |
1964 | return __concat( |
1965 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, |
1966 | __v3), |
1967 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, |
1968 | __v7)); |
1969 | } |
1970 | } |
1971 | else if constexpr (__f64_to_i8) //{{{2 |
1972 | { |
1973 | return __convert_x86<_To>( |
1974 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), |
1975 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3), |
1976 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5), |
1977 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7)); |
1978 | } |
1979 | else // unreachable {{{2 |
1980 | __assert_unreachable<_Tp>(); |
1981 | //}}} |
1982 | |
1983 | // fallback: {{{2 |
1984 | if constexpr (sizeof(_To) >= 32) |
1985 | // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
1986 | return __concat( |
1987 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3), |
1988 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, |
1989 | __v7)); |
1990 | else if constexpr (sizeof(_To) == 16) |
1991 | { |
1992 | const auto __lo |
1993 | = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3)); |
1994 | const auto __hi |
1995 | = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7)); |
1996 | static_assert(sizeof(_Up) == 1 && _Np == 2); |
1997 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
1998 | } |
1999 | else |
2000 | { |
2001 | __assert_unreachable<_Tp>(); |
2002 | // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5, |
2003 | // __v6, __v7, |
2004 | // make_index_sequence<_Np>()); |
2005 | } //}}}2 |
2006 | } |
2007 | } |
2008 | |
2009 | //}}} |
2010 | // 16-arg __convert_x86 {{{1 |
2011 | template <typename _To, typename _V, typename _Traits> |
2012 | _GLIBCXX_SIMD_INTRINSIC _To |
2013 | __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, |
2014 | _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12, |
2015 | _V __v13, _V __v14, _V __v15) |
2016 | { |
2017 | // concat => use 8-arg __convert_x86 |
2018 | return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), |
2019 | __concat(__v4, __v5), __concat(__v6, __v7), |
2020 | __concat(__v8, __v9), __concat(__v10, __v11), |
2021 | __concat(__v12, __v13), __concat(__v14, __v15)); |
2022 | } |
2023 | |
2024 | //}}} |
2025 | |
2026 | #endif // __cplusplus >= 201703L |
2027 | #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
2028 | |
2029 | // vim: foldmethod=marker |
2030 | |