1 | // x86 specific conversion optimizations -*- C++ -*- |
2 | |
3 | // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
26 | #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
27 | |
28 | #if __cplusplus >= 201703L |
29 | |
30 | // work around PR85827 |
31 | // 1-arg __convert_x86 {{{1 |
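// Convert the _Np elements of __v (of value type _Tp) to a _To vector holding
// _M elements of value type _Up. Each branch below selects the cheapest
// instruction sequence the enabled ISA extensions allow; anything not handled
// explicitly drops through to the element-wise __vector_convert fallback at
// the end of the function.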
32 | template <typename _To, typename _V, typename _Traits> |
33 | _GLIBCXX_SIMD_INTRINSIC _To |
34 | __convert_x86(_V __v) |
35 | { |
36 | static_assert(__is_vector_type_v<_V>); |
37 | using _Tp = typename _Traits::value_type; |
38 | constexpr size_t _Np = _Traits::_S_full_size; |
39 | [[maybe_unused]] const auto __intrin = __to_intrin(__v); |
40 | using _Up = typename _VectorTraits<_To>::value_type; |
41 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
42 | |
43 | // [xyz]_to_[xyz] {{{2 |
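  // x, y, and z refer to the register size of input and output:
  // xmm (<= 16 bytes), ymm (32 bytes), and zmm (64 bytes).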
44 | [[maybe_unused]] constexpr bool __x_to_x |
45 | = sizeof(__v) <= 16 && sizeof(_To) <= 16; |
46 | [[maybe_unused]] constexpr bool __x_to_y |
47 | = sizeof(__v) <= 16 && sizeof(_To) == 32; |
48 | [[maybe_unused]] constexpr bool __x_to_z |
49 | = sizeof(__v) <= 16 && sizeof(_To) == 64; |
50 | [[maybe_unused]] constexpr bool __y_to_x |
51 | = sizeof(__v) == 32 && sizeof(_To) <= 16; |
52 | [[maybe_unused]] constexpr bool __y_to_y |
53 | = sizeof(__v) == 32 && sizeof(_To) == 32; |
54 | [[maybe_unused]] constexpr bool __y_to_z |
55 | = sizeof(__v) == 32 && sizeof(_To) == 64; |
56 | [[maybe_unused]] constexpr bool __z_to_x |
57 | = sizeof(__v) == 64 && sizeof(_To) <= 16; |
58 | [[maybe_unused]] constexpr bool __z_to_y |
59 | = sizeof(__v) == 64 && sizeof(_To) == 32; |
60 | [[maybe_unused]] constexpr bool __z_to_z |
61 | = sizeof(__v) == 64 && sizeof(_To) == 64; |
62 | |
63 | // iX_to_iX {{{2 |
64 | [[maybe_unused]] constexpr bool __i_to_i |
65 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
66 | [[maybe_unused]] constexpr bool __i8_to_i16 |
67 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
68 | [[maybe_unused]] constexpr bool __i8_to_i32 |
69 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
70 | [[maybe_unused]] constexpr bool __i8_to_i64 |
71 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
72 | [[maybe_unused]] constexpr bool __i16_to_i8 |
73 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
74 | [[maybe_unused]] constexpr bool __i16_to_i32 |
75 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
76 | [[maybe_unused]] constexpr bool __i16_to_i64 |
77 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
78 | [[maybe_unused]] constexpr bool __i32_to_i8 |
79 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
80 | [[maybe_unused]] constexpr bool __i32_to_i16 |
81 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
82 | [[maybe_unused]] constexpr bool __i32_to_i64 |
83 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
84 | [[maybe_unused]] constexpr bool __i64_to_i8 |
85 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
86 | [[maybe_unused]] constexpr bool __i64_to_i16 |
87 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
88 | [[maybe_unused]] constexpr bool __i64_to_i32 |
89 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
90 | |
91 | // [fsu]X_to_[fsu]X {{{2 |
92 | // ibw = integral && byte or word, i.e. char and short with any signedness |
93 | [[maybe_unused]] constexpr bool __s64_to_f32 |
94 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
95 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
96 | [[maybe_unused]] constexpr bool __s32_to_f32 |
97 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
98 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
99 | [[maybe_unused]] constexpr bool __s16_to_f32 |
100 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
101 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
102 | [[maybe_unused]] constexpr bool __s8_to_f32 |
103 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
104 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
105 | [[maybe_unused]] constexpr bool __u64_to_f32 |
106 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
107 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
108 | [[maybe_unused]] constexpr bool __u32_to_f32 |
109 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
110 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
111 | [[maybe_unused]] constexpr bool __u16_to_f32 |
112 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
113 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
114 | [[maybe_unused]] constexpr bool __u8_to_f32 |
115 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
116 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
117 | [[maybe_unused]] constexpr bool __s64_to_f64 |
118 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
119 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
120 | [[maybe_unused]] constexpr bool __s32_to_f64 |
121 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
122 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
123 | [[maybe_unused]] constexpr bool __u64_to_f64 |
124 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
125 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
126 | [[maybe_unused]] constexpr bool __u32_to_f64 |
127 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
128 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
129 | [[maybe_unused]] constexpr bool __f32_to_s64 |
130 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
131 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
132 | [[maybe_unused]] constexpr bool __f32_to_s32 |
133 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
134 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
135 | [[maybe_unused]] constexpr bool __f32_to_u64 |
136 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
137 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
138 | [[maybe_unused]] constexpr bool __f32_to_u32 |
139 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
140 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
141 | [[maybe_unused]] constexpr bool __f64_to_s64 |
142 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
143 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
144 | [[maybe_unused]] constexpr bool __f64_to_s32 |
145 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
146 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
147 | [[maybe_unused]] constexpr bool __f64_to_u64 |
148 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
149 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
150 | [[maybe_unused]] constexpr bool __f64_to_u32 |
151 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
152 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
153 | [[maybe_unused]] constexpr bool __ibw_to_f32 |
154 | = is_integral_v<_Tp> && sizeof(_Tp) <= 2 |
155 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
156 | [[maybe_unused]] constexpr bool __ibw_to_f64 |
157 | = is_integral_v<_Tp> && sizeof(_Tp) <= 2 |
158 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
159 | [[maybe_unused]] constexpr bool __f32_to_ibw |
160 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
161 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
162 | [[maybe_unused]] constexpr bool __f64_to_ibw |
163 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
164 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
165 | [[maybe_unused]] constexpr bool __f32_to_f64 |
166 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
167 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
168 | [[maybe_unused]] constexpr bool __f64_to_f32 |
169 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
170 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
171 | |
172 | if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
173 | return __convert_x86<_To>(__lo128(__v), __hi128(__v)); |
174 | else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2 |
175 | return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v), |
176 | __convert_x86<__vector_type_t<_Up, _M / 2>>( |
177 | __extract_part<1, _Np / _M * 2>(__v))); |
178 | else if constexpr (__i_to_i) //{{{2 |
179 | { |
180 | static_assert(__x_to_x || __have_avx2, |
181 | "integral conversions with ymm registers require AVX2" ); |
182 | static_assert(__have_avx512bw |
183 | || ((sizeof(_Tp) >= 4 || sizeof(__v) < 64) |
184 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
185 | "8/16-bit integers in zmm registers require AVX512BW" ); |
186 | static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) || __have_avx512f, |
187 | "integral conversions with ymm registers require AVX2" ); |
188 | } |
189 | if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2 |
190 | sizeof(_Tp) == sizeof(_Up)) |
191 | { |
192 | // conversion uses simple bit reinterpretation (or no conversion at all) |
193 | if constexpr (_Np >= _M) |
194 | return __intrin_bitcast<_To>(__v); |
195 | else |
196 | return __zero_extend(__vector_bitcast<_Up>(__v)); |
197 | } |
198 | else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2 |
199 | // zero extend (eg. xmm -> ymm) |
200 | return __zero_extend( |
201 | __convert_x86<__vector_type_t< |
202 | _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v)); |
203 | else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2 |
204 | // partial input (eg. ymm -> xmm) |
205 | return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v)); |
206 | else if constexpr (__i64_to_i32) //{{{2 |
207 | { |
208 | if constexpr (__x_to_x && __have_avx512vl) |
209 | return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin)); |
210 | else if constexpr (__x_to_x) |
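	// without AVX512VL: _mm_shuffle_ps with immediate 8 picks the low 32 bits
	// of both 64-bit elements (float lanes 0 and 2) and zeros the upper half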
211 | return __auto_bitcast( |
212 | _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8)); |
213 | else if constexpr (__y_to_x && __have_avx512vl) |
214 | return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin)); |
215 | else if constexpr (__y_to_x && __have_avx512f) |
216 | return __intrin_bitcast<_To>( |
217 | __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v)))); |
218 | else if constexpr (__y_to_x) |
219 | return __intrin_bitcast<_To>( |
220 | __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8), |
221 | 0 + 4 * 2))); |
222 | else if constexpr (__z_to_y) |
223 | return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin)); |
224 | } |
225 | else if constexpr (__i64_to_i16) //{{{2 |
226 | { |
227 | if constexpr (__x_to_x && __have_avx512vl) |
228 | return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin)); |
229 | else if constexpr (__x_to_x && __have_avx512f) |
230 | return __intrin_bitcast<_To>( |
231 | __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v)))); |
232 | else if constexpr (__x_to_x && __have_ssse3) |
233 | { |
234 | return __intrin_bitcast<_To>( |
235 | _mm_shuffle_epi8(__intrin, |
			       _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80)));
239 | // fallback without SSSE3 |
240 | } |
241 | else if constexpr (__y_to_x && __have_avx512vl) |
242 | return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin)); |
243 | else if constexpr (__y_to_x && __have_avx512f) |
244 | return __intrin_bitcast<_To>( |
245 | __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v)))); |
246 | else if constexpr (__y_to_x) |
247 | { |
248 | const auto __a = _mm256_shuffle_epi8( |
249 | __intrin, |
	    _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80));
255 | return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
256 | } |
257 | else if constexpr (__z_to_x) |
258 | return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin)); |
259 | } |
260 | else if constexpr (__i64_to_i8) //{{{2 |
261 | { |
262 | if constexpr (__x_to_x && __have_avx512vl) |
263 | return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin)); |
264 | else if constexpr (__x_to_x && __have_avx512f) |
265 | return __intrin_bitcast<_To>( |
266 | __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin)))); |
267 | else if constexpr (__y_to_x && __have_avx512vl) |
268 | return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin)); |
269 | else if constexpr (__y_to_x && __have_avx512f) |
270 | return __intrin_bitcast<_To>( |
271 | _mm512_cvtepi64_epi8(__zero_extend(__intrin))); |
272 | else if constexpr (__z_to_x) |
273 | return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin)); |
274 | } |
275 | else if constexpr (__i32_to_i64) //{{{2 |
276 | { |
277 | if constexpr (__have_sse4_1 && __x_to_x) |
278 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
279 | ? _mm_cvtepi32_epi64(__intrin) |
280 | : _mm_cvtepu32_epi64(__intrin)); |
281 | else if constexpr (__x_to_x) |
282 | { |
283 | return __intrin_bitcast<_To>( |
284 | _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp> |
285 | ? _mm_srai_epi32(__intrin, 31) |
286 | : __m128i())); |
287 | } |
288 | else if constexpr (__x_to_y) |
289 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
290 | ? _mm256_cvtepi32_epi64(__intrin) |
291 | : _mm256_cvtepu32_epi64(__intrin)); |
292 | else if constexpr (__y_to_z) |
293 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
294 | ? _mm512_cvtepi32_epi64(__intrin) |
295 | : _mm512_cvtepu32_epi64(__intrin)); |
296 | } |
297 | else if constexpr (__i32_to_i16) //{{{2 |
298 | { |
299 | if constexpr (__x_to_x && __have_avx512vl) |
300 | return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin)); |
301 | else if constexpr (__x_to_x && __have_avx512f) |
302 | return __intrin_bitcast<_To>( |
303 | __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v)))); |
304 | else if constexpr (__x_to_x && __have_ssse3) |
305 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
	  __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
308 | else if constexpr (__x_to_x) |
309 | { |
310 | auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o |
311 | auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o |
312 | auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo |
313 | auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo |
314 | return __intrin_bitcast<_To>( |
315 | _mm_unpacklo_epi16(__c, __d)); // 0123 oooo |
316 | } |
317 | else if constexpr (__y_to_x && __have_avx512vl) |
318 | return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin)); |
319 | else if constexpr (__y_to_x && __have_avx512f) |
320 | return __intrin_bitcast<_To>( |
321 | __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v)))); |
322 | else if constexpr (__y_to_x) |
323 | { |
324 | auto __a = _mm256_shuffle_epi8( |
325 | __intrin, |
	    _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
			     9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80));
330 | return __intrin_bitcast<_To>(__lo128( |
331 | _mm256_permute4x64_epi64(__a, |
332 | 0xf8))); // __a[0] __a[2] | __a[3] __a[3] |
333 | } |
334 | else if constexpr (__z_to_y) |
335 | return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin)); |
336 | } |
337 | else if constexpr (__i32_to_i8) //{{{2 |
338 | { |
339 | if constexpr (__x_to_x && __have_avx512vl) |
340 | return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin)); |
341 | else if constexpr (__x_to_x && __have_avx512f) |
342 | return __intrin_bitcast<_To>( |
343 | __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin)))); |
344 | else if constexpr (__x_to_x && __have_ssse3) |
345 | { |
346 | return __intrin_bitcast<_To>( |
347 | _mm_shuffle_epi8(__intrin, |
			       _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80, -0x80,
					     -0x80, -0x80, -0x80, -0x80)));
351 | } |
352 | else if constexpr (__x_to_x) |
353 | { |
354 | const auto __a |
355 | = _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... .... |
356 | const auto __b |
357 | = _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... .... |
358 | const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... .... |
359 | const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... .... |
360 | const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... .... |
	  return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
362 | } |
363 | else if constexpr (__y_to_x && __have_avx512vl) |
364 | return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin)); |
365 | else if constexpr (__y_to_x && __have_avx512f) |
366 | return __intrin_bitcast<_To>( |
367 | _mm512_cvtepi32_epi8(__zero_extend(__intrin))); |
368 | else if constexpr (__z_to_x) |
369 | return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin)); |
370 | } |
371 | else if constexpr (__i16_to_i64) //{{{2 |
372 | { |
373 | if constexpr (__x_to_x && __have_sse4_1) |
374 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
375 | ? _mm_cvtepi16_epi64(__intrin) |
376 | : _mm_cvtepu16_epi64(__intrin)); |
377 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
378 | { |
379 | auto __x = _mm_srai_epi16(__intrin, 15); |
380 | auto __y = _mm_unpacklo_epi16(__intrin, __x); |
381 | __x = _mm_unpacklo_epi16(__x, __x); |
382 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x)); |
383 | } |
384 | else if constexpr (__x_to_x) |
385 | return __intrin_bitcast<_To>( |
386 | _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()), |
387 | __m128i())); |
388 | else if constexpr (__x_to_y) |
389 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
390 | ? _mm256_cvtepi16_epi64(__intrin) |
391 | : _mm256_cvtepu16_epi64(__intrin)); |
392 | else if constexpr (__x_to_z) |
393 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
394 | ? _mm512_cvtepi16_epi64(__intrin) |
395 | : _mm512_cvtepu16_epi64(__intrin)); |
396 | } |
397 | else if constexpr (__i16_to_i32) //{{{2 |
398 | { |
399 | if constexpr (__x_to_x && __have_sse4_1) |
400 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
401 | ? _mm_cvtepi16_epi32(__intrin) |
402 | : _mm_cvtepu16_epi32(__intrin)); |
403 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
404 | return __intrin_bitcast<_To>( |
405 | _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16)); |
406 | else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
407 | return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i())); |
408 | else if constexpr (__x_to_y) |
409 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
410 | ? _mm256_cvtepi16_epi32(__intrin) |
411 | : _mm256_cvtepu16_epi32(__intrin)); |
412 | else if constexpr (__y_to_z) |
413 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
414 | ? _mm512_cvtepi16_epi32(__intrin) |
415 | : _mm512_cvtepu16_epi32(__intrin)); |
416 | } |
417 | else if constexpr (__i16_to_i8) //{{{2 |
418 | { |
419 | if constexpr (__x_to_x && __have_avx512bw_vl) |
420 | return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin)); |
421 | else if constexpr (__x_to_x && __have_avx512bw) |
422 | return __intrin_bitcast<_To>( |
423 | __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin)))); |
424 | else if constexpr (__x_to_x && __have_ssse3) |
425 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
	  __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
428 | else if constexpr (__x_to_x) |
429 | { |
430 | auto __a |
431 | = _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33.. |
432 | auto __b |
433 | = _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77.. |
434 | auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 .... |
435 | auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 .... |
436 | auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... .... |
437 | auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... .... |
438 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f)); |
439 | } |
440 | else if constexpr (__y_to_x && __have_avx512bw_vl) |
441 | return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin)); |
442 | else if constexpr (__y_to_x && __have_avx512bw) |
443 | return __intrin_bitcast<_To>( |
444 | __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin)))); |
445 | else if constexpr (__y_to_x) |
446 | { |
447 | auto __a = _mm256_shuffle_epi8( |
448 | __intrin, |
	    _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
			     4, 6, 8, 10, 12, 14));
453 | return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
454 | } |
455 | else if constexpr (__z_to_y && __have_avx512bw) |
456 | return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin)); |
457 | else if constexpr (__z_to_y) |
458 | __assert_unreachable<_Tp>(); |
459 | } |
460 | else if constexpr (__i8_to_i64) //{{{2 |
461 | { |
462 | if constexpr (__x_to_x && __have_sse4_1) |
463 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
464 | ? _mm_cvtepi8_epi64(__intrin) |
465 | : _mm_cvtepu8_epi64(__intrin)); |
466 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
467 | { |
468 | if constexpr (__have_ssse3) |
469 | { |
470 | auto __dup = _mm_unpacklo_epi8(__intrin, __intrin); |
471 | auto __epi16 = _mm_srai_epi16(__dup, 8); |
	      // broadcast the sign byte over the upper bytes of each element
	      return __intrin_bitcast<_To>(
		_mm_shuffle_epi8(__epi16,
				 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
					       3, 3, 3, 3, 3)));
475 | } |
476 | else |
477 | { |
478 | auto __x = _mm_unpacklo_epi8(__intrin, __intrin); |
479 | __x = _mm_unpacklo_epi16(__x, __x); |
480 | return __intrin_bitcast<_To>( |
481 | _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24), |
482 | _mm_srai_epi32(__x, 31))); |
483 | } |
484 | } |
485 | else if constexpr (__x_to_x) |
486 | { |
487 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32( |
488 | _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()), |
489 | __m128i()), |
490 | __m128i())); |
491 | } |
492 | else if constexpr (__x_to_y) |
493 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
494 | ? _mm256_cvtepi8_epi64(__intrin) |
495 | : _mm256_cvtepu8_epi64(__intrin)); |
496 | else if constexpr (__x_to_z) |
497 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
498 | ? _mm512_cvtepi8_epi64(__intrin) |
499 | : _mm512_cvtepu8_epi64(__intrin)); |
500 | } |
501 | else if constexpr (__i8_to_i32) //{{{2 |
502 | { |
503 | if constexpr (__x_to_x && __have_sse4_1) |
504 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
505 | ? _mm_cvtepi8_epi32(__intrin) |
506 | : _mm_cvtepu8_epi32(__intrin)); |
507 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
508 | { |
509 | const auto __x = _mm_unpacklo_epi8(__intrin, __intrin); |
510 | return __intrin_bitcast<_To>( |
511 | _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24)); |
512 | } |
513 | else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
514 | return __intrin_bitcast<_To>( |
515 | _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()), |
516 | __m128i())); |
517 | else if constexpr (__x_to_y) |
518 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
519 | ? _mm256_cvtepi8_epi32(__intrin) |
520 | : _mm256_cvtepu8_epi32(__intrin)); |
521 | else if constexpr (__x_to_z) |
522 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
523 | ? _mm512_cvtepi8_epi32(__intrin) |
524 | : _mm512_cvtepu8_epi32(__intrin)); |
525 | } |
526 | else if constexpr (__i8_to_i16) //{{{2 |
527 | { |
528 | if constexpr (__x_to_x && __have_sse4_1) |
529 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
530 | ? _mm_cvtepi8_epi16(__intrin) |
531 | : _mm_cvtepu8_epi16(__intrin)); |
532 | else if constexpr (__x_to_x && is_signed_v<_Tp>) |
533 | return __intrin_bitcast<_To>( |
534 | _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8)); |
535 | else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
536 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i())); |
537 | else if constexpr (__x_to_y) |
538 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
539 | ? _mm256_cvtepi8_epi16(__intrin) |
540 | : _mm256_cvtepu8_epi16(__intrin)); |
541 | else if constexpr (__y_to_z && __have_avx512bw) |
542 | return __intrin_bitcast<_To>(is_signed_v<_Tp> |
543 | ? _mm512_cvtepi8_epi16(__intrin) |
544 | : _mm512_cvtepu8_epi16(__intrin)); |
545 | else if constexpr (__y_to_z) |
546 | __assert_unreachable<_Tp>(); |
547 | } |
548 | else if constexpr (__f32_to_s64) //{{{2 |
549 | { |
550 | if constexpr (__have_avx512dq_vl && __x_to_x) |
551 | return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin)); |
552 | else if constexpr (__have_avx512dq_vl && __x_to_y) |
553 | return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin)); |
554 | else if constexpr (__have_avx512dq && __y_to_z) |
555 | return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin)); |
556 | // else use scalar fallback |
557 | } |
558 | else if constexpr (__f32_to_u64) //{{{2 |
559 | { |
560 | if constexpr (__have_avx512dq_vl && __x_to_x) |
561 | return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin)); |
562 | else if constexpr (__have_avx512dq_vl && __x_to_y) |
563 | return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin)); |
564 | else if constexpr (__have_avx512dq && __y_to_z) |
565 | return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin)); |
566 | // else use scalar fallback |
567 | } |
568 | else if constexpr (__f32_to_s32) //{{{2 |
569 | { |
570 | if constexpr (__x_to_x || __y_to_y || __z_to_z) |
571 | { |
572 | // go to fallback, it does the right thing |
573 | } |
574 | else |
575 | __assert_unreachable<_Tp>(); |
576 | } |
577 | else if constexpr (__f32_to_u32) //{{{2 |
578 | { |
579 | if constexpr (__have_avx512vl && __x_to_x) |
580 | return __auto_bitcast(_mm_cvttps_epu32(__intrin)); |
581 | else if constexpr (__have_avx512f && __x_to_x) |
582 | return __auto_bitcast( |
583 | __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v)))); |
584 | else if constexpr (__have_avx512vl && __y_to_y) |
585 | return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin)); |
586 | else if constexpr (__have_avx512f && __y_to_y) |
587 | return __vector_bitcast<_Up>( |
588 | __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v)))); |
589 | else if constexpr (__x_to_x || __y_to_y || __z_to_z) |
590 | { |
591 | // go to fallback, it does the right thing. We can't use the |
592 | // _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would |
593 | // discard small input values (only 24 mantissa bits) |
594 | } |
595 | else |
596 | __assert_unreachable<_Tp>(); |
597 | } |
598 | else if constexpr (__f32_to_ibw) //{{{2 |
599 | return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v)); |
600 | else if constexpr (__f64_to_s64) //{{{2 |
601 | { |
602 | if constexpr (__have_avx512dq_vl && __x_to_x) |
603 | return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin)); |
604 | else if constexpr (__have_avx512dq_vl && __y_to_y) |
605 | return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin)); |
606 | else if constexpr (__have_avx512dq && __z_to_z) |
607 | return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin)); |
608 | // else use scalar fallback |
609 | } |
610 | else if constexpr (__f64_to_u64) //{{{2 |
611 | { |
612 | if constexpr (__have_avx512dq_vl && __x_to_x) |
613 | return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin)); |
614 | else if constexpr (__have_avx512dq_vl && __y_to_y) |
615 | return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin)); |
616 | else if constexpr (__have_avx512dq && __z_to_z) |
617 | return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin)); |
618 | // else use scalar fallback |
619 | } |
620 | else if constexpr (__f64_to_s32) //{{{2 |
621 | { |
622 | if constexpr (__x_to_x) |
623 | return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin)); |
624 | else if constexpr (__y_to_x) |
625 | return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin)); |
626 | else if constexpr (__z_to_y) |
627 | return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin)); |
628 | } |
629 | else if constexpr (__f64_to_u32) //{{{2 |
630 | { |
631 | if constexpr (__have_avx512vl && __x_to_x) |
632 | return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin)); |
633 | else if constexpr (__have_sse4_1 && __x_to_x) |
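	// without AVX512VL: bias the (non-negative) input by -2^31 into the range
	// of cvttpd_epi32, then flip the sign bit to undo the bias afterwards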
634 | return __vector_bitcast<_Up, _M>( |
635 | _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u)) |
636 | ^ 0x8000'0000u; |
637 | else if constexpr (__x_to_x) |
638 | { |
639 | // use scalar fallback: it's only 2 values to convert, can't get |
640 | // much better than scalar decomposition |
641 | } |
642 | else if constexpr (__have_avx512vl && __y_to_x) |
643 | return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin)); |
644 | else if constexpr (__y_to_x) |
645 | { |
646 | return __intrin_bitcast<_To>( |
647 | __vector_bitcast<_Up>( |
648 | _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u)) |
649 | ^ 0x8000'0000u); |
650 | } |
651 | else if constexpr (__z_to_y) |
652 | return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin)); |
653 | } |
654 | else if constexpr (__f64_to_ibw) //{{{2 |
655 | { |
656 | return __convert_x86<_To>( |
657 | __convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v)); |
658 | } |
659 | else if constexpr (__s64_to_f32) //{{{2 |
660 | { |
661 | if constexpr (__x_to_x && __have_avx512dq_vl) |
662 | return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin)); |
663 | else if constexpr (__y_to_x && __have_avx512dq_vl) |
664 | return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin)); |
665 | else if constexpr (__z_to_y && __have_avx512dq) |
666 | return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin)); |
667 | else if constexpr (__z_to_y) |
668 | return __intrin_bitcast<_To>( |
669 | _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v))); |
670 | } |
671 | else if constexpr (__u64_to_f32) //{{{2 |
672 | { |
673 | if constexpr (__x_to_x && __have_avx512dq_vl) |
674 | return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin)); |
675 | else if constexpr (__y_to_x && __have_avx512dq_vl) |
676 | return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin)); |
677 | else if constexpr (__z_to_y && __have_avx512dq) |
678 | return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin)); |
679 | else if constexpr (__z_to_y) |
680 | { |
681 | return __intrin_bitcast<_To>( |
682 | __lo256(_mm512_cvtepu32_ps(__auto_bitcast( |
683 | _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32))))) |
684 | * 0x100000000LL |
685 | + __lo256(_mm512_cvtepu32_ps( |
686 | __auto_bitcast(_mm512_cvtepi64_epi32(__intrin))))); |
687 | } |
688 | } |
689 | else if constexpr (__s32_to_f32) //{{{2 |
690 | { |
691 | // use fallback (builtin conversion) |
692 | } |
693 | else if constexpr (__u32_to_f32) //{{{2 |
694 | { |
695 | if constexpr (__x_to_x && __have_avx512vl) |
696 | { |
697 | // use fallback |
698 | } |
699 | else if constexpr (__x_to_x && __have_avx512f) |
700 | return __intrin_bitcast<_To>( |
701 | __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v)))); |
702 | else if constexpr (__x_to_x && (__have_fma || __have_fma4)) |
703 | // work around PR85819 |
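	// by converting the 16-bit halves separately (each one is exactly
	// representable in float) and recombining them as 0x10000 * hi + lo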
704 | return __auto_bitcast(0x10000 |
705 | * _mm_cvtepi32_ps(__to_intrin(__v >> 16)) |
706 | + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff))); |
707 | else if constexpr (__y_to_y && __have_avx512vl) |
708 | { |
709 | // use fallback |
710 | } |
711 | else if constexpr (__y_to_y && __have_avx512f) |
712 | return __intrin_bitcast<_To>( |
713 | __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v)))); |
714 | else if constexpr (__y_to_y) |
715 | // work around PR85819 |
716 | return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16)) |
717 | + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff)); |
718 | // else use fallback (builtin conversion) |
719 | } |
720 | else if constexpr (__ibw_to_f32) //{{{2 |
721 | { |
722 | if constexpr (_M <= 4 || __have_avx2) |
723 | return __convert_x86<_To>( |
724 | __convert_x86<__vector_type_t<int, _M>>(__v)); |
725 | else |
726 | { |
727 | static_assert(__x_to_y); |
728 | __m128i __a, __b; |
729 | if constexpr (__have_sse4_1) |
730 | { |
731 | __a = sizeof(_Tp) == 2 |
732 | ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin) |
733 | : _mm_cvtepu16_epi32(__intrin)) |
734 | : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin) |
735 | : _mm_cvtepu8_epi32(__intrin)); |
736 | const auto __w |
737 | = _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9); |
738 | __b = sizeof(_Tp) == 2 |
		 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
				     : _mm_cvtepu16_epi32(__w))
		 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
				     : _mm_cvtepu8_epi32(__w));
743 | } |
744 | else |
745 | { |
746 | __m128i __tmp; |
747 | if constexpr (sizeof(_Tp) == 1) |
748 | { |
749 | __tmp = is_signed_v<_Tp> |
750 | ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, |
751 | __intrin), |
752 | 8) |
753 | : _mm_unpacklo_epi8(__intrin, __m128i()); |
754 | } |
755 | else |
756 | { |
757 | static_assert(sizeof(_Tp) == 2); |
758 | __tmp = __intrin; |
759 | } |
760 | __a = is_signed_v<_Tp> |
		 ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
		 : _mm_unpacklo_epi16(__tmp, __m128i());
	  __b = is_signed_v<_Tp>
		 ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
		 : _mm_unpackhi_epi16(__tmp, __m128i());
766 | } |
	  return __convert_x86<_To>(__vector_bitcast<int>(__a),
				    __vector_bitcast<int>(__b));
769 | } |
770 | } |
771 | else if constexpr (__s64_to_f64) //{{{2 |
772 | { |
773 | if constexpr (__x_to_x && __have_avx512dq_vl) |
774 | return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin)); |
775 | else if constexpr (__y_to_y && __have_avx512dq_vl) |
776 | return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin)); |
777 | else if constexpr (__z_to_z && __have_avx512dq) |
778 | return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin)); |
779 | else if constexpr (__z_to_z) |
780 | { |
781 | return __intrin_bitcast<_To>( |
782 | _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32))) |
783 | * 0x100000000LL |
784 | + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin))); |
785 | } |
786 | } |
787 | else if constexpr (__u64_to_f64) //{{{2 |
788 | { |
789 | if constexpr (__x_to_x && __have_avx512dq_vl) |
790 | return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin)); |
791 | else if constexpr (__y_to_y && __have_avx512dq_vl) |
792 | return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin)); |
793 | else if constexpr (__z_to_z && __have_avx512dq) |
794 | return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin)); |
795 | else if constexpr (__z_to_z) |
796 | { |
797 | return __intrin_bitcast<_To>( |
798 | _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32))) |
799 | * 0x100000000LL |
800 | + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin))); |
801 | } |
802 | } |
803 | else if constexpr (__s32_to_f64) //{{{2 |
804 | { |
805 | if constexpr (__x_to_x) |
806 | return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin)); |
807 | else if constexpr (__x_to_y) |
808 | return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin)); |
809 | else if constexpr (__y_to_z) |
810 | return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin)); |
811 | } |
812 | else if constexpr (__u32_to_f64) //{{{2 |
813 | { |
814 | if constexpr (__x_to_x && __have_avx512vl) |
815 | return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin)); |
816 | else if constexpr (__x_to_x && __have_avx512f) |
817 | return __intrin_bitcast<_To>( |
818 | __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v)))); |
819 | else if constexpr (__x_to_x) |
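	// without AVX512F/VL: XOR with 0x8000'0000 yields the signed value
	// v - 2^31, which converts to double exactly; adding 2^31 afterwards
	// restores the original value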
820 | return __intrin_bitcast<_To>( |
821 | _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u); |
822 | else if constexpr (__x_to_y && __have_avx512vl) |
823 | return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin)); |
824 | else if constexpr (__x_to_y && __have_avx512f) |
825 | return __intrin_bitcast<_To>( |
826 | __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v)))); |
827 | else if constexpr (__x_to_y) |
828 | return __intrin_bitcast<_To>( |
829 | _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u); |
830 | else if constexpr (__y_to_z) |
831 | return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin)); |
832 | } |
833 | else if constexpr (__ibw_to_f64) //{{{2 |
834 | { |
835 | return __convert_x86<_To>( |
	__convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
837 | } |
838 | else if constexpr (__f32_to_f64) //{{{2 |
839 | { |
840 | if constexpr (__x_to_x) |
841 | return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin)); |
842 | else if constexpr (__x_to_y) |
843 | return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin)); |
844 | else if constexpr (__y_to_z) |
845 | return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin)); |
846 | } |
847 | else if constexpr (__f64_to_f32) //{{{2 |
848 | { |
849 | if constexpr (__x_to_x) |
850 | return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin)); |
851 | else if constexpr (__y_to_x) |
852 | return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin)); |
853 | else if constexpr (__z_to_y) |
854 | return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin)); |
855 | } |
856 | else //{{{2 |
857 | __assert_unreachable<_Tp>(); |
858 | |
859 | // fallback:{{{2 |
  return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
861 | //}}} |
862 | } |
863 | |
864 | // }}} |
865 | // 2-arg __convert_x86 {{{1 |
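// Convert the 2 * _Np elements of __v0 and __v1 into a single _To vector of
// _M elements. Whenever the enabled ISA can operate on a register of twice
// the input width, the two inputs are concatenated and forwarded to the
// one-argument overload above; the remaining branches pack two narrower
// registers directly.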
866 | template <typename _To, typename _V, typename _Traits> |
867 | _GLIBCXX_SIMD_INTRINSIC _To |
868 | __convert_x86(_V __v0, _V __v1) |
869 | { |
870 | static_assert(__is_vector_type_v<_V>); |
871 | using _Tp = typename _Traits::value_type; |
872 | constexpr size_t _Np = _Traits::_S_full_size; |
873 | [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
874 | [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
875 | using _Up = typename _VectorTraits<_To>::value_type; |
876 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
877 | |
878 | static_assert(2 * _Np <= _M, |
879 | "__v1 would be discarded; use the one-argument " |
880 | "__convert_x86 overload instead" ); |
881 | |
882 | // [xyz]_to_[xyz] {{{2 |
883 | [[maybe_unused]] constexpr bool __x_to_x |
884 | = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
885 | [[maybe_unused]] constexpr bool __x_to_y |
886 | = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
887 | [[maybe_unused]] constexpr bool __x_to_z |
888 | = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
889 | [[maybe_unused]] constexpr bool __y_to_x |
890 | = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
891 | [[maybe_unused]] constexpr bool __y_to_y |
892 | = sizeof(__v0) == 32 && sizeof(_To) == 32; |
893 | [[maybe_unused]] constexpr bool __y_to_z |
894 | = sizeof(__v0) == 32 && sizeof(_To) == 64; |
895 | [[maybe_unused]] constexpr bool __z_to_x |
896 | = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
897 | [[maybe_unused]] constexpr bool __z_to_y |
898 | = sizeof(__v0) == 64 && sizeof(_To) == 32; |
899 | [[maybe_unused]] constexpr bool __z_to_z |
900 | = sizeof(__v0) == 64 && sizeof(_To) == 64; |
901 | |
902 | // iX_to_iX {{{2 |
903 | [[maybe_unused]] constexpr bool __i_to_i |
904 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
905 | [[maybe_unused]] constexpr bool __i8_to_i16 |
906 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
907 | [[maybe_unused]] constexpr bool __i8_to_i32 |
908 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
909 | [[maybe_unused]] constexpr bool __i8_to_i64 |
910 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
911 | [[maybe_unused]] constexpr bool __i16_to_i8 |
912 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
913 | [[maybe_unused]] constexpr bool __i16_to_i32 |
914 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
915 | [[maybe_unused]] constexpr bool __i16_to_i64 |
916 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
917 | [[maybe_unused]] constexpr bool __i32_to_i8 |
918 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
919 | [[maybe_unused]] constexpr bool __i32_to_i16 |
920 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
921 | [[maybe_unused]] constexpr bool __i32_to_i64 |
922 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
923 | [[maybe_unused]] constexpr bool __i64_to_i8 |
924 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
925 | [[maybe_unused]] constexpr bool __i64_to_i16 |
926 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
927 | [[maybe_unused]] constexpr bool __i64_to_i32 |
928 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
929 | |
930 | // [fsu]X_to_[fsu]X {{{2 |
931 | // ibw = integral && byte or word, i.e. char and short with any signedness |
932 | [[maybe_unused]] constexpr bool __i64_to_f32 |
933 | = is_integral_v<_Tp> && sizeof(_Tp) == 8 |
934 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
935 | [[maybe_unused]] constexpr bool __s32_to_f32 |
936 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
937 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
938 | [[maybe_unused]] constexpr bool __s16_to_f32 |
939 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
940 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
941 | [[maybe_unused]] constexpr bool __s8_to_f32 |
942 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
943 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
944 | [[maybe_unused]] constexpr bool __u32_to_f32 |
945 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
946 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
947 | [[maybe_unused]] constexpr bool __u16_to_f32 |
948 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
949 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
950 | [[maybe_unused]] constexpr bool __u8_to_f32 |
951 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
952 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
953 | [[maybe_unused]] constexpr bool __s64_to_f64 |
954 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
955 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
956 | [[maybe_unused]] constexpr bool __s32_to_f64 |
957 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
958 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
959 | [[maybe_unused]] constexpr bool __s16_to_f64 |
960 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
961 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
962 | [[maybe_unused]] constexpr bool __s8_to_f64 |
963 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
964 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
965 | [[maybe_unused]] constexpr bool __u64_to_f64 |
966 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
967 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
968 | [[maybe_unused]] constexpr bool __u32_to_f64 |
969 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
970 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
971 | [[maybe_unused]] constexpr bool __u16_to_f64 |
972 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
973 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
974 | [[maybe_unused]] constexpr bool __u8_to_f64 |
975 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
976 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
977 | [[maybe_unused]] constexpr bool __f32_to_s64 |
978 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
979 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
980 | [[maybe_unused]] constexpr bool __f32_to_s32 |
981 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
982 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
983 | [[maybe_unused]] constexpr bool __f32_to_u64 |
984 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
985 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
986 | [[maybe_unused]] constexpr bool __f32_to_u32 |
987 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
988 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
989 | [[maybe_unused]] constexpr bool __f64_to_s64 |
990 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
991 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
992 | [[maybe_unused]] constexpr bool __f64_to_s32 |
993 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
994 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
995 | [[maybe_unused]] constexpr bool __f64_to_u64 |
996 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
997 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
998 | [[maybe_unused]] constexpr bool __f64_to_u32 |
999 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
1000 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1001 | [[maybe_unused]] constexpr bool __f32_to_ibw |
1002 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1003 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1004 | [[maybe_unused]] constexpr bool __f64_to_ibw |
1005 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1006 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1007 | [[maybe_unused]] constexpr bool __f32_to_f64 |
1008 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
1009 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1010 | [[maybe_unused]] constexpr bool __f64_to_f32 |
1011 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
1012 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1013 | |
1014 | if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
1015 | // <double, 4>, <double, 4> => <short, 8> |
1016 | return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), |
1017 | __hi128(__v1)); |
1018 | else if constexpr (__i_to_i) // assert ISA {{{2 |
1019 | { |
1020 | static_assert(__x_to_x || __have_avx2, |
1021 | "integral conversions with ymm registers require AVX2" ); |
1022 | static_assert(__have_avx512bw |
1023 | || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
1024 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
1025 | "8/16-bit integers in zmm registers require AVX512BW" ); |
1026 | static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, |
1027 | "integral conversions with ymm registers require AVX2" ); |
1028 | } |
1029 | // concat => use 1-arg __convert_x86 {{{2 |
1030 | if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
1031 | || (sizeof(__v0) == 16 && __have_avx |
1032 | && is_floating_point_v<_Tp>) |
1033 | || (sizeof(__v0) == 32 && __have_avx512f |
1034 | && (sizeof(_Tp) >= 4 || __have_avx512bw))) |
1035 | { |
1036 | // The ISA can handle wider input registers, so concat and use one-arg |
1037 | // implementation. This reduces code duplication considerably. |
1038 | return __convert_x86<_To>(__concat(__v0, __v1)); |
1039 | } |
1040 | else //{{{2 |
1041 | { |
1042 | // conversion using bit reinterpretation (or no conversion at all) |
1043 | // should all go through the concat branch above: |
1044 | static_assert( |
1045 | !(is_floating_point_v< |
1046 | _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); |
1047 | // handle all zero extension{{{2 |
1048 | if constexpr (2 * _Np < _M && sizeof(_To) > 16) |
1049 | { |
1050 | constexpr size_t Min = 16 / sizeof(_Up); |
1051 | return __zero_extend( |
1052 | __convert_x86< |
1053 | __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0, |
1054 | __v1)); |
1055 | } |
1056 | else if constexpr (__i64_to_i32) //{{{2 |
1057 | { |
1058 | if constexpr (__x_to_x) |
1059 | return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0), |
1060 | __auto_bitcast(__v1), 0x88)); |
1061 | else if constexpr (__y_to_y) |
1062 | { |
1063 | // AVX512F is not available (would concat otherwise) |
1064 | return __auto_bitcast( |
	      __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
1066 | __auto_bitcast(__v1), 0x88))); |
1067 | // alternative: |
1068 | // const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8); |
1069 | // const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8); |
1070 | // const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx, |
1071 | // v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh, |
1072 | // 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh |
1073 | } |
1074 | else if constexpr (__z_to_z) |
1075 | return __intrin_bitcast<_To>( |
1076 | __concat(_mm512_cvtepi64_epi32(__i0), |
1077 | _mm512_cvtepi64_epi32(__i1))); |
1078 | } |
1079 | else if constexpr (__i64_to_i16) //{{{2 |
1080 | { |
1081 | if constexpr (__x_to_x) |
1082 | { |
1083 | // AVX2 is not available (would concat otherwise) |
1084 | if constexpr (__have_sse4_1) |
1085 | { |
1086 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1087 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44), |
		_mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
			      -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
1090 | } |
1091 | else |
1092 | { |
1093 | return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]), |
1094 | _Up(__v1[0]), _Up(__v1[1])}; |
1095 | } |
1096 | } |
1097 | else if constexpr (__y_to_x) |
1098 | { |
1099 | auto __a |
1100 | = _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. .... |
1101 | auto __b |
1102 | = _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. .... |
1103 | auto __c |
1104 | = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 .... |
1105 | return __intrin_bitcast<_To>( |
1106 | _mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567 |
1107 | } |
1108 | else if constexpr (__z_to_y) |
1109 | return __intrin_bitcast<_To>( |
1110 | __concat(_mm512_cvtepi64_epi16(__i0), |
1111 | _mm512_cvtepi64_epi16(__i1))); |
1112 | } |
1113 | else if constexpr (__i64_to_i8) //{{{2 |
1114 | { |
1115 | if constexpr (__x_to_x && __have_sse4_1) |
1116 | { |
1117 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1118 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44), |
		_mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
			      -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			      -0x80)));
1122 | } |
1123 | else if constexpr (__x_to_x && __have_ssse3) |
1124 | { |
1125 | return __intrin_bitcast<_To>(_mm_unpacklo_epi16( |
1126 | _mm_shuffle_epi8( |
	      __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80)),
	    _mm_shuffle_epi8(
	      __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80, -0x80,
				  -0x80, -0x80, -0x80, -0x80))));
1134 | } |
1135 | else if constexpr (__x_to_x) |
1136 | { |
1137 | return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]), |
1138 | _Up(__v1[0]), _Up(__v1[1])}; |
1139 | } |
1140 | else if constexpr (__y_to_x) |
1141 | { |
1142 | const auto __a = _mm256_shuffle_epi8( |
1143 | _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA), |
	      _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
			       -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			       -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
			       -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
			       -0x80, -0x80, -0x80, -0x80));
	    return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1150 | } // __z_to_x uses concat fallback |
1151 | } |
1152 | else if constexpr (__i32_to_i16) //{{{2 |
1153 | { |
1154 | if constexpr (__x_to_x) |
1155 | { |
1156 | // AVX2 is not available (would concat otherwise) |
1157 | if constexpr (__have_sse4_1) |
1158 | { |
1159 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1160 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa), |
		    _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
				  11, 14, 15)));
1163 | } |
1164 | else if constexpr (__have_ssse3) |
1165 | { |
1166 | return __intrin_bitcast<_To>( |
1167 | _mm_hadd_epi16(__to_intrin(__v0 << 16), |
1168 | __to_intrin(__v1 << 16))); |
1169 | /* |
1170 | return _mm_unpacklo_epi64( |
1171 | _mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9, |
1172 | 12, 13, 8, 9, 12, 13, 12, 13, 14, 15)), |
1173 | _mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, |
1174 | 13, 8, 9, 12, 13, 12, 13, 14, 15))); |
1175 | */ |
1176 | } |
1177 | else |
1178 | { |
1179 | auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15.. |
1180 | auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37.. |
1181 | auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 .... |
1182 | auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 .... |
1183 | return __intrin_bitcast<_To>( |
1184 | _mm_unpacklo_epi16(__c, __d)); // 0123 4567 |
1185 | } |
1186 | } |
1187 | else if constexpr (__y_to_y) |
1188 | { |
1189 | const auto __shuf |
	      = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
				 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
				 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
				 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
1194 | auto __a = _mm256_shuffle_epi8(__i0, __shuf); |
1195 | auto __b = _mm256_shuffle_epi8(__i1, __shuf); |
1196 | return __intrin_bitcast<_To>( |
1197 | __xzyw(_mm256_unpacklo_epi64(__a, __b))); |
1198 | } // __z_to_z uses concat fallback |
1199 | } |
1200 | else if constexpr (__i32_to_i8) //{{{2 |
1201 | { |
1202 | if constexpr (__x_to_x && __have_ssse3) |
1203 | { |
1204 | const auto shufmask |
	      = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
			      -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
			      -0x80, -0x80);
1208 | return __intrin_bitcast<_To>( |
1209 | _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask), |
1210 | _mm_shuffle_epi8(__i1, shufmask))); |
1211 | } |
1212 | else if constexpr (__x_to_x) |
1213 | { |
1214 | auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. .... |
1215 | auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. .... |
1216 | auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... .... |
1217 | auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... .... |
1218 | auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... .... |
1219 | return __intrin_bitcast<_To>(__e & __m128i{-1, 0}); |
1220 | } |
1221 | else if constexpr (__y_to_x) |
1222 | { |
1223 | const auto __a = _mm256_shuffle_epi8( |
1224 | _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA), |
	      _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
			       6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
			       -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
			       -0x80, -0x80, -0x80, 2, 6, 10, 14));
return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1230 | } // __z_to_y uses concat fallback |
1231 | } |
1232 | else if constexpr (__i16_to_i8) //{{{2 |
1233 | { |
1234 | if constexpr (__x_to_x && __have_ssse3) |
1235 | { |
1236 | const auto __shuf = reinterpret_cast<__m128i>( |
1237 | __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80, |
1238 | 0x80, 0x80, 0x80, 0x80, 0x80, |
1239 | 0x80, 0x80}); |
1240 | return __intrin_bitcast<_To>( |
1241 | _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf), |
1242 | _mm_shuffle_epi8(__i1, __shuf))); |
1243 | } |
1244 | else if constexpr (__x_to_x) |
1245 | { |
1246 | auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B.. |
1247 | auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F.. |
1248 | auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D .... |
1249 | auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF .... |
1250 | auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... .... |
1251 | auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... .... |
1252 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f)); |
1253 | } |
1254 | else if constexpr (__y_to_y) |
1255 | { |
1256 | return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8( |
(__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
1258 | | _mm256_slli_epi16(__i1, 8), |
_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
7, 9, 11, 13, 15))));
1262 | } // __z_to_z uses concat fallback |
1263 | } |
1264 | else if constexpr (__i64_to_f32) //{{{2 |
1265 | { |
1266 | if constexpr (__x_to_x) |
1267 | return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]); |
1268 | else if constexpr (__y_to_y) |
1269 | { |
1270 | static_assert(__y_to_y && __have_avx2); |
1271 | const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG |
1272 | const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH |
1273 | const auto __lo32 |
1274 | = _mm256_unpacklo_epi32(__a, __b); // abef cdgh |
1275 | const auto __hi32 = __vector_bitcast< |
1276 | conditional_t<is_signed_v<_Tp>, int, _UInt>>( |
1277 | _mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH |
1278 | const auto __hi |
1279 | = 0x100000000LL |
1280 | * __convert_x86<__vector_type_t<float, 8>>(__hi32); |
1281 | const auto __mid |
1282 | = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16)); |
1283 | const auto __lo |
= _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
1285 | return __xzyw((__hi + __mid) + __lo); |
1286 | } |
1287 | else if constexpr (__z_to_z && __have_avx512dq) |
1288 | { |
1289 | return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0), |
1290 | _mm512_cvtepi64_ps(__i1)) |
1291 | : __concat(_mm512_cvtepu64_ps(__i0), |
1292 | _mm512_cvtepu64_ps(__i1)); |
1293 | } |
1294 | else if constexpr (__z_to_z && is_signed_v<_Tp>) |
1295 | { |
1296 | const __m512 __hi32 = _mm512_cvtepi32_ps( |
1297 | __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)), |
1298 | _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32)))); |
1299 | const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0), |
1300 | _mm512_cvtepi64_epi32(__i1)); |
1301 | // split low 32-bits, because if __hi32 is a small negative |
1302 | // number, the 24-bit mantissa may lose important information if |
1303 | // any of the high 8 bits of __lo32 is set, leading to |
// catastrophic cancellation in the FMA
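// Illustrative example (not from the original sources): for __v == -1
// the high 32 bits convert to -1.0f, so __hi32 * 0x100000000LL + __hi16
// gives -4294967296.0f + 4294901760.0f == -65536.0f, and adding
// __lo16 == 65535.0f yields the exact result -1.0f. Converting all of
// __lo32 at once would round 4294967295 up to 2^32 and the sum would
// collapse to 0.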
1305 | const __m512 __hi16 |
= _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
1307 | const __m512 __lo16 |
= _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
1309 | return (__hi32 * 0x100000000LL + __hi16) + __lo16; |
1310 | } |
1311 | else if constexpr (__z_to_z && is_unsigned_v<_Tp>) |
1312 | { |
1313 | return __intrin_bitcast<_To>( |
1314 | _mm512_cvtepu32_ps(__concat( |
1315 | _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)), |
1316 | _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32)))) |
1317 | * 0x100000000LL |
1318 | + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0), |
1319 | _mm512_cvtepi64_epi32(__i1)))); |
1320 | } |
1321 | } |
1322 | else if constexpr (__f64_to_s32) //{{{2 |
1323 | { |
1324 | // use concat fallback |
1325 | } |
1326 | else if constexpr (__f64_to_u32) //{{{2 |
1327 | { |
1328 | if constexpr (__x_to_x && __have_sse4_1) |
1329 | { |
1330 | return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64( |
_mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
_mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
1333 | ^ 0x8000'0000u; |
1334 | // without SSE4.1 just use the scalar fallback, it's only four |
1335 | // values |
1336 | } |
1337 | else if constexpr (__y_to_y) |
1338 | { |
1339 | return __vector_bitcast<_Up>( |
__concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
- 0x8000'0000u),
_mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
- 0x8000'0000u)))
1344 | ^ 0x8000'0000u; |
1345 | } // __z_to_z uses fallback |
1346 | } |
1347 | else if constexpr (__f64_to_ibw) //{{{2 |
1348 | { |
// one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback
// would go via two independent conversions to _SimdWrapper<_To> and
// subsequent interleaving. This is better, because f64->__i32 allows
// combining __v0 and __v1 into one register:
// if constexpr (__z_to_x || __y_to_x) {
1354 | return __convert_x86<_To>( |
1355 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1)); |
1356 | //} |
1357 | } |
1358 | else if constexpr (__f32_to_ibw) //{{{2 |
1359 | { |
1360 | return __convert_x86<_To>( |
1361 | __convert_x86<__vector_type_t<int, _Np>>(__v0), |
1362 | __convert_x86<__vector_type_t<int, _Np>>(__v1)); |
1363 | } //}}} |
1364 | |
1365 | // fallback: {{{2 |
1366 | if constexpr (sizeof(_To) >= 32) |
1367 | // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
1368 | return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0), |
1369 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1)); |
1370 | else if constexpr (sizeof(_To) == 16) |
1371 | { |
1372 | const auto __lo = __to_intrin(__convert_x86<_To>(__v0)); |
1373 | const auto __hi = __to_intrin(__convert_x86<_To>(__v1)); |
1374 | if constexpr (sizeof(_Up) * _Np == 8) |
1375 | { |
1376 | if constexpr (is_floating_point_v<_Up>) |
1377 | return __auto_bitcast( |
1378 | _mm_unpacklo_pd(__vector_bitcast<double>(__lo), |
1379 | __vector_bitcast<double>(__hi))); |
1380 | else |
1381 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
1382 | } |
1383 | else if constexpr (sizeof(_Up) * _Np == 4) |
1384 | { |
1385 | if constexpr (is_floating_point_v<_Up>) |
1386 | return __auto_bitcast( |
1387 | _mm_unpacklo_ps(__vector_bitcast<float>(__lo), |
1388 | __vector_bitcast<float>(__hi))); |
1389 | else |
1390 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); |
1391 | } |
1392 | else if constexpr (sizeof(_Up) * _Np == 2) |
1393 | return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi)); |
1394 | else |
1395 | __assert_unreachable<_Tp>(); |
1396 | } |
1397 | else |
1398 | return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>()); |
1399 | //}}} |
1400 | } |
1401 | } |
1402 | |
1403 | //}}}1 |
1404 | // 4-arg __convert_x86 {{{1 |
1405 | template <typename _To, typename _V, typename _Traits> |
1406 | _GLIBCXX_SIMD_INTRINSIC _To |
1407 | __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3) |
1408 | { |
1409 | static_assert(__is_vector_type_v<_V>); |
1410 | using _Tp = typename _Traits::value_type; |
1411 | constexpr size_t _Np = _Traits::_S_full_size; |
1412 | [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
1413 | [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
1414 | [[maybe_unused]] const auto __i2 = __to_intrin(__v2); |
1415 | [[maybe_unused]] const auto __i3 = __to_intrin(__v3); |
1416 | using _Up = typename _VectorTraits<_To>::value_type; |
1417 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
1418 | |
1419 | static_assert(4 * _Np <= _M, |
1420 | "__v2/__v3 would be discarded; use the two/one-argument " |
1421 | "__convert_x86 overload instead" ); |
1422 | |
1423 | // [xyz]_to_[xyz] {{{2 |
1424 | [[maybe_unused]] constexpr bool __x_to_x |
1425 | = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
1426 | [[maybe_unused]] constexpr bool __x_to_y |
1427 | = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
1428 | [[maybe_unused]] constexpr bool __x_to_z |
1429 | = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
1430 | [[maybe_unused]] constexpr bool __y_to_x |
1431 | = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
1432 | [[maybe_unused]] constexpr bool __y_to_y |
1433 | = sizeof(__v0) == 32 && sizeof(_To) == 32; |
1434 | [[maybe_unused]] constexpr bool __y_to_z |
1435 | = sizeof(__v0) == 32 && sizeof(_To) == 64; |
1436 | [[maybe_unused]] constexpr bool __z_to_x |
1437 | = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
1438 | [[maybe_unused]] constexpr bool __z_to_y |
1439 | = sizeof(__v0) == 64 && sizeof(_To) == 32; |
1440 | [[maybe_unused]] constexpr bool __z_to_z |
1441 | = sizeof(__v0) == 64 && sizeof(_To) == 64; |
1442 | |
1443 | // iX_to_iX {{{2 |
1444 | [[maybe_unused]] constexpr bool __i_to_i |
1445 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
1446 | [[maybe_unused]] constexpr bool __i8_to_i16 |
1447 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
1448 | [[maybe_unused]] constexpr bool __i8_to_i32 |
1449 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
1450 | [[maybe_unused]] constexpr bool __i8_to_i64 |
1451 | = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
1452 | [[maybe_unused]] constexpr bool __i16_to_i8 |
1453 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
1454 | [[maybe_unused]] constexpr bool __i16_to_i32 |
1455 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
1456 | [[maybe_unused]] constexpr bool __i16_to_i64 |
1457 | = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
1458 | [[maybe_unused]] constexpr bool __i32_to_i8 |
1459 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
1460 | [[maybe_unused]] constexpr bool __i32_to_i16 |
1461 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
1462 | [[maybe_unused]] constexpr bool __i32_to_i64 |
1463 | = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
1464 | [[maybe_unused]] constexpr bool __i64_to_i8 |
1465 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
1466 | [[maybe_unused]] constexpr bool __i64_to_i16 |
1467 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
1468 | [[maybe_unused]] constexpr bool __i64_to_i32 |
1469 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
1470 | |
1471 | // [fsu]X_to_[fsu]X {{{2 |
1472 | // ibw = integral && byte or word, i.e. char and short with any signedness |
1473 | [[maybe_unused]] constexpr bool __i64_to_f32 |
1474 | = is_integral_v<_Tp> && sizeof(_Tp) == 8 |
1475 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1476 | [[maybe_unused]] constexpr bool __s32_to_f32 |
1477 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
1478 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1479 | [[maybe_unused]] constexpr bool __s16_to_f32 |
1480 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
1481 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1482 | [[maybe_unused]] constexpr bool __s8_to_f32 |
1483 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
1484 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1485 | [[maybe_unused]] constexpr bool __u32_to_f32 |
1486 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
1487 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1488 | [[maybe_unused]] constexpr bool __u16_to_f32 |
1489 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
1490 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1491 | [[maybe_unused]] constexpr bool __u8_to_f32 |
1492 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
1493 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1494 | [[maybe_unused]] constexpr bool __s64_to_f64 |
1495 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
1496 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1497 | [[maybe_unused]] constexpr bool __s32_to_f64 |
1498 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
1499 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1500 | [[maybe_unused]] constexpr bool __s16_to_f64 |
1501 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
1502 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1503 | [[maybe_unused]] constexpr bool __s8_to_f64 |
1504 | = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
1505 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1506 | [[maybe_unused]] constexpr bool __u64_to_f64 |
1507 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
1508 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1509 | [[maybe_unused]] constexpr bool __u32_to_f64 |
1510 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
1511 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1512 | [[maybe_unused]] constexpr bool __u16_to_f64 |
1513 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
1514 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1515 | [[maybe_unused]] constexpr bool __u8_to_f64 |
1516 | = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
1517 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1518 | [[maybe_unused]] constexpr bool __f32_to_s64 |
1519 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
1520 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1521 | [[maybe_unused]] constexpr bool __f32_to_s32 |
1522 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
1523 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1524 | [[maybe_unused]] constexpr bool __f32_to_u64 |
1525 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
1526 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1527 | [[maybe_unused]] constexpr bool __f32_to_u32 |
1528 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
1529 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1530 | [[maybe_unused]] constexpr bool __f64_to_s64 |
1531 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
1532 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1533 | [[maybe_unused]] constexpr bool __f64_to_s32 |
1534 | = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
1535 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1536 | [[maybe_unused]] constexpr bool __f64_to_u64 |
1537 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
1538 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1539 | [[maybe_unused]] constexpr bool __f64_to_u32 |
1540 | = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
1541 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1542 | [[maybe_unused]] constexpr bool __f32_to_ibw |
1543 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1544 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
1545 | [[maybe_unused]] constexpr bool __f64_to_ibw |
1546 | = is_integral_v<_Up> && sizeof(_Up) <= 2 |
1547 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1548 | [[maybe_unused]] constexpr bool __f32_to_f64 |
1549 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
1550 | && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
1551 | [[maybe_unused]] constexpr bool __f64_to_f32 |
1552 | = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
1553 | && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
1554 | |
1555 | if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
1556 | { |
1557 | // <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16> |
1558 | return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), |
1559 | __hi128(__v1), __lo128(__v2), __hi128(__v2), |
1560 | __lo128(__v3), __hi128(__v3)); |
1561 | } |
1562 | else if constexpr (__i_to_i) // assert ISA {{{2 |
1563 | { |
1564 | static_assert(__x_to_x || __have_avx2, |
1565 | "integral conversions with ymm registers require AVX2" ); |
1566 | static_assert(__have_avx512bw |
1567 | || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
1568 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
1569 | "8/16-bit integers in zmm registers require AVX512BW" ); |
static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
"integral conversions with zmm registers require AVX512F");
1572 | } |
1573 | // concat => use 2-arg __convert_x86 {{{2 |
1574 | if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
1575 | || (sizeof(__v0) == 16 && __have_avx |
1576 | && is_floating_point_v<_Tp>) |
1577 | || (sizeof(__v0) == 32 && __have_avx512f)) |
1578 | { |
1579 | // The ISA can handle wider input registers, so concat and use two-arg |
1580 | // implementation. This reduces code duplication considerably. |
1581 | return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3)); |
1582 | } |
1583 | else //{{{2 |
1584 | { |
1585 | // conversion using bit reinterpretation (or no conversion at all) |
1586 | // should all go through the concat branch above: |
1587 | static_assert( |
1588 | !(is_floating_point_v< |
1589 | _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); |
1590 | // handle all zero extension{{{2 |
1591 | if constexpr (4 * _Np < _M && sizeof(_To) > 16) |
1592 | { |
1593 | constexpr size_t Min = 16 / sizeof(_Up); |
1594 | return __zero_extend( |
1595 | __convert_x86< |
1596 | __vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>( |
1597 | __v0, __v1, __v2, __v3)); |
1598 | } |
1599 | else if constexpr (__i64_to_i16) //{{{2 |
1600 | { |
1601 | if constexpr (__x_to_x && __have_sse4_1) |
1602 | { |
1603 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1604 | _mm_blend_epi16( |
1605 | _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22), |
1606 | _mm_blend_epi16(_mm_slli_si128(__i2, 4), |
1607 | _mm_slli_si128(__i3, 6), 0x88), |
1608 | 0xcc), |
_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
14, 15)));
1611 | } |
1612 | else if constexpr (__y_to_y && __have_avx2) |
1613 | { |
1614 | return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
__xzyw(_mm256_blend_epi16(
1616 | __auto_bitcast( |
1617 | _mm256_shuffle_ps(__vector_bitcast<float>(__v0), |
1618 | __vector_bitcast<float>(__v2), |
1619 | 0x88)), // 0.1. 8.9. 2.3. A.B. |
1620 | __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps( |
1621 | __vector_bitcast<float>(__v1), |
1622 | __vector_bitcast<float>(__v3), 0x88)) |
1623 | << 16), // .4.5 .C.D .6.7 .E.F |
1624 | 0xaa) // 0415 8C9D 2637 AEBF |
1625 | ), // 0415 2637 8C9D AEBF |
_mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
10, 11, 14, 15)));
1629 | /* |
auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26.. ....
auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15.. .... 37.. ....
auto __c = _mm256_unpacklo_epi16(__v2, __v3); // 8C.. .... AE.. ....
auto __d = _mm256_unpackhi_epi16(__v2, __v3); // 9D.. .... BF.. ....
auto __e = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
auto __f = _mm256_unpacklo_epi16(__c, __d); // 89CD .... ABEF ....
auto __g = _mm256_unpacklo_epi64(__e, __f); // 0145 89CD 2367 ABEF
return __concat(
_mm_unpacklo_epi32(__lo128(__g), __hi128(__g)),
_mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123 4567 89AB CDEF
1645 | */ |
1646 | } // else use fallback |
1647 | } |
1648 | else if constexpr (__i64_to_i8) //{{{2 |
1649 | { |
1650 | if constexpr (__x_to_x) |
1651 | { |
1652 | // TODO: use fallback for now |
1653 | } |
1654 | else if constexpr (__y_to_x) |
1655 | { |
1656 | auto __a |
1657 | = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24) |
1658 | | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16) |
1659 | | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8) |
1660 | | _mm256_slli_epi32( |
1661 | __i3, 24); // 048C .... 159D .... 26AE .... 37BF .... |
1662 | /*return _mm_shuffle_epi8( |
1663 | _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5), |
1664 | _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15, |
1665 | 3, 11));*/ |
1666 | auto __b = _mm256_unpackhi_epi64( |
1667 | __a, __a); // 159D .... 159D .... 37BF .... 37BF .... |
1668 | auto __c = _mm256_unpacklo_epi8( |
1669 | __a, __b); // 0145 89CD .... .... 2367 ABEF .... .... |
1670 | return __intrin_bitcast<_To>( |
1671 | _mm_unpacklo_epi16(__lo128(__c), |
1672 | __hi128(__c))); // 0123 4567 89AB CDEF |
1673 | } |
1674 | } |
1675 | else if constexpr (__i32_to_i8) //{{{2 |
1676 | { |
1677 | if constexpr (__x_to_x) |
1678 | { |
1679 | if constexpr (__have_ssse3) |
1680 | { |
1681 | const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff; |
1682 | const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff) |
1683 | << 8; |
1684 | const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff) |
1685 | << 16; |
1686 | const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24; |
1687 | return __intrin_bitcast<_To>( |
1688 | _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3), |
_mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11,
15)));
1692 | } |
1693 | else |
1694 | { |
1695 | auto __a |
1696 | = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. .... |
1697 | auto __b |
1698 | = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. .... |
1699 | auto __c |
1700 | = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. .... |
1701 | auto __d |
1702 | = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. .... |
1703 | auto __e |
1704 | = _mm_unpacklo_epi8(__a, __c); // 048C .... .... .... |
1705 | auto __f |
1706 | = _mm_unpackhi_epi8(__a, __c); // 159D .... .... .... |
1707 | auto __g |
1708 | = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... .... |
1709 | auto __h |
1710 | = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... .... |
1711 | return __intrin_bitcast<_To>(_mm_unpacklo_epi8( |
1712 | _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... .... |
1713 | _mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... .... |
1714 | )); // 0123 4567 89AB CDEF |
1715 | } |
1716 | } |
1717 | else if constexpr (__y_to_y) |
1718 | { |
1719 | const auto __a = _mm256_shuffle_epi8( |
__to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
1721 | __i0, _mm256_slli_epi32(__i1, 16), 0xAA)) |
1722 | & 0xff) |
1723 | | (__vector_bitcast<_UShort>(_mm256_blend_epi16( |
1724 | __i2, _mm256_slli_epi32(__i3, 16), 0xAA)) |
1725 | << 8)), |
_mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
13, 3, 7, 11, 15));
1729 | return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32( |
__a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
1731 | } |
1732 | } |
1733 | else if constexpr (__i64_to_f32) //{{{2 |
1734 | { |
1735 | // this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm |
1736 | // integers) |
1737 | if constexpr (__x_to_y) |
1738 | { |
1739 | return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1], |
1740 | __v2[0], __v2[1], __v3[0], |
1741 | __v3[1]); |
1742 | |
1743 | const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC |
1744 | const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD |
1745 | const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG |
1746 | const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH |
1747 | const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd |
1748 | const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh |
1749 | const auto __hi32 = __vector_bitcast< |
1750 | conditional_t<is_signed_v<_Tp>, int, _UInt>>( |
1751 | __concat(_mm_unpackhi_epi32(__a, __b), |
1752 | _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH |
1753 | const auto __hi |
1754 | = 0x100000000LL |
1755 | * __convert_x86<__vector_type_t<float, 8>>(__hi32); |
1756 | const auto __mid |
1757 | = 0x10000 |
1758 | * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16), |
1759 | _mm_srli_epi32(__lo32b, 16))); |
1760 | const auto __lo = _mm256_cvtepi32_ps( |
__concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
_mm_set1_epi32(0x0000ffffu) & __lo32b));
1763 | return (__hi + __mid) + __lo; |
1764 | } |
1765 | } |
1766 | else if constexpr (__f64_to_ibw) //{{{2 |
1767 | { |
1768 | return __convert_x86<_To>( |
1769 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), |
1770 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3)); |
1771 | } |
1772 | else if constexpr (__f32_to_ibw) //{{{2 |
1773 | { |
1774 | return __convert_x86<_To>( |
1775 | __convert_x86<__vector_type_t<int, _Np>>(__v0), |
1776 | __convert_x86<__vector_type_t<int, _Np>>(__v1), |
1777 | __convert_x86<__vector_type_t<int, _Np>>(__v2), |
1778 | __convert_x86<__vector_type_t<int, _Np>>(__v3)); |
1779 | } //}}} |
1780 | |
1781 | // fallback: {{{2 |
1782 | if constexpr (sizeof(_To) >= 32) |
1783 | // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
1784 | return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, |
1785 | __v1), |
1786 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2, |
1787 | __v3)); |
1788 | else if constexpr (sizeof(_To) == 16) |
1789 | { |
1790 | const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1)); |
1791 | const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3)); |
1792 | if constexpr (sizeof(_Up) * _Np * 2 == 8) |
1793 | { |
1794 | if constexpr (is_floating_point_v<_Up>) |
1795 | return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi)); |
1796 | else |
1797 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
1798 | } |
1799 | else if constexpr (sizeof(_Up) * _Np * 2 == 4) |
1800 | { |
1801 | if constexpr (is_floating_point_v<_Up>) |
1802 | return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi)); |
1803 | else |
1804 | return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); |
1805 | } |
1806 | else |
1807 | __assert_unreachable<_Tp>(); |
1808 | } |
1809 | else |
1810 | return __vector_convert<_To>(__v0, __v1, __v2, __v3, |
1811 | make_index_sequence<_Np>()); |
1812 | //}}}2 |
1813 | } |
1814 | } |
1815 | |
1816 | //}}} |
1817 | // 8-arg __convert_x86 {{{1 |
1818 | template <typename _To, typename _V, typename _Traits> |
1819 | _GLIBCXX_SIMD_INTRINSIC _To |
1820 | __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, |
1821 | _V __v7) |
1822 | { |
1823 | static_assert(__is_vector_type_v<_V>); |
1824 | using _Tp = typename _Traits::value_type; |
1825 | constexpr size_t _Np = _Traits::_S_full_size; |
1826 | [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
1827 | [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
1828 | [[maybe_unused]] const auto __i2 = __to_intrin(__v2); |
1829 | [[maybe_unused]] const auto __i3 = __to_intrin(__v3); |
1830 | [[maybe_unused]] const auto __i4 = __to_intrin(__v4); |
1831 | [[maybe_unused]] const auto __i5 = __to_intrin(__v5); |
1832 | [[maybe_unused]] const auto __i6 = __to_intrin(__v6); |
1833 | [[maybe_unused]] const auto __i7 = __to_intrin(__v7); |
1834 | using _Up = typename _VectorTraits<_To>::value_type; |
1835 | constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
1836 | |
1837 | static_assert(8 * _Np <= _M, |
1838 | "__v4-__v7 would be discarded; use the four/two/one-argument " |
1839 | "__convert_x86 overload instead" ); |
1840 | |
1841 | // [xyz]_to_[xyz] {{{2 |
1842 | [[maybe_unused]] constexpr bool __x_to_x |
1843 | = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
1844 | [[maybe_unused]] constexpr bool __x_to_y |
1845 | = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
1846 | [[maybe_unused]] constexpr bool __x_to_z |
1847 | = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
1848 | [[maybe_unused]] constexpr bool __y_to_x |
1849 | = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
1850 | [[maybe_unused]] constexpr bool __y_to_y |
1851 | = sizeof(__v0) == 32 && sizeof(_To) == 32; |
1852 | [[maybe_unused]] constexpr bool __y_to_z |
1853 | = sizeof(__v0) == 32 && sizeof(_To) == 64; |
1854 | [[maybe_unused]] constexpr bool __z_to_x |
1855 | = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
1856 | [[maybe_unused]] constexpr bool __z_to_y |
1857 | = sizeof(__v0) == 64 && sizeof(_To) == 32; |
1858 | [[maybe_unused]] constexpr bool __z_to_z |
1859 | = sizeof(__v0) == 64 && sizeof(_To) == 64; |
1860 | |
1861 | // [if]X_to_i8 {{{2 |
1862 | [[maybe_unused]] constexpr bool __i_to_i |
1863 | = is_integral_v<_Up> && is_integral_v<_Tp>; |
1864 | [[maybe_unused]] constexpr bool __i64_to_i8 |
1865 | = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
1866 | [[maybe_unused]] constexpr bool __f64_to_i8 |
1867 | = is_integral_v<_Up> && sizeof(_Up) == 1 |
1868 | && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
1869 | |
1870 | if constexpr (__i_to_i) // assert ISA {{{2 |
1871 | { |
1872 | static_assert(__x_to_x || __have_avx2, |
1873 | "integral conversions with ymm registers require AVX2" ); |
1874 | static_assert(__have_avx512bw |
1875 | || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
1876 | && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
1877 | "8/16-bit integers in zmm registers require AVX512BW" ); |
static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
"integral conversions with zmm registers require AVX512F");
1880 | } |
1881 | // concat => use 4-arg __convert_x86 {{{2 |
1882 | if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
1883 | || (sizeof(__v0) == 16 && __have_avx |
1884 | && is_floating_point_v<_Tp>) |
1885 | || (sizeof(__v0) == 32 && __have_avx512f)) |
1886 | { |
1887 | // The ISA can handle wider input registers, so concat and use two-arg |
1888 | // implementation. This reduces code duplication considerably. |
1889 | return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), |
1890 | __concat(__v4, __v5), __concat(__v6, __v7)); |
1891 | } |
1892 | else //{{{2 |
1893 | { |
1894 | // conversion using bit reinterpretation (or no conversion at all) |
1895 | // should all go through the concat branch above: |
1896 | static_assert( |
1897 | !(is_floating_point_v< |
1898 | _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); |
1899 | static_assert(!(8 * _Np < _M && sizeof(_To) > 16), |
1900 | "zero extension should be impossible" ); |
1901 | if constexpr (__i64_to_i8) //{{{2 |
1902 | { |
1903 | if constexpr (__x_to_x && __have_ssse3) |
1904 | { |
1905 | // unsure whether this is better than the variant below |
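// (Roughly: the low byte of every input is OR-ed into a distinct byte
// of each 64-bit element, then pshufb interleaves the two 64-bit
// elements into the final 0..15 output order.)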
1906 | return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
1907 | __to_intrin( |
1908 | (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) |
1909 | | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) |
1910 | | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) |
1911 | | (((__v6 & 0xff) << 48) | (__v7 << 56)))), |
_mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
7, 15)));
1914 | } |
1915 | else if constexpr (__x_to_x) |
1916 | { |
1917 | const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac |
1918 | const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd |
1919 | const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg |
1920 | const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh |
1921 | const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik |
1922 | const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl |
1923 | const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo |
1924 | const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np |
1925 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64( |
1926 | _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd |
1927 | _mm_unpacklo_epi8(__c, __d)), // efgh |
1928 | _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl |
1929 | _mm_unpacklo_epi8(__g, __h)) // mnop |
1930 | )); |
1931 | } |
1932 | else if constexpr (__y_to_y) |
1933 | { |
1934 | auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV |
1935 | __to_intrin( |
1936 | (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) |
1937 | | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) |
1938 | | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) |
1939 | | (((__v6 & 0xff) << 48) | ((__v7 << 56))))); |
1940 | /* |
auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D HLPT 37BF JNRV 37BF JNRV
auto __c = _mm256_unpacklo_epi8(__a, __b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV
auto __d = __xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV
return _mm256_shuffle_epi8(
__d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12,
13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
14, 15));
1949 | */ |
1950 | auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF |
1951 | // IJMN QRUV |
__a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
4, 12, 5, 13, 6, 14, 7, 15));
1955 | auto __c |
1956 | = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV |
1957 | return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
__c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15)));
1961 | } |
1962 | else if constexpr (__z_to_z) |
1963 | { |
1964 | return __concat( |
1965 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, |
1966 | __v3), |
1967 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, |
1968 | __v7)); |
1969 | } |
1970 | } |
1971 | else if constexpr (__f64_to_i8) //{{{2 |
1972 | { |
1973 | return __convert_x86<_To>( |
1974 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), |
1975 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3), |
1976 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5), |
1977 | __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7)); |
1978 | } |
1979 | else // unreachable {{{2 |
1980 | __assert_unreachable<_Tp>(); |
1981 | //}}} |
1982 | |
1983 | // fallback: {{{2 |
1984 | if constexpr (sizeof(_To) >= 32) |
1985 | // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
1986 | return __concat( |
1987 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3), |
1988 | __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, |
1989 | __v7)); |
1990 | else if constexpr (sizeof(_To) == 16) |
1991 | { |
1992 | const auto __lo |
1993 | = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3)); |
1994 | const auto __hi |
1995 | = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7)); |
1996 | static_assert(sizeof(_Up) == 1 && _Np == 2); |
1997 | return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
1998 | } |
1999 | else |
2000 | { |
2001 | __assert_unreachable<_Tp>(); |
2002 | // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5, |
2003 | // __v6, __v7, |
2004 | // make_index_sequence<_Np>()); |
2005 | } //}}}2 |
2006 | } |
2007 | } |
2008 | |
2009 | //}}} |
2010 | // 16-arg __convert_x86 {{{1 |
2011 | template <typename _To, typename _V, typename _Traits> |
2012 | _GLIBCXX_SIMD_INTRINSIC _To |
2013 | __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, |
2014 | _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12, |
2015 | _V __v13, _V __v14, _V __v15) |
2016 | { |
2017 | // concat => use 8-arg __convert_x86 |
2018 | return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), |
2019 | __concat(__v4, __v5), __concat(__v6, __v7), |
2020 | __concat(__v8, __v9), __concat(__v10, __v11), |
2021 | __concat(__v12, __v13), __concat(__v14, __v15)); |
2022 | } |
2023 | |
2024 | //}}} |
2025 | |
2026 | #endif // __cplusplus >= 201703L |
2027 | #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
2028 | |
2029 | // vim: foldmethod=marker |
2030 | |