// crate minimums: sse2, x86_64

use crate::types::*;
use core::arch::x86_64::{__m128i, __m256i};

mod sse2;

#[derive(Copy, Clone)]
pub struct YesS3;
#[derive(Copy, Clone)]
pub struct NoS3;

#[derive(Copy, Clone)]
pub struct YesS4;
#[derive(Copy, Clone)]
pub struct NoS4;

#[derive(Copy, Clone)]
pub struct YesA1;
#[derive(Copy, Clone)]
pub struct NoA1;

#[derive(Copy, Clone)]
pub struct YesA2;
#[derive(Copy, Clone)]
pub struct NoA2;

#[derive(Copy, Clone)]
pub struct YesNI;
#[derive(Copy, Clone)]
pub struct NoNI;

use core::marker::PhantomData;

#[derive(Copy, Clone)]
pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
where
    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
{
    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;

    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;

    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;

    #[inline(always)]
    unsafe fn instance() -> Self {
        SseMachine(PhantomData)
    }
}

#[derive(Copy, Clone)]
pub struct Avx2Machine<NI>(PhantomData<NI>);
impl<NI: Copy> Machine for Avx2Machine<NI>
where
    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
{
    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;

    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;

    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;

    #[inline(always)]
    unsafe fn instance() -> Self {
        Avx2Machine(PhantomData)
    }
}

pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
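///
/// A minimal usage sketch, not a definitive API: the same `SseMachine` code, when monomorphized
/// inside an `avx`-enabled function (as the `dispatch!` macros below arrange), is emitted with
/// VEX encodings. The `kernel` function and its body are hypothetical.
///
/// ```ignore
/// #[target_feature(enable = "avx")]
/// unsafe fn kernel() {
///     let m = AVX::instance();
///     // ... use `m` like any other Machine; the compiler emits VEX forms here ...
/// }
/// ```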
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
pub type AVX2 = Avx2Machine<NoNI>;

zerocopy::cryptocorrosion_derive_traits! {
    #[repr(C)]
    /// Generic wrapper for unparameterized storage of any of the possible impls.
    /// Converting into and out of this type should be essentially free, although it may be more
    /// aligned than a particular impl requires.
    #[allow(non_camel_case_types)]
    #[derive(Copy, Clone)]
    pub union vec128_storage {
        u32x4: [u32; 4],
        u64x2: [u64; 2],
        u128x1: [u128; 1],
        sse2: __m128i,
    }
}

impl Store<vec128_storage> for vec128_storage {
    #[inline(always)]
    unsafe fn unpack(p: vec128_storage) -> Self {
        p
    }
}
impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
    #[inline(always)]
    fn from(x: &'a vec128_storage) -> Self {
        unsafe { &x.u32x4 }
    }
}
impl From<[u32; 4]> for vec128_storage {
    #[inline(always)]
    fn from(u32x4: [u32; 4]) -> Self {
        vec128_storage { u32x4 }
    }
}
impl Default for vec128_storage {
    #[inline(always)]
    fn default() -> Self {
        vec128_storage { u128x1: [0] }
    }
}
impl Eq for vec128_storage {}
impl PartialEq for vec128_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.u128x1 == rhs.u128x1 }
    }
}

#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec256_storage {
    u32x8: [u32; 8],
    u64x4: [u64; 4],
    u128x2: [u128; 2],
    sse2: [vec128_storage; 2],
    avx: __m256i,
}
impl From<[u64; 4]> for vec256_storage {
    #[inline(always)]
    fn from(u64x4: [u64; 4]) -> Self {
        vec256_storage { u64x4 }
    }
}
impl Default for vec256_storage {
    #[inline(always)]
    fn default() -> Self {
        vec256_storage { u128x2: [0, 0] }
    }
}
impl vec256_storage {
    #[inline(always)]
    pub fn new128(xs: [vec128_storage; 2]) -> Self {
        Self { sse2: xs }
    }
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 2] {
        unsafe { self.sse2 }
    }
}
impl Eq for vec256_storage {}
impl PartialEq for vec256_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.sse2 == rhs.sse2 }
    }
}

#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec512_storage {
    u32x16: [u32; 16],
    u64x8: [u64; 8],
    u128x4: [u128; 4],
    sse2: [vec128_storage; 4],
    avx: [vec256_storage; 2],
}
impl Default for vec512_storage {
    #[inline(always)]
    fn default() -> Self {
        vec512_storage {
            u128x4: [0, 0, 0, 0],
        }
    }
}
impl vec512_storage {
    #[inline(always)]
    pub fn new128(xs: [vec128_storage; 4]) -> Self {
        Self { sse2: xs }
    }
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 4] {
        unsafe { self.sse2 }
    }
}
impl Eq for vec512_storage {}
impl PartialEq for vec512_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.avx == rhs.avx }
    }
}

macro_rules! impl_into {
    ($storage:ident, $array:ty, $name:ident) => {
        impl From<$storage> for $array {
            #[inline(always)]
            fn from(vec: $storage) -> Self {
                unsafe { vec.$name }
            }
        }
    };
}
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);
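
// A minimal sketch of the conversions above, written as a test module; the module name and
// test bodies are illustrative additions, not part of the original source.
#[cfg(test)]
mod storage_round_trip {
    use super::*;

    #[test]
    fn vec128_storage_u32x4() {
        // From<[u32; 4]> packs the words; the impl_into! conversion unpacks them again.
        let words = [1u32, 2, 3, 4];
        let storage: vec128_storage = words.into();
        let back: [u32; 4] = storage.into();
        assert_eq!(words, back);
    }

    #[test]
    fn vec256_storage_split_and_rejoin() {
        // new128/split128 reinterpret a 256-bit value as two 128-bit halves.
        let halves = [vec128_storage::default(); 2];
        let joined = vec256_storage::new128(halves);
        let split = joined.split128();
        assert!(split[0] == halves[0] && split[1] == halves[1]);
    }
}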

/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
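///
/// A minimal usage sketch; the names `process_block` and `process_block_impl` below are
/// hypothetical, not part of this crate. The first two identifiers bind the machine value and
/// its type parameter inside the body.
///
/// ```ignore
/// dispatch!(mach, M, {
///     fn process_block(block: &mut [u32; 16]) {
///         // `mach: M` where `M: Machine`; forward to a generic helper.
///         process_block_impl(mach, block)
///     }
/// });
/// ```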
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
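///
/// Invoked with the same syntax as `dispatch!`; a minimal sketch with hypothetical names:
///
/// ```ignore
/// dispatch_light128!(mach, M, {
///     fn xor_block(a: &mut [u32; 4], b: &[u32; 4]) {
///         xor_block_impl(mach, a, b)
///     }
/// });
/// ```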
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
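///
/// Invoked with the same syntax as `dispatch!` and `dispatch_light128!`.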
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}