| 1 | // crate minimums: sse2, x86_64 | 
| 2 |  | 
|---|
| 3 | use crate::types::*; | 
|---|
| 4 | use core::arch::x86_64::{__m128i, __m256i}; | 
|---|
| 5 | use zerocopy::{AsBytes, FromBytes, FromZeroes}; | 
|---|
| 6 |  | 
|---|
| 7 | mod sse2; | 
|---|
| 8 |  | 
|---|
// Zero-sized type-level flags used as parameters of the machine types in this module.
// Each `Yes*`/`No*` pair records whether one instruction-set extension may be used.

/// Flag selected by the `SSSE3` machine alias (and everything above it).
#[derive(Clone, Copy)]
pub struct YesS3;
/// Flag selected by the baseline `SSE2` machine alias (no SSSE3).
#[derive(Clone, Copy)]
pub struct NoS3;

/// Flag selected by the `SSE41` and `AVX` machine aliases.
#[derive(Clone, Copy)]
pub struct YesS4;
/// Flag selected by the `SSE2` and `SSSE3` machine aliases (no SSE4.1).
#[derive(Clone, Copy)]
pub struct NoS4;

// NOTE(review): `YesA1`/`NoA1` and `YesA2`/`NoA2` are not referenced anywhere in this file;
// presumably they stand for AVX and AVX2 -- confirm against the `sse2` submodule.

/// "Enabled" half of the `A1` flag pair.
#[derive(Clone, Copy)]
pub struct YesA1;
/// "Disabled" half of the `A1` flag pair.
#[derive(Clone, Copy)]
pub struct NoA1;

/// "Enabled" half of the `A2` flag pair.
#[derive(Clone, Copy)]
pub struct YesA2;
/// "Disabled" half of the `A2` flag pair.
#[derive(Clone, Copy)]
pub struct NoA2;

// NOTE(review): every alias in this file uses `NoNI`; which extension `NI` names is not
// visible here -- confirm before relying on it.

/// "Enabled" half of the `NI` flag pair.
#[derive(Clone, Copy)]
pub struct YesNI;
/// "Disabled" half of the `NI` flag pair; used by all machine aliases in this file.
#[derive(Clone, Copy)]
pub struct NoNI;
|---|
| 33 |  | 
|---|
| 34 | use core::marker::PhantomData; | 
|---|
| 35 |  | 
|---|
| 36 | #[ derive(Copy, Clone)] | 
|---|
| 37 | pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); | 
|---|
| 38 | impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> | 
|---|
| 39 | where | 
|---|
| 40 | sse2::u128x1_sse2<S3, S4, NI>: Swap64, | 
|---|
| 41 | sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, | 
|---|
| 42 | sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, | 
|---|
| 43 | sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, | 
|---|
| 44 | sse2::u128x1_sse2<S3, S4, NI>: BSwap, | 
|---|
| 45 | sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, | 
|---|
| 46 | sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, | 
|---|
| 47 | sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, | 
|---|
| 48 | sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, | 
|---|
| 49 | sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, | 
|---|
| 50 | { | 
|---|
| 51 | type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; | 
|---|
| 52 | type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; | 
|---|
| 53 | type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; | 
|---|
| 54 |  | 
|---|
| 55 | type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; | 
|---|
| 56 | type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; | 
|---|
| 57 | type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; | 
|---|
| 58 | type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; | 
|---|
| 59 |  | 
|---|
| 60 | type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; | 
|---|
| 61 | type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; | 
|---|
| 62 | type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; | 
|---|
| 63 |  | 
|---|
| 64 | #[ inline(always)] | 
|---|
| 65 | unsafe fn instance() -> Self { | 
|---|
| 66 | SseMachine(PhantomData) | 
|---|
| 67 | } | 
|---|
| 68 | } | 
|---|
| 69 |  | 
|---|
| 70 | #[ derive(Copy, Clone)] | 
|---|
| 71 | pub struct Avx2Machine<NI>(PhantomData<NI>); | 
|---|
| 72 | impl<NI: Copy> Machine for Avx2Machine<NI> | 
|---|
| 73 | where | 
|---|
| 74 | sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, | 
|---|
| 75 | sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, | 
|---|
| 76 | sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, | 
|---|
| 77 | sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, | 
|---|
| 78 | { | 
|---|
| 79 | type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; | 
|---|
| 80 | type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; | 
|---|
| 81 | type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; | 
|---|
| 82 |  | 
|---|
| 83 | type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; | 
|---|
| 84 | type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; | 
|---|
| 85 | type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; | 
|---|
| 86 | type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; | 
|---|
| 87 |  | 
|---|
| 88 | type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; | 
|---|
| 89 | type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; | 
|---|
| 90 | type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; | 
|---|
| 91 |  | 
|---|
| 92 | #[ inline(always)] | 
|---|
| 93 | unsafe fn instance() -> Self { | 
|---|
| 94 | Avx2Machine(PhantomData) | 
|---|
| 95 | } | 
|---|
| 96 | } | 
|---|
| 97 |  | 
|---|
| 98 | pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>; | 
|---|
| 99 | pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>; | 
|---|
| 100 | pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>; | 
|---|
| 101 | /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything | 
|---|
| 102 | /// to avoid expensive SSE/VEX conflicts. | 
|---|
| 103 | pub type AVX = SseMachine<YesS3, YesS4, NoNI>; | 
|---|
| 104 | pub type AVX2 = Avx2Machine<NoNI>; | 
|---|
| 105 |  | 
|---|
| 106 | /// Generic wrapper for unparameterized storage of any of the possible impls. | 
|---|
| 107 | /// Converting into and out of this type should be essentially free, although it may be more | 
|---|
| 108 | /// aligned than a particular impl requires. | 
|---|
| 109 | #[ allow(non_camel_case_types)] | 
|---|
| 110 | #[ derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] | 
|---|
| 111 | #[ repr(C)] | 
|---|
| 112 | pub union vec128_storage { | 
|---|
| 113 | u32x4: [u32; 4], | 
|---|
| 114 | u64x2: [u64; 2], | 
|---|
| 115 | u128x1: [u128; 1], | 
|---|
| 116 | sse2: __m128i, | 
|---|
| 117 | } | 
|---|
| 118 | impl Store<vec128_storage> for vec128_storage { | 
|---|
| 119 | #[ inline(always)] | 
|---|
| 120 | unsafe fn unpack(p: vec128_storage) -> Self { | 
|---|
| 121 | p | 
|---|
| 122 | } | 
|---|
| 123 | } | 
|---|
| 124 | impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { | 
|---|
| 125 | #[ inline(always)] | 
|---|
| 126 | fn from(x: &'a vec128_storage) -> Self { | 
|---|
| 127 | unsafe { &x.u32x4 } | 
|---|
| 128 | } | 
|---|
| 129 | } | 
|---|
| 130 | impl From<[u32; 4]> for vec128_storage { | 
|---|
| 131 | #[ inline(always)] | 
|---|
| 132 | fn from(u32x4: [u32; 4]) -> Self { | 
|---|
| 133 | vec128_storage { u32x4 } | 
|---|
| 134 | } | 
|---|
| 135 | } | 
|---|
| 136 | impl Default for vec128_storage { | 
|---|
| 137 | #[ inline(always)] | 
|---|
| 138 | fn default() -> Self { | 
|---|
| 139 | vec128_storage { u128x1: [0] } | 
|---|
| 140 | } | 
|---|
| 141 | } | 
|---|
| 142 | impl Eq for vec128_storage {} | 
|---|
| 143 | impl PartialEq for vec128_storage { | 
|---|
| 144 | #[ inline(always)] | 
|---|
| 145 | fn eq(&self, rhs: &Self) -> bool { | 
|---|
| 146 | unsafe { self.u128x1 == rhs.u128x1 } | 
|---|
| 147 | } | 
|---|
| 148 | } | 
|---|
| 149 |  | 
|---|
| 150 | #[ allow(non_camel_case_types)] | 
|---|
| 151 | #[ derive(Copy, Clone)] | 
|---|
| 152 | pub union vec256_storage { | 
|---|
| 153 | u32x8: [u32; 8], | 
|---|
| 154 | u64x4: [u64; 4], | 
|---|
| 155 | u128x2: [u128; 2], | 
|---|
| 156 | sse2: [vec128_storage; 2], | 
|---|
| 157 | avx: __m256i, | 
|---|
| 158 | } | 
|---|
| 159 | impl From<[u64; 4]> for vec256_storage { | 
|---|
| 160 | #[ inline(always)] | 
|---|
| 161 | fn from(u64x4: [u64; 4]) -> Self { | 
|---|
| 162 | vec256_storage { u64x4 } | 
|---|
| 163 | } | 
|---|
| 164 | } | 
|---|
| 165 | impl Default for vec256_storage { | 
|---|
| 166 | #[ inline(always)] | 
|---|
| 167 | fn default() -> Self { | 
|---|
| 168 | vec256_storage { u128x2: [0, 0] } | 
|---|
| 169 | } | 
|---|
| 170 | } | 
|---|
| 171 | impl vec256_storage { | 
|---|
| 172 | #[ inline(always)] | 
|---|
| 173 | pub fn new128(xs: [vec128_storage; 2]) -> Self { | 
|---|
| 174 | Self { sse2: xs } | 
|---|
| 175 | } | 
|---|
| 176 | #[ inline(always)] | 
|---|
| 177 | pub fn split128(self) -> [vec128_storage; 2] { | 
|---|
| 178 | unsafe { self.sse2 } | 
|---|
| 179 | } | 
|---|
| 180 | } | 
|---|
| 181 | impl Eq for vec256_storage {} | 
|---|
| 182 | impl PartialEq for vec256_storage { | 
|---|
| 183 | #[ inline(always)] | 
|---|
| 184 | fn eq(&self, rhs: &Self) -> bool { | 
|---|
| 185 | unsafe { self.sse2 == rhs.sse2 } | 
|---|
| 186 | } | 
|---|
| 187 | } | 
|---|
| 188 |  | 
|---|
| 189 | #[ allow(non_camel_case_types)] | 
|---|
| 190 | #[ derive(Copy, Clone)] | 
|---|
| 191 | pub union vec512_storage { | 
|---|
| 192 | u32x16: [u32; 16], | 
|---|
| 193 | u64x8: [u64; 8], | 
|---|
| 194 | u128x4: [u128; 4], | 
|---|
| 195 | sse2: [vec128_storage; 4], | 
|---|
| 196 | avx: [vec256_storage; 2], | 
|---|
| 197 | } | 
|---|
| 198 | impl Default for vec512_storage { | 
|---|
| 199 | #[ inline(always)] | 
|---|
| 200 | fn default() -> Self { | 
|---|
| 201 | vec512_storage { | 
|---|
| 202 | u128x4: [0, 0, 0, 0], | 
|---|
| 203 | } | 
|---|
| 204 | } | 
|---|
| 205 | } | 
|---|
| 206 | impl vec512_storage { | 
|---|
| 207 | #[ inline(always)] | 
|---|
| 208 | pub fn new128(xs: [vec128_storage; 4]) -> Self { | 
|---|
| 209 | Self { sse2: xs } | 
|---|
| 210 | } | 
|---|
| 211 | #[ inline(always)] | 
|---|
| 212 | pub fn split128(self) -> [vec128_storage; 4] { | 
|---|
| 213 | unsafe { self.sse2 } | 
|---|
| 214 | } | 
|---|
| 215 | } | 
|---|
| 216 | impl Eq for vec512_storage {} | 
|---|
| 217 | impl PartialEq for vec512_storage { | 
|---|
| 218 | #[ inline(always)] | 
|---|
| 219 | fn eq(&self, rhs: &Self) -> bool { | 
|---|
| 220 | unsafe { self.avx == rhs.avx } | 
|---|
| 221 | } | 
|---|
| 222 | } | 
|---|
| 223 |  | 
|---|
| 224 | macro_rules! impl_into { | 
|---|
| 225 | ($storage:ident, $array:ty, $name:ident) => { | 
|---|
| 226 | impl From<$storage> for $array { | 
|---|
| 227 | #[inline(always)] | 
|---|
| 228 | fn from(vec: $storage) -> Self { | 
|---|
| 229 | unsafe { vec.$name } | 
|---|
| 230 | } | 
|---|
| 231 | } | 
|---|
| 232 | }; | 
|---|
| 233 | } | 
|---|
| 234 | impl_into!(vec128_storage, [u32; 4], u32x4); | 
|---|
| 235 | impl_into!(vec128_storage, [u64; 2], u64x2); | 
|---|
| 236 | impl_into!(vec128_storage, [u128; 1], u128x1); | 
|---|
| 237 | impl_into!(vec256_storage, [u32; 8], u32x8); | 
|---|
| 238 | impl_into!(vec256_storage, [u64; 4], u64x4); | 
|---|
| 239 | impl_into!(vec256_storage, [u128; 2], u128x2); | 
|---|
| 240 | impl_into!(vec512_storage, [u32; 16], u32x16); | 
|---|
| 241 | impl_into!(vec512_storage, [u64; 8], u64x8); | 
|---|
| 242 | impl_into!(vec512_storage, [u128; 4], u128x4); | 
|---|
| 243 |  | 
|---|
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// Under `cfg(feature = "std")` the generated function chooses an implementation at run time
/// with `is_x86_feature_detected!`, trying AVX2, AVX, SSE4.1, SSSE3 and SSE2 in that order.
/// Otherwise the choice is made at compile time from `cfg!(target_feature = ...)`.
///
/// The optional visibility is written in brackets, e.g. `[pub]` or `[pub(crate)]`, before `fn`.
#[macro_export]
macro_rules! dispatch {
    // Function with an explicit return type.
    (
        $mach:ident, $MTy:ident,
        {
            $([$pub:tt$(($krate:tt))*])*
            fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block
        }
    ) => {
        // Run-time dispatch.
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user's body, generic over the machine; inlined into each wrapper below so
            // it is compiled once per enabled feature set.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;

            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let result = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                _mm256_zeroupper();
                result
            }

            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let result = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                result
            }

            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }

            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }

            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }

            // Each wrapper is called only after its feature has been detected.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }

        // Compile-time dispatch.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body

            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    (
        $mach:ident, $MTy:ident,
        {
            $([$pub:tt $(($krate:tt))*])*
            fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block
        }
    ) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
|---|
| 324 |  | 
|---|
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// Under `cfg(feature = "std")` the generated function chooses between AVX and SSE2 at run time
/// with `is_x86_feature_detected!`. Otherwise the choice is made at compile time from
/// `cfg!(target_feature = ...)`.
#[macro_export]
macro_rules! dispatch_light128 {
    // Function with an explicit return type.
    (
        $mach:ident, $MTy:ident,
        {
            $([$pub:tt$(($krate:tt))*])*
            fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block
        }
    ) => {
        // Run-time dispatch: only two instantiations of the body are generated.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;

            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }

            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }

            // Each wrapper is called only after its feature has been detected.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }

        // Compile-time dispatch.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body

            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    (
        $mach:ident, $MTy:ident,
        {
            $([$pub:tt$(($krate:tt))*])*
            fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block
        }
    ) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
|---|
| 382 |  | 
|---|
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// Under `cfg(feature = "std")` the generated function chooses between AVX and SSE2 at run time
/// with `is_x86_feature_detected!`; note that this run-time path has no AVX2 branch. Otherwise
/// the choice is made at compile time from `cfg!(target_feature = ...)`, which does include AVX2.
///
/// The optional visibility is written in brackets, e.g. `[pub]` or `[pub(crate)]`, before `fn`.
#[macro_export]
macro_rules! dispatch_light256 {
    // Function with an explicit return type.
    (
        $mach:ident, $MTy:ident,
        {
            $([$pub:tt$(($krate:tt))*])*
            fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block
        }
    ) => {
        // Run-time dispatch.
        // The visibility tokens are emitted bare here; wrapping them in `[...]` would expand
        // to `[pub] fn ...`, which is not valid item syntax.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;

            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }

            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }

            // Each wrapper is called only after its feature has been detected.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }

        // Compile-time dispatch.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body

            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    (
        $mach:ident, $MTy:ident,
        {
            $([$pub:tt$(($krate:tt))*])*
            fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block
        }
    ) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
|---|
| 440 |  | 
|---|