| 1 | // Copyright 2016-2021 Brian Smith. |
| 2 | // |
| 3 | // Permission to use, copy, modify, and/or distribute this software for any |
| 4 | // purpose with or without fee is hereby granted, provided that the above |
| 5 | // copyright notice and this permission notice appear in all copies. |
| 6 | // |
| 7 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES |
| 8 | // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 9 | // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY |
| 10 | // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 11 | // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| 12 | // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| 13 | // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 14 | |
| 15 | use cfg_if::cfg_if; |
| 16 | |
// Compile-time assertions documenting the ABI assumptions this module makes
// about the target: SSE/SSE2 availability, pointer width, and endianness.
// These are `const` assertions, so a violating target fails to build.
mod abi_assumptions {
    use core::mem::size_of;

    // TODO: Support targets that do not have SSE and SSE2 enabled, such as
    // x86_64-unknown-linux-none. See
    // https://github.com/briansmith/ring/issues/1793#issuecomment-1793243725,
    // https://github.com/briansmith/ring/issues/1832,
    // https://github.com/briansmith/ring/issues/1833.
    const _ASSUMES_SSE2: () =
        assert!(cfg!(target_feature = "sse") && cfg!(target_feature = "sse2"));

    // Pointer width per architecture: 8 bytes on x86_64, 4 on x86.
    #[cfg(target_arch = "x86_64")]
    const _ASSUMED_POINTER_SIZE: usize = 8;
    #[cfg(target_arch = "x86")]
    const _ASSUMED_POINTER_SIZE: usize = 4;
    // Both `usize` and references must be exactly pointer-sized.
    const _ASSUMED_USIZE_SIZE: () = assert!(size_of::<usize>() == _ASSUMED_POINTER_SIZE);
    const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE);

    // x86/x86_64 are little-endian; code below relies on this.
    const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little"));
}
| 37 | |
// One-time CPU feature detection and process-wide caching of the resulting
// capability bitmask. The cached value also encodes an "initialized" bit so
// that the stored `usize` is always nonzero.
pub(super) mod featureflags {
    use super::super::CAPS_STATIC;
    use crate::{
        cpu,
        polyfill::{once_cell::race, usize_from_u32},
    };
    use core::num::NonZeroUsize;

    /// Performs dynamic CPU feature detection on first call (via the C
    /// `OPENSSL_cpuid_setup` routine), caches the result in `FEATURES`, and
    /// returns a `cpu::Features` token proving detection has happened.
    pub(in super::super) fn get_or_init() -> cpu::Features {
        // SAFETY: `OPENSSL_cpuid_setup` must be called only in
        // `INIT.call_once()` below.
        prefixed_extern! {
            fn OPENSSL_cpuid_setup(out: &mut [u32; 4]);
        }

        let _: NonZeroUsize = FEATURES.get_or_init(|| {
            let mut cpuid = [0; 4];
            // SAFETY: We assume that it is safe to execute CPUID and XGETBV.
            unsafe {
                OPENSSL_cpuid_setup(&mut cpuid);
            }
            let detected = super::cpuid_to_caps_and_set_c_flags(&cpuid);
            // Features forced on at compile time are merged with what was
            // detected at runtime.
            let merged = CAPS_STATIC | detected;

            // Tag the value with the `Initialized` bit so it is never zero,
            // which lets us store it in a `NonZeroUsize`.
            let merged = usize_from_u32(merged) | (1 << (super::Shift::Initialized as u32));
            NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit.
        });

        // SAFETY: We initialized the CPU features as required.
        // `INIT.call_once` has `happens-before` semantics.
        unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() }
    }

    /// Returns the cached capability bitmask. The `cpu::Features` parameter
    /// is a witness that `get_or_init()` already ran; its value is unused.
    pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 {
        // SAFETY: Since only `get_or_init()` could have created
        // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`,
        // we know we are reading from `FEATURES` after initializing it.
        //
        // Also, 0 means "no features detected" to users, which is designed to
        // be a safe configuration.
        let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0);

        // The truncation is lossless, as we set the value with a u32.
        #[allow(clippy::cast_possible_truncation)]
        let features = features as u32;

        features
    }

    // Process-wide cache; zero until `get_or_init()` stores the (nonzero)
    // tagged capability bitmask.
    static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new();

    // Features guaranteed by the compile-time target configuration, per arch.
    #[cfg(target_arch = "x86")]
    #[rustfmt::skip]
    pub const STATIC_DETECTED: u32 = 0
        | (if cfg!(target_feature = "sse2") { super::Sse2::mask() } else { 0 })
        ;

    // Limited to x86_64-v2 features.
    // TODO: Add missing x86-64-v3 features if we find real-world use of x86-64-v3.
    // TODO: Add all features we use.
    #[cfg(target_arch = "x86_64")]
    #[rustfmt::skip]
    pub const STATIC_DETECTED: u32 = 0
        | if cfg!(target_feature = "sse4.1") { super::Sse41::mask() } else { 0 }
        | if cfg!(target_feature = "ssse3") { super::Ssse3::mask() } else { 0 }
        ;

    // No features are forced back to dynamic detection on this arch.
    pub const FORCE_DYNAMIC_DETECTION: u32 = 0;
}
| 107 | |
| 108 | fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { |
| 109 | // "Intel" citations are for "Intel 64 and IA-32 Architectures Software |
| 110 | // Developer’s Manual", Combined Volumes, December 2024. |
| 111 | // "AMD" citations are for "AMD64 Technology AMD64 Architecture |
| 112 | // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024. |
| 113 | |
| 114 | // The `prefixed_extern!` uses below assume this |
| 115 | #[cfg (target_arch = "x86_64" )] |
| 116 | use core::{mem::align_of, sync::atomic::AtomicU32}; |
| 117 | #[cfg (target_arch = "x86_64" )] |
| 118 | const _ATOMIC32_ALIGNMENT_EQUALS_U32_ALIGNMENT: () = |
| 119 | assert!(align_of::<AtomicU32>() == align_of::<u32>()); |
| 120 | |
| 121 | fn check(leaf: u32, bit: u32) -> bool { |
| 122 | let shifted = 1 << bit; |
| 123 | (leaf & shifted) == shifted |
| 124 | } |
| 125 | fn set(out: &mut u32, shift: Shift) { |
| 126 | let shifted = 1 << (shift as u32); |
| 127 | debug_assert_eq!(*out & shifted, 0); |
| 128 | *out |= shifted; |
| 129 | debug_assert_eq!(*out & shifted, shifted); |
| 130 | } |
| 131 | |
| 132 | #[cfg (target_arch = "x86_64" )] |
| 133 | let is_intel = check(cpuid[0], 30); // Synthesized by `OPENSSL_cpuid_setup` |
| 134 | |
| 135 | // CPUID leaf 1. |
| 136 | let leaf1_ecx = cpuid[1]; |
| 137 | |
| 138 | // Intel: "Structured Extended Feature Flags Enumeration Leaf" |
| 139 | #[cfg (target_arch = "x86_64" )] |
| 140 | let extended_features_ebx = cpuid[2]; |
| 141 | |
| 142 | let mut caps = 0; |
| 143 | |
| 144 | // AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE |
| 145 | // instructions. All legacy SSE instructions support 128-bit vector |
| 146 | // operands." |
| 147 | |
| 148 | // Intel: "11.6.2 Checking for Intel SSE and SSE2 Support" |
| 149 | // We have to assume the prerequisites for SSE/SSE2 are met since we're |
| 150 | // already almost definitely using SSE registers if these target features |
| 151 | // are enabled. |
| 152 | // |
| 153 | // These also seem to help ensure CMOV support; There doesn't seem to be |
| 154 | // a `cfg!(target_feature = "cmov")`. It is likely that removing these |
| 155 | // assertions will remove the requirement for CMOV. With our without |
| 156 | // CMOV, it is likely that some of our timing side channel prevention does |
| 157 | // not work. Presumably the people who delete these are verifying that it |
| 158 | // all works fine. |
| 159 | const _SSE_REQUIRED: () = assert!(cfg!(target_feature = "sse" )); |
| 160 | const _SSE2_REQUIRED: () = assert!(cfg!(target_feature = "sse2" )); |
| 161 | |
| 162 | #[cfg (all(target_arch = "x86" , not(target_feature = "sse2" )))] |
| 163 | { |
| 164 | // If somebody is trying to compile for an x86 target without SSE2 |
| 165 | // and they deleted the `_SSE2_REQUIRED` const assertion above then |
| 166 | // they're probably trying to support a Linux/BSD/etc. distro that |
| 167 | // tries to support ancient x86 systems without SSE/SSE2. Try to |
| 168 | // reduce the harm caused, by implementing dynamic feature detection |
| 169 | // for them so that most systems will work like normal. |
| 170 | // |
| 171 | // Note that usually an x86-64 target with SSE2 disabled by default, |
| 172 | // usually `-none-` targets, will not support dynamically-detected use |
| 173 | // of SIMD registers via CPUID. A whole different mechanism is needed |
| 174 | // to support them. Same for i*86-*-none targets. |
| 175 | let leaf1_edx = cpuid[0]; |
| 176 | let sse1_available = check(leaf1_edx, 25); |
| 177 | let sse2_available = check(leaf1_edx, 26); |
| 178 | if sse1_available && sse2_available { |
| 179 | set(&mut caps, Shift::Sse2); |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | // Sometimes people delete the `_SSE_REQUIRED`/`_SSE2_REQUIRED` const |
| 184 | // assertions in an attempt to support pre-SSE2 32-bit x86 systems. If they |
| 185 | // do, hopefully they won't delete these redundant assertions, so that |
| 186 | // x86_64 isn't affected. |
| 187 | #[cfg (target_arch = "x86_64" )] |
| 188 | const _SSE2_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2" )); |
| 189 | #[cfg (target_arch = "x86_64" )] |
| 190 | const _SSE_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2" )); |
| 191 | |
| 192 | // Intel: "12.7.2 Checking for SSSE3 Support" |
| 193 | // If/when we support dynamic detection of SSE/SSE2, make this conditional |
| 194 | // on SSE/SSE2. |
| 195 | if check(leaf1_ecx, 9) { |
| 196 | set(&mut caps, Shift::Ssse3); |
| 197 | } |
| 198 | |
| 199 | // Intel: "12.12.2 Checking for Intel SSE4.1 Support" |
| 200 | // If/when we support dynamic detection of SSE/SSE2, make this conditional |
| 201 | // on SSE/SSE2. |
| 202 | // XXX: We don't check for SSE3 and we're not sure if it is compatible for |
| 203 | // us to do so; does AMD advertise SSE3? TODO: address this. |
| 204 | // XXX: We don't condition this on SSSE3 being available. TODO: address |
| 205 | // this. |
| 206 | #[cfg (target_arch = "x86_64" )] |
| 207 | if check(leaf1_ecx, 19) { |
| 208 | set(&mut caps, Shift::Sse41); |
| 209 | } |
| 210 | |
| 211 | // AMD: "The extended SSE instructions include [...]." |
| 212 | |
| 213 | // Intel: "14.3 DETECTION OF INTEL AVX INSTRUCTIONS" |
| 214 | // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't |
| 215 | // support AVX state. |
| 216 | let avx_available = check(leaf1_ecx, 28); |
| 217 | if avx_available { |
| 218 | set(&mut caps, Shift::Avx); |
| 219 | } |
| 220 | |
| 221 | // "14.7.1 Detection of Intel AVX2 Hardware support" |
| 222 | // XXX: We don't condition AVX2 on AVX. TODO: Address this. |
| 223 | // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't |
| 224 | // support AVX state. |
| 225 | #[cfg (target_arch = "x86_64" )] |
| 226 | if check(extended_features_ebx, 5) { |
| 227 | set(&mut caps, Shift::Avx2); |
| 228 | |
| 229 | // Declared as `uint32_t` in the C code. |
| 230 | prefixed_extern! { |
| 231 | static avx2_available: AtomicU32; |
| 232 | } |
| 233 | // SAFETY: The C code only reads `avx2_available`, and its reads are |
| 234 | // synchronized through the `OnceNonZeroUsize` Acquire/Release |
| 235 | // semantics as we ensure we have a `cpu::Features` instance before |
| 236 | // calling into the C code. |
| 237 | let flag = unsafe { &avx2_available }; |
| 238 | flag.store(1, core::sync::atomic::Ordering::Relaxed); |
| 239 | } |
| 240 | |
| 241 | // Intel: "12.13.4 Checking for Intel AES-NI Support" |
| 242 | // If/when we support dynamic detection of SSE/SSE2, revisit this. |
| 243 | // TODO: Clarify "interesting" states like (!SSE && AVX && AES-NI) |
| 244 | // and AES-NI & !AVX. |
| 245 | // Each check of `ClMul`, `Aes`, and `Sha` must be paired with a check for |
| 246 | // an AVX feature (e.g. `Avx`) or an SSE feature (e.g. `Ssse3`), as every |
| 247 | // use will either be supported by SSE* or AVX* instructions. We then |
| 248 | // assume that those supporting instructions' prerequisites (e.g. OS |
| 249 | // support for AVX or SSE state, respectively) are the only prerequisites |
| 250 | // for these features. |
| 251 | if check(leaf1_ecx, 1) { |
| 252 | set(&mut caps, Shift::ClMul); |
| 253 | } |
| 254 | if check(leaf1_ecx, 25) { |
| 255 | set(&mut caps, Shift::Aes); |
| 256 | } |
| 257 | // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling |
| 258 | // static feature detection for this. |
| 259 | #[cfg (target_arch = "x86_64" )] |
| 260 | if check(extended_features_ebx, 29) { |
| 261 | set(&mut caps, Shift::Sha); |
| 262 | } |
| 263 | |
| 264 | #[cfg (target_arch = "x86_64" )] |
| 265 | { |
| 266 | if is_intel { |
| 267 | set(&mut caps, Shift::IntelCpu); |
| 268 | } |
| 269 | |
| 270 | if check(leaf1_ecx, 22) { |
| 271 | set(&mut caps, Shift::Movbe); |
| 272 | } |
| 273 | |
| 274 | let adx_available = check(extended_features_ebx, 19); |
| 275 | if adx_available { |
| 276 | set(&mut caps, Shift::Adx); |
| 277 | } |
| 278 | |
| 279 | // Some 6th Generation (Skylake) CPUs claim to support BMI1 and BMI2 |
| 280 | // when they don't; see erratum "SKD052". The Intel document at |
| 281 | // https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/6th-gen-core-u-y-spec-update.pdf |
| 282 | // contains the footnote "Affects 6th Generation Intel Pentium processor |
| 283 | // family and Intel Celeron processor family". Further research indicates |
| 284 | // that Skylake Pentium/Celeron do not implement AVX or ADX. It turns |
| 285 | // out that we only use BMI1 and BMI2 in combination with ADX and/or |
| 286 | // AVX. |
| 287 | // |
| 288 | // rust `std::arch::is_x86_feature_detected` does a very similar thing |
| 289 | // but only looks at AVX, not ADX. Note that they reference an older |
| 290 | // version of the erratum labeled SKL052. |
| 291 | let believe_bmi_bits = !is_intel || (adx_available || avx_available); |
| 292 | |
| 293 | if check(extended_features_ebx, 3) && believe_bmi_bits { |
| 294 | set(&mut caps, Shift::Bmi1); |
| 295 | } |
| 296 | |
| 297 | let bmi2_available = check(extended_features_ebx, 8) && believe_bmi_bits; |
| 298 | if bmi2_available { |
| 299 | set(&mut caps, Shift::Bmi2); |
| 300 | } |
| 301 | |
| 302 | if adx_available && bmi2_available { |
| 303 | // Declared as `uint32_t` in the C code. |
| 304 | prefixed_extern! { |
| 305 | static adx_bmi2_available: AtomicU32; |
| 306 | } |
| 307 | // SAFETY: The C code only reads `adx_bmi2_available`, and its |
| 308 | // reads are synchronized through the `OnceNonZeroUsize` |
| 309 | // Acquire/Release semantics as we ensure we have a |
| 310 | // `cpu::Features` instance before calling into the C code. |
| 311 | let flag = unsafe { &adx_bmi2_available }; |
| 312 | flag.store(1, core::sync::atomic::Ordering::Relaxed); |
| 313 | } |
| 314 | } |
| 315 | |
| 316 | caps |
| 317 | } |
| 318 | |
// Generates the per-feature accessor types (`ClMul`, `Ssse3`, ...) used with
// `GetFeature`; each entry lists the target arches on which the feature can
// be dynamically detected. NOTE(review): `impl_get_feature!` is defined
// elsewhere in the crate — confirm exact generated API there.
impl_get_feature! {
    features: [
        { ("x86", "x86_64") => ClMul },
        { ("x86", "x86_64") => Ssse3 },
        { ("x86_64") => Sse41 },
        { ("x86_64") => Movbe },
        { ("x86", "x86_64") => Aes },
        { ("x86", "x86_64") => Avx },
        { ("x86_64") => Bmi1 },
        { ("x86_64") => Avx2 },
        { ("x86_64") => Bmi2 },
        { ("x86_64") => Adx },
        // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling
        // static feature detection for this.
        { ("x86_64") => Sha },
        // x86_64 can just assume SSE2 is available.
        { ("x86") => Sse2 },
    ],
}
| 338 | |
| 339 | cfg_if! { |
| 340 | if #[cfg(target_arch = "x86_64" )] { |
| 341 | #[derive (Clone, Copy)] |
| 342 | pub(crate) struct IntelCpu(super::Features); |
| 343 | |
| 344 | impl super::GetFeature<IntelCpu> for super::features::Values { |
| 345 | fn get_feature(&self) -> Option<IntelCpu> { |
| 346 | const MASK: u32 = 1 << (Shift::IntelCpu as u32); |
| 347 | if (self.values() & MASK) == MASK { |
| 348 | Some(IntelCpu(self.cpu())) |
| 349 | } else { |
| 350 | None |
| 351 | } |
| 352 | } |
| 353 | } |
| 354 | } |
| 355 | } |
| 356 | |
#[cfg(test)]
mod tests {
    // This should always pass on any x86 system except very, very, old ones.
    #[cfg(target_arch = "x86")]
    #[test]
    fn x86_has_sse2() {
        use super::*;
        use crate::cpu::{self, GetFeature as _};
        let sse2: Option<Sse2> = cpu::features().get_feature();
        assert!(sse2.is_some())
    }
}
| 368 | |