| 1 | // SPDX-License-Identifier: Apache-2.0 OR MIT |
| 2 | |
| 3 | /* |
| 4 | 128-bit atomic implementation on x86_64. |
| 5 | |
| 6 | This architecture provides the following 128-bit atomic instructions: |
| 7 | |
| 8 | - CMPXCHG16B: CAS (CMPXCHG16B) |
| 9 | - VMOVDQA: load/store (Intel, AMD, or Zhaoxin CPU with AVX) |
| 10 | |
| 11 | Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use |
| 12 | this module and use intrinsics.rs instead. |
| 13 | |
| 14 | Refs: |
| 15 | - x86 and amd64 instruction reference https://www.felixcloutier.com/x86 |
| 16 | - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit |
| 17 | |
| 18 | Generated asm: |
| 19 | - x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51 |
| 20 | */ |
| 21 | |
// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm
| 23 | |
| 24 | include!("macros.rs" ); |
| 25 | |
| 26 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
| 27 | #[path = "../fallback/outline_atomics.rs" ] |
| 28 | mod fallback; |
| 29 | |
| 30 | #[cfg (not(portable_atomic_no_outline_atomics))] |
| 31 | #[cfg (not(target_env = "sgx" ))] |
| 32 | #[cfg_attr ( |
| 33 | not(target_feature = "sse" ), |
| 34 | cfg(not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))) |
| 35 | )] |
| 36 | #[path = "../detect/x86_64.rs" ] |
| 37 | mod detect; |
| 38 | |
| 39 | #[cfg (not(portable_atomic_no_asm))] |
| 40 | use core::arch::asm; |
| 41 | use core::sync::atomic::Ordering; |
| 42 | |
| 43 | use crate::utils::{Pair, U128}; |
| 44 | |
// Debug-build check that the function is running on a CPU with CMPXCHG16B.
//
// When CMPXCHG16B is enabled at compile time the expansion is empty; otherwise
// the result of run-time detection is asserted (debug assertions only).
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            portable_atomic_target_feature = "cmpxchg16b",
            target_feature = "cmpxchg16b",
        )))]
        debug_assert!(detect::detect().has_cmpxchg16b());
    };
}
// Debug-build check used by the VMOVDQA-based load/store.
//
// Asserts CMPXCHG16B support first (via `debug_assert_cmpxchg16b!`) and then
// that run-time detection reports VMOVDQA as atomic on this CPU.
// Only defined when outline atomics are not disabled, the target is not SGX,
// and the SSE target feature is enabled.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}
| 65 | |
// Template modifier spliced into pointer operands of the VMOVDQA asm templates
// (e.g. `{src` + modifier + `}`).
//
// With 32-bit pointers (x32 ABI) this is `:e`, which makes the operand print
// as its 32-bit register name; with 64-bit pointers no modifier is needed.
// Each variant must expand to a bare string literal so it can be used inside
// `concat!`.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}
| 82 | |
| 83 | // Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction |
| 84 | // requirements for the currently enabled target features. In the first place, |
| 85 | // there is no option in the x86 assembly for such case, like Arm .arch_extension, |
| 86 | // RISC-V .option arch, PowerPC .machine, etc. |
| 87 | // However, we set target_feature(enable) when available (Rust 1.69+) in case a |
| 88 | // new codegen backend is added that checks for it in the future, or an option |
| 89 | // is added to the assembler to check for it. |
| 90 | #[cfg_attr ( |
| 91 | not(portable_atomic_no_cmpxchg16b_target_feature), |
| 92 | target_feature(enable = "cmpxchg16b" ) |
| 93 | )] |
| 94 | #[inline ] |
| 95 | unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
| 96 | debug_assert!(dst as usize % 16 == 0); |
| 97 | debug_assert_cmpxchg16b!(); |
| 98 | |
| 99 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
| 100 | // reads, 16-byte aligned (required by CMPXCHG16B), that there are no |
| 101 | // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B. |
| 102 | // |
| 103 | // If the value at `dst` (destination operand) and rdx:rax are equal, the |
| 104 | // 128-bit value in rcx:rbx is stored in the `dst`, otherwise the value at |
| 105 | // `dst` is loaded to rdx:rax. |
| 106 | // |
| 107 | // The ZF flag is set if the value at `dst` and rdx:rax are equal, |
| 108 | // otherwise it is cleared. Other flags are unaffected. |
| 109 | // |
| 110 | // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b |
| 111 | unsafe { |
| 112 | // cmpxchg16b is always SeqCst. |
| 113 | let r: u8; |
| 114 | let old = U128 { whole: old }; |
| 115 | let new = U128 { whole: new }; |
| 116 | let (prev_lo, prev_hi); |
| 117 | macro_rules! cmpxchg16b { |
| 118 | ($rdi:tt) => { |
| 119 | asm!( |
| 120 | "xchg {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
| 121 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
| 122 | "sete cl" , |
| 123 | "mov rbx, {rbx_tmp}" , // restore rbx |
| 124 | rbx_tmp = inout(reg) new.pair.lo => _, |
| 125 | in("rcx" ) new.pair.hi, |
| 126 | inout("rax" ) old.pair.lo => prev_lo, |
| 127 | inout("rdx" ) old.pair.hi => prev_hi, |
| 128 | in($rdi) dst, |
| 129 | lateout("cl" ) r, |
| 130 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
| 131 | options(nostack), |
| 132 | ) |
| 133 | }; |
| 134 | } |
| 135 | #[cfg (target_pointer_width = "32" )] |
| 136 | cmpxchg16b!("edi" ); |
| 137 | #[cfg (target_pointer_width = "64" )] |
| 138 | cmpxchg16b!("rdi" ); |
| 139 | crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test |
| 140 | (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0) |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | // VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX. |
| 145 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details. |
| 146 | // |
| 147 | // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 |
| 148 | // |
| 149 | // Use cfg(target_feature = "sse") here -- SSE is included in the x86_64 |
| 150 | // baseline and is always available, but the SSE target feature is disabled for |
| 151 | // use cases such as kernels and firmware that should not use vector registers. |
| 152 | // So, do not use vector registers unless SSE target feature is enabled. |
| 153 | // See also https://github.com/rust-lang/rust/blob/1.84.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md. |
| 154 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
| 155 | #[cfg (target_feature = "sse" )] |
| 156 | #[target_feature (enable = "avx" )] |
| 157 | #[inline ] |
| 158 | unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 { |
| 159 | debug_assert!(src as usize % 16 == 0); |
| 160 | debug_assert_vmovdqa_atomic!(); |
| 161 | |
| 162 | // SAFETY: the caller must uphold the safety contract. |
| 163 | // |
| 164 | // atomic load by vmovdqa is always SeqCst. |
| 165 | unsafe { |
| 166 | let out: core::arch::x86_64::__m128i; |
| 167 | asm!( |
| 168 | concat!("vmovdqa {out}, xmmword ptr [{src" , ptr_modifier!( ) , "}]" ), |
| 169 | src = in(reg) src, |
| 170 | out = out(xmm_reg) out, |
| 171 | options(nostack, preserves_flags), |
| 172 | ); |
| 173 | core::mem::transmute(src:out) |
| 174 | } |
| 175 | } |
| 176 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
| 177 | #[cfg (target_feature = "sse" )] |
| 178 | #[target_feature (enable = "avx" )] |
| 179 | #[inline ] |
| 180 | unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) { |
| 181 | debug_assert!(dst as usize % 16 == 0); |
| 182 | debug_assert_vmovdqa_atomic!(); |
| 183 | |
| 184 | // SAFETY: the caller must uphold the safety contract. |
| 185 | unsafe { |
| 186 | let val: core::arch::x86_64::__m128i = core::mem::transmute(val); |
| 187 | match order { |
| 188 | // Relaxed and Release stores are equivalent. |
| 189 | Ordering::Relaxed | Ordering::Release => { |
| 190 | asm!( |
| 191 | concat!("vmovdqa xmmword ptr [{dst" , ptr_modifier!( ) , "}], {val}" ), |
| 192 | dst = in(reg) dst, |
| 193 | val = in(xmm_reg) val, |
| 194 | options(nostack, preserves_flags), |
| 195 | ); |
| 196 | } |
| 197 | Ordering::SeqCst => { |
| 198 | let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit()); |
| 199 | asm!( |
| 200 | concat!("vmovdqa xmmword ptr [{dst" , ptr_modifier!( ) , "}], {val}" ), |
| 201 | // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases. |
| 202 | // - https://github.com/taiki-e/portable-atomic/pull/156 |
| 203 | // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc |
| 204 | // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier |
| 205 | // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740 |
| 206 | // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22 |
| 207 | concat!("xchg qword ptr [{p" , ptr_modifier!( ) , "}], {tmp}" ), |
| 208 | dst = in(reg) dst, |
| 209 | val = in(xmm_reg) val, |
| 210 | p = inout(reg) p.get() => _, |
| 211 | tmp = lateout(reg) _, |
| 212 | options(nostack, preserves_flags), |
| 213 | ); |
| 214 | } |
| 215 | _ => unreachable!(), |
| 216 | } |
| 217 | } |
| 218 | } |
| 219 | |
// Picks the load/store implementation from the run-time CPU detection result.
//
// Expands to a block expression that evaluates to one of the given function
// names: `$vmovdqa`, `$cmpxchg16b`, or `fallback::$fallback`.
//
// The macro is not defined when CMPXCHG16B is enabled at compile time and
// VMOVDQA cannot be used (outline atomics disabled, SGX, or SSE disabled),
// because then the callers use the CMPXCHG16B implementation directly.
// Consequently the second branch below (CMPXCHG16B enabled at compile time)
// is only reached with SSE enabled, and `$fallback` is unused there.
#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() { $vmovdqa } else { $cmpxchg16b }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() { $vmovdqa } else { $cmpxchg16b }
        }
    }};
}
| 257 | |
// 128-bit atomic load entry point.
//
// `_order` is ignored: as noted in the body, the CMPXCHG16B-based and
// VMOVDQA-based loads are always SeqCst, and the fallback is selected in its
// SeqCst form.
//
// Exactly one of the two cfg-gated blocks below is compiled, and it forms the
// function's tail expression.
#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // Implementation is chosen by `load_store_detect!` from run-time detection.
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa is always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
| 288 | // See cmpxchg16b() for target_feature(enable). |
| 289 | #[cfg_attr ( |
| 290 | not(portable_atomic_no_cmpxchg16b_target_feature), |
| 291 | target_feature(enable = "cmpxchg16b" ) |
| 292 | )] |
| 293 | #[inline ] |
| 294 | unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 { |
| 295 | debug_assert!(src as usize % 16 == 0); |
| 296 | debug_assert_cmpxchg16b!(); |
| 297 | |
| 298 | // SAFETY: the caller must guarantee that `src` is valid for both writes and |
| 299 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
| 300 | // cfg guarantees that the CPU supports CMPXCHG16B. |
| 301 | // |
| 302 | // See cmpxchg16b function for more. |
| 303 | // |
| 304 | // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows |
| 305 | // omitting the storing of condition flags and avoid use of xchg to handle rbx. |
| 306 | unsafe { |
| 307 | // cmpxchg16b is always SeqCst. |
| 308 | let (out_lo, out_hi); |
| 309 | macro_rules! cmpxchg16b { |
| 310 | ($rdi:tt) => { |
| 311 | asm!( |
| 312 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
| 313 | "xor rbx, rbx" , // zeroed rbx |
| 314 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
| 315 | "mov rbx, {rbx_tmp}" , // restore rbx |
| 316 | // set old/new args of cmpxchg16b to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg) |
| 317 | rbx_tmp = out(reg) _, |
| 318 | in("rcx" ) 0_u64, |
| 319 | inout("rax" ) 0_u64 => out_lo, |
| 320 | inout("rdx" ) 0_u64 => out_hi, |
| 321 | in($rdi) src, |
| 322 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
| 323 | options(nostack), |
| 324 | ) |
| 325 | }; |
| 326 | } |
| 327 | #[cfg (target_pointer_width = "32" )] |
| 328 | cmpxchg16b!("edi" ); |
| 329 | #[cfg (target_pointer_width = "64" )] |
| 330 | cmpxchg16b!("rdi" ); |
| 331 | U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole |
| 332 | } |
| 333 | } |
| 334 | |
// 128-bit atomic store entry point.
//
// When CMPXCHG16B is enabled at compile time and VMOVDQA cannot be used, the
// store always goes through `atomic_store_cmpxchg16b` and `order` is ignored.
// Otherwise the implementation is chosen by `load_store_detect!`, with
// separate selections for Relaxed/Release and for SeqCst; any other ordering
// hits `unreachable!()`.
#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // Two-argument aliases of atomic_store_vmovdqa with the ordering fixed,
        // so they match the function signature passed to `ifunc!` below.
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
| 390 | // See cmpxchg16b() for target_feature(enable). |
| 391 | #[cfg_attr ( |
| 392 | not(portable_atomic_no_cmpxchg16b_target_feature), |
| 393 | target_feature(enable = "cmpxchg16b" ) |
| 394 | )] |
| 395 | #[inline ] |
| 396 | unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) { |
| 397 | // SAFETY: the caller must uphold the safety contract. |
| 398 | unsafe { |
| 399 | // cmpxchg16b is always SeqCst. |
| 400 | atomic_swap_cmpxchg16b(dst, val, _order:Ordering::SeqCst); |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | #[inline ] |
| 405 | unsafe fn atomic_compare_exchange( |
| 406 | dst: *mut u128, |
| 407 | old: u128, |
| 408 | new: u128, |
| 409 | _success: Ordering, |
| 410 | _failure: Ordering, |
| 411 | ) -> Result<u128, u128> { |
| 412 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
| 413 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
| 414 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, |
| 415 | // and cfg guarantees that CMPXCHG16B is available at compile-time. |
| 416 | let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) }; |
| 417 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
| 418 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
| 419 | // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses. |
| 420 | let (prev: u128, ok: bool) = unsafe { |
| 421 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
| 422 | if detect::detect().has_cmpxchg16b() { |
| 423 | cmpxchg16b |
| 424 | } else { |
| 425 | // Use SeqCst because cmpxchg16b is always SeqCst. |
| 426 | fallback::atomic_compare_exchange_seqcst |
| 427 | } |
| 428 | }) |
| 429 | }; |
| 430 | if ok { Ok(prev) } else { Err(prev) } |
| 431 | } |
| 432 | |
| 433 | // cmpxchg16b is always strong. |
| 434 | use self::atomic_compare_exchange as atomic_compare_exchange_weak; |
| 435 | |
| 436 | // See cmpxchg16b() for target_feature(enable). |
| 437 | #[cfg_attr ( |
| 438 | not(portable_atomic_no_cmpxchg16b_target_feature), |
| 439 | target_feature(enable = "cmpxchg16b" ) |
| 440 | )] |
| 441 | #[inline ] |
| 442 | unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 { |
| 443 | debug_assert!(dst as usize % 16 == 0); |
| 444 | debug_assert_cmpxchg16b!(); |
| 445 | |
| 446 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
| 447 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
| 448 | // cfg guarantees that the CPU supports CMPXCHG16B. |
| 449 | // |
| 450 | // See cmpxchg16b function for more. |
| 451 | // |
| 452 | // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows |
| 453 | // omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx. |
| 454 | // |
| 455 | // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. |
| 456 | unsafe { |
| 457 | // cmpxchg16b is always SeqCst. |
| 458 | let val = U128 { whole: val }; |
| 459 | let (mut prev_lo, mut prev_hi); |
| 460 | macro_rules! cmpxchg16b { |
| 461 | ($rdi:tt) => { |
| 462 | asm!( |
| 463 | "xchg {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
| 464 | // This is not single-copy atomic reads, but this is ok because subsequent |
| 465 | // CAS will check for consistency. |
| 466 | // |
| 467 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
| 468 | // |
| 469 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
| 470 | // so we must use inline assembly to implement this. |
| 471 | // (i.e., byte-wise atomic based on the standard library's atomic types |
| 472 | // cannot be used here). |
| 473 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
| 474 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
| 475 | "2:" , |
| 476 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
| 477 | "jne 2b" , |
| 478 | "mov rbx, {rbx_tmp}" , // restore rbx |
| 479 | rbx_tmp = inout(reg) val.pair.lo => _, |
| 480 | in("rcx" ) val.pair.hi, |
| 481 | out("rax" ) prev_lo, |
| 482 | out("rdx" ) prev_hi, |
| 483 | in($rdi) dst, |
| 484 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
| 485 | options(nostack), |
| 486 | ) |
| 487 | }; |
| 488 | } |
| 489 | #[cfg (target_pointer_width = "32" )] |
| 490 | cmpxchg16b!("edi" ); |
| 491 | #[cfg (target_pointer_width = "64" )] |
| 492 | cmpxchg16b!("rdi" ); |
| 493 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
| 494 | } |
| 495 | } |
| 496 | |
/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// Generates `$name`, which repeatedly runs `$op` followed by CMPXCHG16B until
/// the exchange succeeds, and returns the previous value.
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// A CAS loop via atomic_compare_exchange would also work, but inline assembly
// avoids storing/comparing the condition flags and needs fewer xchg/mov
// instructions to handle rbx.
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let operand = U128 { whole: val };
                let (mut old_lo, mut old_hi);
                macro_rules! rmw_loop {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // This is not single-copy atomic reads, but this is ok because subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") old_lo,
                            out("rdx") old_hi,
                            in($rdi) dst,
                            in("rsi") operand.pair.lo,
                            in("r8") operand.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                rmw_loop!("edi");
                #[cfg(target_pointer_width = "64")]
                rmw_loop!("rdi");
                U128 { pair: Pair { lo: old_lo, hi: old_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// Generates `$name`, which repeatedly runs `$op` followed by CMPXCHG16B until
/// the exchange succeeds, and returns the previous value.
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// A CAS loop via atomic_compare_exchange would also work, but inline assembly
// avoids storing the condition flags and avoids the xchg otherwise needed to
// handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut old_lo, mut old_hi);
                macro_rules! rmw_loop {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // This is not single-copy atomic reads, but this is ok because subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") old_lo,
                            out("rdx") old_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                rmw_loop!("edi");
                #[cfg(target_pointer_width = "64")]
                rmw_loop!("rdi");
                U128 { pair: Pair { lo: old_lo, hi: old_hi } }.whole
            }
        }
    };
}
| 633 | |
// CAS-loop RMW operations.
//
// Each snippet below is the loop body placed before CMPXCHG16B. It reads the
// previous value from rdx:rax (high:low) and, for the 3-argument form, the
// operand from r8:rsi (high:low), and must leave the new value in rcx:rbx.

// new = prev + val (add on the low half, adc carries into the high half)
atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
// new = prev - val (sub on the low half, sbb borrows from the high half)
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
// new = prev & val
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
// new = !(prev & val)
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
// new = prev | val
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
// new = prev ^ val
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}

// new = !prev
atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
// new = 0 - prev. `neg` sets CF when the low half is non-zero; rcx is zeroed
// with `mov` (which leaves flags untouched) so that `sbb` can consume that CF.
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}

// The min/max snippets first compute the flags of the 128-bit subtraction
// `val - prev` (cmp on the low halves, sbb on the high halves into a scratch
// rcx), then start from `new = val` and conditionally replace it with `prev`.
// The interleaved `mov`s do not modify the flags.

// Signed max: keep prev when val < prev (signed).
atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
// Unsigned max: keep prev when val < prev (unsigned).
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
// Signed min: keep prev when val >= prev (signed).
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
// Unsigned min: keep prev when val >= prev (unsigned).
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}
| 734 | |
// Defines the public-facing RMW function `$name` on top of a `*_cmpxchg16b`
// implementation and a SeqCst fallback.
//
// - With CMPXCHG16B enabled at compile time, `$name` is just an alias of
//   `$cmpxchg16b_fn`.
// - Otherwise `$name` is a function taking the listed arguments plus a trailing
//   `_order: Ordering` (ignored) that picks `$cmpxchg16b_fn` or
//   `fallback::$seqcst_fallback_fn` based on run-time detection.
macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            // Alias of cmpxchg16b_fn with the ordering argument fixed, so its
            // signature matches the one passed to `ifunc!` below.
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}
| 776 | |
// Entry points for every RMW operation. Each one pairs the CMPXCHG16B
// implementation with the fallback function of the same operation; see
// `select_atomic_rmw!` for how the choice between them is made.

// Operations taking an operand (`val`).
select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
// Operations without an operand.
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}
| 842 | |
| 843 | #[inline ] |
| 844 | fn is_lock_free() -> bool { |
| 845 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
| 846 | { |
| 847 | // CMPXCHG16B is available at compile-time. |
| 848 | true |
| 849 | } |
| 850 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
| 851 | { |
| 852 | detect::detect().has_cmpxchg16b() |
| 853 | } |
| 854 | } |
// True only when CMPXCHG16B is enabled at compile time, i.e. when no run-time
// detection is needed to know that the atomics are lock-free.
const IS_ALWAYS_LOCK_FREE: bool =
    cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

// Instantiate the atomic types; signed and unsigned differ only in which
// max/min functions they are given.
// NOTE(review): `atomic128!` comes from the included macros.rs, which is not
// visible here; confirm what it generates there.
atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);
| 860 | |
// Unit tests for the 128-bit atomic types defined in this module.
// NOTE(review): the clippy allowances presumably cover code expanded by the
// test macros and the `use super::*` glob import; confirm against the macros.
#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}
| 873 | |