| 1 | // SPDX-License-Identifier: Apache-2.0 OR MIT |
| 2 | |
| 3 | /* |
| 4 | 128-bit atomic implementation on x86_64 using CMPXCHG16B (DWCAS). |
| 5 | |
| 6 | Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use |
| 7 | this module and use intrinsics.rs instead. |
| 8 | |
| 9 | Refs: |
| 10 | - x86 and amd64 instruction reference https://www.felixcloutier.com/x86 |
| 11 | - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit |
| 12 | |
| 13 | Generated asm: |
| 14 | - x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51 |
| 15 | */ |
| 16 | |
// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm
| 18 | |
| 19 | include!("macros.rs" ); |
| 20 | |
| 21 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
| 22 | #[path = "../fallback/outline_atomics.rs" ] |
| 23 | mod fallback; |
| 24 | |
| 25 | #[cfg (not(portable_atomic_no_outline_atomics))] |
| 26 | #[cfg (not(target_env = "sgx" ))] |
| 27 | #[cfg_attr ( |
| 28 | not(target_feature = "sse" ), |
| 29 | cfg(not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))) |
| 30 | )] |
| 31 | #[path = "../detect/x86_64.rs" ] |
| 32 | mod detect; |
| 33 | |
| 34 | #[cfg (not(portable_atomic_no_asm))] |
| 35 | use core::arch::asm; |
| 36 | use core::sync::atomic::Ordering; |
| 37 | |
| 38 | use crate::utils::{Pair, U128}; |
| 39 | |
// Debug-only sanity check that the enclosing function is only reached when
// CMPXCHG16B may be used.
//
// When CMPXCHG16B is enabled at compile time there is nothing to verify and the
// macro expands to nothing; otherwise it asserts that run-time detection
// reports CMPXCHG16B support.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        debug_assert!(detect::detect().has_cmpxchg16b());
    };
}
// Debug-only sanity check for the VMOVDQA-based load/store paths.
//
// Runs `debug_assert_cmpxchg16b!` first and then asserts that run-time
// detection reports `has_vmovdqa_atomic()`.
//
// Only defined when run-time detection is available (outline-atomics not
// disabled, target is not SGX) and the SSE target feature is enabled.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}
| 60 | |
// Operand modifier that is spliced into the pointer placeholders of the
// VMOVDQA asm templates (e.g. `{src` + ptr_modifier!() + `}`).
//
// 32-bit pointers: expands to ":e".
// NOTE(review): presumably this makes the pointer operand print as a 32-bit
// register name -- confirm against the inline-assembly template modifier docs.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
// 64-bit pointers: expands to the empty string, i.e. no modifier is applied.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}
| 77 | |
// Strong 128-bit compare-and-swap built on `lock cmpxchg16b`.
//
// Compares the value at `dst` with `old` and, if they are equal, stores `new`.
// Returns the value observed at `dst` together with a flag that is `true` when
// the store happened.
//
// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements for the currently enabled target features. In the first place,
// there is no option in the x86 assembly for such case, like Arm .arch_extension,
// RISC-V .option arch, PowerPC .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend is added that checks for it in the future, or an option
// is added to the assembler to check for it.
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
    // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
    //
    // If the value at `dst` (destination operand) and rdx:rax are equal, the
    // 128-bit value in rcx:rbx is stored in the `dst`, otherwise the value at
    // `dst` is loaded to rdx:rax.
    //
    // The ZF flag is set if the value at `dst` and rdx:rax are equal,
    // otherwise it is cleared. Other flags are unaffected.
    //
    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
    unsafe {
        // cmpxchg16b is always SeqCst.
        let r: u8;
        let old = U128 { whole: old };
        let new = U128 { whole: new };
        let (prev_lo, prev_hi);
        // Local helper so the same asm body can be emitted with either `edi`
        // or `rdi` as the pointer register, depending on the pointer width.
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    // `rbx_tmp` enters holding `new.pair.lo`, so this exchange both
                    // loads the low half of `new` into rbx and stashes the old rbx.
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    // Capture the success flag (ZF) into cl.
                    "sete cl",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) new.pair.lo => _,
                    in("rcx") new.pair.hi,
                    inout("rax") old.pair.lo => prev_lo,
                    inout("rdx") old.pair.hi => prev_hi,
                    in($rdi) dst,
                    lateout("cl") r,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test
        (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
    }
}
| 138 | |
| 139 | // VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX. |
| 140 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details. |
| 141 | // |
| 142 | // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 |
| 143 | // |
| 144 | // Use cfg(target_feature = "sse") here -- SSE is included in the x86_64 |
| 145 | // baseline and is always available, but the SSE target feature is disabled for |
| 146 | // use cases such as kernels and firmware that should not use vector registers. |
| 147 | // So, do not use vector registers unless SSE target feature is enabled. |
| 148 | // See also https://github.com/rust-lang/rust/blob/1.80.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md. |
| 149 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
| 150 | #[cfg (target_feature = "sse" )] |
| 151 | #[target_feature (enable = "avx" )] |
| 152 | #[inline ] |
| 153 | unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 { |
| 154 | debug_assert!(src as usize % 16 == 0); |
| 155 | debug_assert_vmovdqa_atomic!(); |
| 156 | |
| 157 | // SAFETY: the caller must uphold the safety contract. |
| 158 | // |
| 159 | // atomic load by vmovdqa is always SeqCst. |
| 160 | unsafe { |
| 161 | let out: core::arch::x86_64::__m128i; |
| 162 | asm!( |
| 163 | concat!("vmovdqa {out}, xmmword ptr [{src" , ptr_modifier!( ) , "}]" ), |
| 164 | src = in(reg) src, |
| 165 | out = out(xmm_reg) out, |
| 166 | options(nostack, preserves_flags), |
| 167 | ); |
| 168 | core::mem::transmute(src:out) |
| 169 | } |
| 170 | } |
// Atomic 128-bit store implemented with VMOVDQA.
//
// `order` must be `Relaxed`, `Release`, or `SeqCst`; any other ordering reaches
// `unreachable!()`. For `SeqCst` the store is followed by an `xchg` on a local
// dummy `u64` slot, which is used in place of `mfence` (see the comment on
// that instruction for the rationale).
//
// Safety: `dst` must be 16-byte aligned (checked in debug builds) and the CPU
// must pass the `debug_assert_vmovdqa_atomic!` requirements.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // Move the value into a vector-typed local so it can be passed in an xmm register.
        let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
        match order {
            // Relaxed and Release stores are equivalent.
            Ordering::Relaxed | Ordering::Release => {
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    options(nostack, preserves_flags),
                );
            }
            Ordering::SeqCst => {
                // Dummy memory location that is only used as the target of the
                // `xchg` below; its contents are never read by Rust code.
                let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                    // - https://github.com/taiki-e/portable-atomic/pull/156
                    // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
                    // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                    // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
                    // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    p = inout(reg) p.get() => _,
                    tmp = lateout(reg) _,
                    options(nostack, preserves_flags),
                );
            }
            _ => unreachable!(),
        }
    }
}
| 214 | |
// Selects the load/store implementation to use based on run-time CPU detection.
//
// Takes three identifiers and evaluates to one of them:
// - `vmovdqa`: used when the CPU reports `has_vmovdqa_atomic()` (and, if
//   CMPXCHG16B is not enabled at compile time, also `has_cmpxchg16b()` and the
//   SSE target feature is enabled),
// - `cmpxchg16b`: used when CMPXCHG16B is usable but VMOVDQA is not selected,
// - `fallback`: resolved as `fallback::$fallback`, used when CMPXCHG16B is not
//   enabled at compile time and is not detected at run time.
//
// The macro is not defined in the configuration where CMPXCHG16B is enabled at
// compile time and VMOVDQA can never be selected (outline-atomics disabled,
// SGX, or no SSE target feature); callers use CMPXCHG16B directly there.
#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        // CMPXCHG16B is not known at compile time: all three outcomes are possible.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() {
                        $vmovdqa
                    } else {
                        $cmpxchg16b
                    }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        // CMPXCHG16B is enabled at compile time: only choose between VMOVDQA and
        // CMPXCHG16B. (The outer cfg on this macro implies SSE is enabled here.)
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() {
                $vmovdqa
            } else {
                $cmpxchg16b
            }
        }
    }};
}
| 260 | |
// Atomic 128-bit load entry point.
//
// `_order` is ignored because every implementation that can be selected here
// is used with SeqCst semantics.
//
// Exactly one of the two cfg-gated blocks below is compiled:
// - CMPXCHG16B enabled at compile time and VMOVDQA never selectable: call
//   `atomic_load_cmpxchg16b` directly.
// - otherwise: dispatch through `ifunc!` using `load_store_detect!`.
#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
// Atomic 128-bit load implemented with `lock cmpxchg16b`.
//
// Issues a compare-exchange with both the expected value (rdx:rax) and the
// replacement value (rcx:rbx) set to zero. If the memory holds zero, zero is
// written back; otherwise the current value is loaded into rdx:rax. Either
// way rdx:rax ends up holding the value at `src`, which is returned.
//
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `src` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See cmpxchg16b function for more.
    //
    // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
    // omitting the storing of condition flags and avoid use of xchg to handle rbx.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let (out_lo, out_hi);
        // Local helper so the same asm body can be emitted with either `edi`
        // or `rdi` as the pointer register, depending on the pointer width.
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    "xor rbx, rbx", // zeroed rbx
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "mov rbx, {rbx_tmp}", // restore rbx
                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg)
                    rbx_tmp = out(reg) _,
                    in("rcx") 0_u64,
                    inout("rax") 0_u64 => out_lo,
                    inout("rdx") 0_u64 => out_hi,
                    in($rdi) src,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}
| 337 | |
// Atomic 128-bit store entry point.
//
// Exactly one of the two cfg-gated blocks below is compiled:
// - CMPXCHG16B enabled at compile time and VMOVDQA never selectable: `order` is
//   ignored and `atomic_store_cmpxchg16b` is called directly.
// - otherwise: dispatch through `ifunc!` using `load_store_detect!`, with
//   separate dispatch targets for Relaxed/Release and for SeqCst. Any other
//   ordering reaches `unreachable!()`.
#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // Two-argument aliases of atomic_store_vmovdqa with the ordering fixed,
        // so they match the signature used by the ifunc! dispatch below.
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
| 393 | // See cmpxchg16b() for target_feature(enable). |
| 394 | #[cfg_attr ( |
| 395 | not(portable_atomic_no_cmpxchg16b_target_feature), |
| 396 | target_feature(enable = "cmpxchg16b" ) |
| 397 | )] |
| 398 | #[inline ] |
| 399 | unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) { |
| 400 | // SAFETY: the caller must uphold the safety contract. |
| 401 | unsafe { |
| 402 | // cmpxchg16b is always SeqCst. |
| 403 | atomic_swap_cmpxchg16b(dst, val, _order:Ordering::SeqCst); |
| 404 | } |
| 405 | } |
| 406 | |
| 407 | #[inline ] |
| 408 | unsafe fn atomic_compare_exchange( |
| 409 | dst: *mut u128, |
| 410 | old: u128, |
| 411 | new: u128, |
| 412 | _success: Ordering, |
| 413 | _failure: Ordering, |
| 414 | ) -> Result<u128, u128> { |
| 415 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
| 416 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
| 417 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, |
| 418 | // and cfg guarantees that CMPXCHG16B is available at compile-time. |
| 419 | let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) }; |
| 420 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
| 421 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
| 422 | // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses. |
| 423 | let (prev, ok) = unsafe { |
| 424 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
| 425 | if detect::detect().has_cmpxchg16b() { |
| 426 | cmpxchg16b |
| 427 | } else { |
| 428 | // Use SeqCst because cmpxchg16b is always SeqCst. |
| 429 | fallback::atomic_compare_exchange_seqcst |
| 430 | } |
| 431 | }) |
| 432 | }; |
| 433 | if ok { |
| 434 | Ok(prev) |
| 435 | } else { |
| 436 | Err(prev) |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | // cmpxchg16b is always strong. |
| 441 | use self::atomic_compare_exchange as atomic_compare_exchange_weak; |
| 442 | |
// Atomic 128-bit swap implemented as a `lock cmpxchg16b` retry loop.
//
// Stores `val` at `dst` and returns the value that was there before.
// `_order` is ignored (cmpxchg16b is always SeqCst).
//
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See cmpxchg16b function for more.
    //
    // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
    // omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
    //
    // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let val = U128 { whole: val };
        let (mut prev_lo, mut prev_hi);
        // Local helper so the same asm body can be emitted with either `edi`
        // or `rdi` as the pointer register, depending on the pointer width.
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    // `rbx_tmp` enters holding `val.pair.lo`, so this exchange both
                    // loads the low half of `val` into rbx and stashes the old rbx.
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    // This is not single-copy atomic reads, but this is ok because subsequent
                    // CAS will check for consistency.
                    //
                    // This is based on the code generated for the first load in DW RMWs by LLVM.
                    //
                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
                    // so we must use inline assembly to implement this.
                    // (i.e., byte-wise atomic based on the standard library's atomic types
                    // cannot be used here).
                    concat!("mov rax, qword ptr [", $rdi, "]"),
                    concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                    // Retry until the compare-exchange succeeds; on failure cmpxchg16b
                    // reloads rdx:rax with the current value for the next attempt.
                    "2:",
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                        "jne 2b",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) val.pair.lo => _,
                    in("rcx") val.pair.hi,
                    out("rax") prev_lo,
                    out("rdx") prev_hi,
                    in($rdi) dst,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}
| 503 | |
/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
// omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
//
// The generated function `$name` returns the value that was stored at `dst`
// before the update. `$op` is a comma-terminated list of asm template strings
// that is spliced into the loop body just before `lock cmpxchg16b`, so it is
// re-executed on every retry.
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                // Local helper so the same asm body can be emitted with either `edi`
                // or `rdi` as the pointer register, depending on the pointer width.
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // This is not single-copy atomic reads, but this is ok because subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            // Retry loop: compute the new value from rdx:rax, then try to commit it.
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
// omitting the storing of condition flags and avoid use of xchg to handle rbx.
//
// The generated function `$name` returns the value that was stored at `dst`
// before the update. `$op` is a comma-terminated list of asm template strings
// that is spliced into the loop body just before `lock cmpxchg16b`, so it is
// re-executed on every retry.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                // Local helper so the same asm body can be emitted with either `edi`
                // or `rdi` as the pointer register, depending on the pointer width.
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // This is not single-copy atomic reads, but this is ok because subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            // Retry loop: compute the new value from rdx:rax, then try to commit it.
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
| 640 | |
// Arithmetic and bitwise RMW operations generated from atomic_rmw_cas_3!.
// In every op sequence rdx:rax is the previous value, r8:rsi is `val`, and the
// new value is built in rcx:rbx (see the register list on atomic_rmw_cas_3!).

// Add: add the low halves, then add the high halves with the carry.
atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
// Sub: subtract the low halves, then subtract the high halves with the borrow.
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
// And: bitwise AND applied independently to each half.
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
// Nand: bitwise AND followed by NOT, applied independently to each half.
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
// Or: bitwise OR applied independently to each half.
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
// Xor: bitwise XOR applied independently to each half.
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}
| 685 | |
// Unary RMW operations generated from atomic_rmw_cas_2!.
// rdx:rax is the previous value and the new value is built in rcx:rbx.

// Not: bitwise NOT applied independently to each half.
atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
// Neg: negate the low half, then compute `0 - high - borrow` for the high half.
// The `mov rcx, 0` sits between `neg` and `sbb`, so the borrow produced by
// `neg` has to survive it.
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}
| 700 | |
// Min/max RMW operations generated from atomic_rmw_cas_3!.
//
// All four share the same shape: `cmp` + `sbb` perform a 128-bit `val - prev`
// subtraction purely for its flags (the rcx result of `sbb` is overwritten by
// the next `mov`). The new value in rcx:rbx is then initialised to `val`
// (r8:rsi) and conditionally replaced with the previous value (rdx:rax) by a
// pair of `cmov`s; only the condition code differs between the variants.

// Signed max: keep the previous value when `val < prev` (signed).
atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
// Unsigned max: keep the previous value when `val < prev` (unsigned).
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
// Signed min: keep the previous value when `val >= prev` (signed).
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
// Unsigned min: keep the previous value when `val >= prev` (unsigned).
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}
| 741 | |
// Defines the RMW entry point `$name` in terms of a CMPXCHG16B-based
// implementation and a fallback.
//
// Input:
// - `unsafe fn $name($arg...) -> $ret_ty;`: name, arguments (without the
//   trailing ordering argument) and return type of the function to define,
// - `cmpxchg16b = ...`: the CMPXCHG16B-based implementation,
// - `fallback = ...`: the SeqCst function in the `fallback` module.
//
// With compile-time CMPXCHG16B, `$name` is simply a `use` alias of the
// CMPXCHG16B implementation. Otherwise `$name` is a function that ignores its
// ordering argument and dispatches through `ifunc!` based on run-time
// detection.
macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            // Alias of cmpxchg16b_fn with the ordering fixed, so its signature
            // matches the one used by the ifunc! dispatch below.
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}
| 783 | |
// RMW entry points. Each one pairs a `*_cmpxchg16b` implementation defined in
// this file with the `*_seqcst` function of the same operation in the
// `fallback` module; see select_atomic_rmw! for how the choice is made.

// Operations taking a value operand.
select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
// Unary operations (no value operand).
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}
| 849 | |
| 850 | #[inline ] |
| 851 | fn is_lock_free() -> bool { |
| 852 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
| 853 | { |
| 854 | // CMPXCHG16B is available at compile-time. |
| 855 | true |
| 856 | } |
| 857 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
| 858 | { |
| 859 | detect::detect().has_cmpxchg16b() |
| 860 | } |
| 861 | } |
// `true` only when CMPXCHG16B is guaranteed at compile time, either through the
// `cmpxchg16b` target feature or through the `portable_atomic_target_feature`
// cfg; run-time detection does not influence this constant.
const IS_ALWAYS_LOCK_FREE: bool = cfg!(target_feature = "cmpxchg16b")
    || cfg!(portable_atomic_target_feature = "cmpxchg16b");
| 864 | |
// Instantiate the 128-bit atomic types via atomic128!, passing the max/min
// functions that match each type's signedness: the signed pair for `i128` and
// the unsigned pair for `u128`.
atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);
| 867 | |
// Unit tests for this module, driven by the test_atomic_int! and stress_test!
// macros (both defined outside this file).
#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    // Run the integer test suite for both the signed and the unsigned type.
    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}
| 880 | |