// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
128-bit atomic implementation on x86_64 using CMPXCHG16B (DWCAS).

Note: On Miri and ThreadSanitizer, which do not support inline assembly, we don't use
this module and use intrinsics.rs instead.

Refs:
- x86 and amd64 instruction reference https://www.felixcloutier.com/x86
- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit

Generated asm:
- x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51
*/

// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm

include!("macros.rs");

#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
#[path = "../fallback/outline_atomics.rs"]
mod fallback;

#[cfg(not(portable_atomic_no_outline_atomics))]
#[cfg(not(target_env = "sgx"))]
#[cfg_attr(
    not(target_feature = "sse"),
    cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))
)]
#[path = "../detect/x86_64.rs"]
mod detect;

#[cfg(not(portable_atomic_no_asm))]
use core::arch::asm;
use core::sync::atomic::Ordering;

use crate::utils::{Pair, U128};

// Asserts that the function is called in the correct context.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            debug_assert!(detect::detect().has_cmpxchg16b());
        }
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}

#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}

// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements for the currently enabled target features. In the first place,
// there is no option in x86 assembly for such a case, unlike Arm's .arch_extension,
// RISC-V's .option arch, PowerPC's .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend is added that checks for it in the future, or an option
// is added to the assembler to check for it.
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
    // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
    //
    // If the value at `dst` (destination operand) and rdx:rax are equal, the
    // 128-bit value in rcx:rbx is stored in the `dst`, otherwise the value at
    // `dst` is loaded to rdx:rax.
    //
    // The ZF flag is set if the value at `dst` and rdx:rax are equal,
    // otherwise it is cleared. Other flags are unaffected.
    //
    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
    unsafe {
        // cmpxchg16b is always SeqCst.
        let r: u8;
        let old = U128 { whole: old };
        let new = U128 { whole: new };
        let (prev_lo, prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "sete cl",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) new.pair.lo => _,
                    in("rcx") new.pair.hi,
                    inout("rax") old.pair.lo => prev_lo,
                    inout("rdx") old.pair.hi => prev_hi,
                    in($rdi) dst,
                    lateout("cl") r,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test
        (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
    }
}

// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
//
// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
//
// Use cfg(target_feature = "sse") here -- SSE is included in the x86_64
// baseline and is always available, but the SSE target feature is disabled for
// use cases such as kernels and firmware that should not use vector registers.
// So, do not use vector registers unless the SSE target feature is enabled.
// See also https://github.com/rust-lang/rust/blob/1.80.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    //
    // atomic load by vmovdqa is always SeqCst.
    unsafe {
        let out: core::arch::x86_64::__m128i;
        asm!(
            concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
            src = in(reg) src,
            out = out(xmm_reg) out,
            options(nostack, preserves_flags),
        );
        core::mem::transmute(out)
    }
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
        match order {
            // Relaxed and Release stores are equivalent.
            Ordering::Relaxed | Ordering::Release => {
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    options(nostack, preserves_flags),
                );
            }
            Ordering::SeqCst => {
                let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                    // - https://github.com/taiki-e/portable-atomic/pull/156
                    // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
                    // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                    // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
                    // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    p = inout(reg) p.get() => _,
                    tmp = lateout(reg) _,
                    options(nostack, preserves_flags),
                );
            }
            _ => unreachable!(),
        }
    }
}

#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() {
                        $vmovdqa
                    } else {
                        $cmpxchg16b
                    }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() {
                $vmovdqa
            } else {
                $cmpxchg16b
            }
        }
    }};
}
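// The ifunc! expressions in atomic_load/atomic_store below use this macro to pick the
// implementation at run time when the choice cannot be made at compile-time.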

#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `src` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
    // omitting the storing of condition flags and avoiding the use of xchg to handle rbx.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let (out_lo, out_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    "xor rbx, rbx", // zero rbx
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "mov rbx, {rbx_tmp}", // restore rbx
                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
                    rbx_tmp = out(reg) _,
                    in("rcx") 0_u64,
                    inout("rax") 0_u64 => out_lo,
                    inout("rdx") 0_u64 => out_hi,
                    in($rdi) src,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}

#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
    }
}

#[inline]
unsafe fn atomic_compare_exchange(
    dst: *mut u128,
    old: u128,
    new: u128,
    _success: Ordering,
    _failure: Ordering,
) -> Result<u128, u128> {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and cfg guarantees that CMPXCHG16B is available at compile-time.
    let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
    let (prev, ok) = unsafe {
        ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
            if detect::detect().has_cmpxchg16b() {
                cmpxchg16b
            } else {
                // Use SeqCst because cmpxchg16b is always SeqCst.
                fallback::atomic_compare_exchange_seqcst
            }
        })
    };
    if ok {
        Ok(prev)
    } else {
        Err(prev)
    }
}

// cmpxchg16b is always strong.
use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
    // omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
    //
    // Do not use atomic_rmw_cas_3 because it needs an extra MOV to implement swap.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let val = U128 { whole: val };
        let (mut prev_lo, mut prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    // These initial loads are not single-copy atomic, but that is ok because the
                    // subsequent CAS will check for consistency.
                    //
                    // This is based on the code generated for the first load in DW RMWs by LLVM.
                    //
                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
                    // so we must use inline assembly to implement this.
                    // (i.e., byte-wise atomic based on the standard library's atomic types
                    // cannot be used here).
                    concat!("mov rax, qword ptr [", $rdi, "]"),
                    concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                    "2:",
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "jne 2b",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) val.pair.lo => _,
                    in("rcx") val.pair.hi,
                    out("rax") prev_lo,
                    out("rdx") prev_hi,
                    in($rdi) dst,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}

/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
// omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
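// For example, the atomic_add_cmpxchg16b expansion below computes the new value as
// rbx (lo) / rcx (hi) = prev + val with an add/adc pair before the CAS.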
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These initial loads are not single-copy atomic, but that is ok because
                            // the subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                            $($op)*
                            concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                            "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
// omitting the storing of condition flags and avoiding the use of xchg to handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These initial loads are not single-copy atomic, but that is ok because
                            // the subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                            $($op)*
                            concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                            "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}

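// RMW bodies passed to atomic_rmw_cas_3!/atomic_rmw_cas_2! below: add/sub propagate the
// carry/borrow across the two 64-bit halves with adc/sbb; the bitwise ops apply the same
// operation to each half independently.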
atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}

atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
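// 128-bit negation: NEG the low half (which also sets CF when the low half is non-zero),
// then compute 0 - hi - CF for the high half with SBB.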
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}

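// 128-bit max/min: cmp + sbb perform a 128-bit comparison of val (rsi/r8) against prev (rax/rdx)
// solely for its effect on the flags (the subtraction result in rcx is discarded), then
// cmovl/cmovb/cmovge/cmovae select the appropriate halves into rbx/rcx for the subsequent CAS.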
atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}

macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}

select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}

#[inline]
fn is_lock_free() -> bool {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    {
        // CMPXCHG16B is available at compile-time.
        true
    }
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    {
        detect::detect().has_cmpxchg16b()
    }
}
const IS_ALWAYS_LOCK_FREE: bool =
    cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}