x86_64.rs source code [crates/portable_atomic/src/imp/atomic128/x86_64.rs]

1	// SPDX-License-Identifier: Apache-2.0 OR MIT
2
3	/*
4	128-bit atomic implementation on x86_64.
5
6	This architecture provides the following 128-bit atomic instructions:
7
8	- CMPXCHG16B: CAS (CMPXCHG16B)
9	- VMOVDQA: load/store (Intel, AMD, or Zhaoxin CPU with AVX)
10
11	Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use
12	this module and use intrinsics.rs instead.
13
14	Refs:
15	- x86 and amd64 instruction reference https://www.felixcloutier.com/x86
16	- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
17
18	Generated asm:
19	- x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51
20	*/
21
22	// TODO: use core::arch::x86_64::cmpxchg16b where available and more efficient than asm
23
24	include!("macros.rs");
25
26	#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
27	#[path = "../fallback/outline_atomics.rs"]
28	mod fallback;
29
30	#[cfg(not(portable_atomic_no_outline_atomics))]
31	#[cfg(not(target_env = "sgx"))]
32	#[cfg_attr(
33	not(target_feature = "sse"),
34	cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))
35	)]
36	#[path = "../detect/x86_64.rs"]
37	mod detect;
38
39	#[cfg(not(portable_atomic_no_asm))]
40	use core::arch::asm;
41	use core::sync::atomic::Ordering;
42
43	use crate::utils::{Pair, U128};
44
// Debug-only sanity check that the surrounding function runs where CMPXCHG16B
// is usable.
//
// When the `cmpxchg16b` target feature is enabled at compile time the macro
// expands to nothing; otherwise it asserts (in debug builds) that run-time
// detection reports CMPXCHG16B support.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(all(
            not(target_feature = "cmpxchg16b"),
            not(portable_atomic_target_feature = "cmpxchg16b"),
        ))]
        {
            debug_assert!(detect::detect().has_cmpxchg16b());
        }
    };
}
// Debug-only sanity check for the VMOVDQA-based load/store paths: CMPXCHG16B
// must be usable and run-time detection must report VMOVDQA as atomic.
//
// Only defined when run-time detection is available (outline atomics not
// disabled, not SGX) and the SSE target feature is enabled.
#[cfg(all(
    not(any(portable_atomic_no_outline_atomics, target_env = "sgx")),
    target_feature = "sse",
))]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}
65
// Template modifier appended to pointer-register placeholders in the VMOVDQA
// asm templates (e.g. `{src` + modifier + `}`).
//
// With 32-bit pointers the `:e` modifier is used; with 64-bit pointers no
// modifier is needed. Both definitions exist only where the VMOVDQA paths are
// compiled (run-time detection available and SSE target feature enabled).
#[cfg(all(
    not(any(portable_atomic_no_outline_atomics, target_env = "sgx")),
    target_feature = "sse",
    target_pointer_width = "32",
))]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(all(
    not(any(portable_atomic_no_outline_atomics, target_env = "sgx")),
    target_feature = "sse",
    target_pointer_width = "64",
))]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}
82
83	// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
84	// requirements for the currently enabled target features. In the first place,
85	// there is no option in the x86 assembly for such case, like Arm .arch_extension,
86	// RISC-V .option arch, PowerPC .machine, etc.
87	// However, we set target_feature(enable) when available (Rust 1.69+) in case a
88	// new codegen backend is added that checks for it in the future, or an option
89	// is added to the assembler to check for it.
90	#[cfg_attr(
91	not(portable_atomic_no_cmpxchg16b_target_feature),
92	target_feature(enable = "cmpxchg16b")
93	)]
94	#[inline]
95	unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
96	debug_assert!(dst as usize % `16` == `0`);
97	debug_assert_cmpxchg16b!();
98
99	// SAFETY: the caller must guarantee that `dst` is valid for both writes and
100	// reads, 16-byte aligned (required by CMPXCHG16B), that there are no
101	// concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
102	//
103	// If the value at `dst` (destination operand) and rdx:rax are equal, the
104	// 128-bit value in rcx:rbx is stored in the `dst`, otherwise the value at
105	// `dst` is loaded to rdx:rax.
106	//
107	// The ZF flag is set if the value at `dst` and rdx:rax are equal,
108	// otherwise it is cleared. Other flags are unaffected.
109	//
110	// Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
111	unsafe {
112	// cmpxchg16b is always SeqCst.
113	let r: u8;
114	let old = U128 { whole: old };
115	let new = U128 { whole: new };
116	let (prev_lo, prev_hi);
117	macro_rules! cmpxchg16b {
118	($rdi:tt) => {
119	asm!(
120	"xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
121	concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
122	"sete cl",
123	"mov rbx, {rbx_tmp}", // restore rbx
124	rbx_tmp = inout(reg) new.pair.lo => _,
125	in("rcx") new.pair.hi,
126	inout("rax") old.pair.lo => prev_lo,
127	inout("rdx") old.pair.hi => prev_hi,
128	in($rdi) dst,
129	lateout("cl") r,
130	// Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
131	options(nostack),
132	)
133	};
134	}
135	#[cfg(target_pointer_width = "32")]
136	cmpxchg16b!("edi");
137	#[cfg(target_pointer_width = "64")]
138	cmpxchg16b!("rdi");
139	crate::utils::assert_unchecked(r == `0` \|\| r == `1`); // needed to remove extra test
140	(U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != `0`)
141	}
142	}
143
144	// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
145	// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
146	//
147	// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
148	//
149	// Use cfg(target_feature = "sse") here -- SSE is included in the x86_64
150	// baseline and is always available, but the SSE target feature is disabled for
151	// use cases such as kernels and firmware that should not use vector registers.
152	// So, do not use vector registers unless SSE target feature is enabled.
153	// See also https://github.com/rust-lang/rust/blob/1.84.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
154	#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
155	#[cfg(target_feature = "sse")]
156	#[target_feature(enable = "avx")]
157	#[inline]
158	unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
159	debug_assert!(src as usize % `16` == `0`);
160	debug_assert_vmovdqa_atomic!();
161
162	// SAFETY: the caller must uphold the safety contract.
163	//
164	// atomic load by vmovdqa is always SeqCst.
165	unsafe {
166	let out: core::arch::x86_64::__m128i;
167	asm!(
168	concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
169	src = in(reg) src,
170	out = out(xmm_reg) out,
171	options(nostack, preserves_flags),
172	);
173	core::mem::transmute(src:out)
174	}
175	}
176	#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
177	#[cfg(target_feature = "sse")]
178	#[target_feature(enable = "avx")]
179	#[inline]
180	unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
181	debug_assert!(dst as usize % `16` == `0`);
182	debug_assert_vmovdqa_atomic!();
183
184	// SAFETY: the caller must uphold the safety contract.
185	unsafe {
186	let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
187	match order {
188	// Relaxed and Release stores are equivalent.
189	Ordering::Relaxed \| Ordering::Release => {
190	asm!(
191	concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
192	dst = in(reg) dst,
193	val = in(xmm_reg) val,
194	options(nostack, preserves_flags),
195	);
196	}
197	Ordering::SeqCst => {
198	let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
199	asm!(
200	concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
201	// Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
202	// - https://github.com/taiki-e/portable-atomic/pull/156
203	// - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
204	// - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
205	// - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
206	// - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
207	concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
208	dst = in(reg) dst,
209	val = in(xmm_reg) val,
210	p = inout(reg) p.get() => _,
211	tmp = lateout(reg) _,
212	options(nostack, preserves_flags),
213	);
214	}
215	_ => unreachable!(),
216	}
217	}
218	}
219
// Chooses the load/store implementation from the run-time CPU detection result.
//
// Expands to a block that evaluates to one of the three given function names:
// the VMOVDQA-based one, the CMPXCHG16B-based one, or (only when CMPXCHG16B is
// not enabled at compile time) the one from the `fallback` module.
//
// Not defined when CMPXCHG16B is enabled at compile time and VMOVDQA can never
// be selected (outline atomics disabled, SGX, or SSE target feature disabled).
#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa_fn:ident
        cmpxchg16b = $cmpxchg16b_fn:ident
        fallback = $fallback_fn:ident
    ) => {{
        let features = detect::detect();
        // CMPXCHG16B is known to be available at compile time: only the choice
        // between VMOVDQA and CMPXCHG16B is left to run-time detection.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if features.has_vmovdqa_atomic() { $vmovdqa_fn } else { $cmpxchg16b_fn }
        }
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if features.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if features.has_vmovdqa_atomic() { $vmovdqa_fn } else { $cmpxchg16b_fn }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b_fn
                }
            } else {
                fallback::$fallback_fn
            }
        }
    }};
}
257
258	#[inline]
259	unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
260	// We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
261	// SGX doesn't support CPUID.
262	#[cfg(all(
263	any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
264	any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
265	))]
266	// SAFETY: the caller must uphold the safety contract.
267	// cfg guarantees that CMPXCHG16B is available at compile-time.
268	unsafe {
269	// cmpxchg16b is always SeqCst.
270	atomic_load_cmpxchg16b(src)
271	}
272	#[cfg(not(all(
273	any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
274	any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
275	)))]
276	// SAFETY: the caller must uphold the safety contract.
277	unsafe {
278	ifunc!(unsafe fn(src: *mut u128) -> u128 {
279	load_store_detect! {
280	vmovdqa = atomic_load_vmovdqa
281	cmpxchg16b = atomic_load_cmpxchg16b
282	// Use SeqCst because cmpxchg16b and atomic load by vmovdqa is always SeqCst.
283	fallback = atomic_load_seqcst
284	}
285	})
286	}
287	}
288	// See cmpxchg16b() for target_feature(enable).
289	#[cfg_attr(
290	not(portable_atomic_no_cmpxchg16b_target_feature),
291	target_feature(enable = "cmpxchg16b")
292	)]
293	#[inline]
294	unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
295	debug_assert!(src as usize % `16` == `0`);
296	debug_assert_cmpxchg16b!();
297
298	// SAFETY: the caller must guarantee that `src` is valid for both writes and
299	// reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
300	// cfg guarantees that the CPU supports CMPXCHG16B.
301	//
302	// See cmpxchg16b function for more.
303	//
304	// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
305	// omitting the storing of condition flags and avoid use of xchg to handle rbx.
306	unsafe {
307	// cmpxchg16b is always SeqCst.
308	let (out_lo, out_hi);
309	macro_rules! cmpxchg16b {
310	($rdi:tt) => {
311	asm!(
312	"mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
313	"xor rbx, rbx", // zeroed rbx
314	concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
315	"mov rbx, {rbx_tmp}", // restore rbx
316	// set old/new args of cmpxchg16b to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg)
317	rbx_tmp = out(reg) _,
318	in("rcx") `0_u64`,
319	inout("rax") `0_u64` => out_lo,
320	inout("rdx") `0_u64` => out_hi,
321	in($rdi) src,
322	// Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
323	options(nostack),
324	)
325	};
326	}
327	#[cfg(target_pointer_width = "32")]
328	cmpxchg16b!("edi");
329	#[cfg(target_pointer_width = "64")]
330	cmpxchg16b!("rdi");
331	U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
332	}
333	}
334
335	#[inline]
336	unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
337	// We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
338	// SGX doesn't support CPUID.
339	#[cfg(all(
340	any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
341	any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
342	))]
343	// SAFETY: the caller must uphold the safety contract.
344	// cfg guarantees that CMPXCHG16B is available at compile-time.
345	unsafe {
346	// cmpxchg16b is always SeqCst.
347	let _ = order;
348	atomic_store_cmpxchg16b(dst, val);
349	}
350	#[cfg(not(all(
351	any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
352	any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
353	)))]
354	// SAFETY: the caller must uphold the safety contract.
355	unsafe {
356	#[cfg(target_feature = "sse")]
357	fn_alias! {
358	#[target_feature(enable = "avx")]
359	unsafe fn(dst: *mut u128, val: u128);
360	// atomic store by vmovdqa has at least release semantics.
361	atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
362	atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
363	}
364	match order {
365	// Relaxed and Release stores are equivalent in all implementations
366	// that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
367	// core::arch's cmpxchg16b will never called here.
368	Ordering::Relaxed \| Ordering::Release => {
369	ifunc!(unsafe fn(dst: *mut u128, val: u128) {
370	load_store_detect! {
371	vmovdqa = atomic_store_vmovdqa_non_seqcst
372	cmpxchg16b = atomic_store_cmpxchg16b
373	fallback = atomic_store_non_seqcst
374	}
375	});
376	}
377	Ordering::SeqCst => {
378	ifunc!(unsafe fn(dst: *mut u128, val: u128) {
379	load_store_detect! {
380	vmovdqa = atomic_store_vmovdqa_seqcst
381	cmpxchg16b = atomic_store_cmpxchg16b
382	fallback = atomic_store_seqcst
383	}
384	});
385	}
386	_ => unreachable!(),
387	}
388	}
389	}
390	// See cmpxchg16b() for target_feature(enable).
391	#[cfg_attr(
392	not(portable_atomic_no_cmpxchg16b_target_feature),
393	target_feature(enable = "cmpxchg16b")
394	)]
395	#[inline]
396	unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
397	// SAFETY: the caller must uphold the safety contract.
398	unsafe {
399	// cmpxchg16b is always SeqCst.
400	atomic_swap_cmpxchg16b(dst, val, _order:Ordering::SeqCst);
401	}
402	}
403
404	#[inline]
405	unsafe fn atomic_compare_exchange(
406	dst: *mut u128,
407	old: u128,
408	new: u128,
409	_success: Ordering,
410	_failure: Ordering,
411	) -> Result<u128, u128> {
412	#[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
413	// SAFETY: the caller must guarantee that `dst` is valid for both writes and
414	// reads, 16-byte aligned, that there are no concurrent non-atomic operations,
415	// and cfg guarantees that CMPXCHG16B is available at compile-time.
416	let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
417	#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
418	// SAFETY: the caller must guarantee that `dst` is valid for both writes and
419	// reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
420	let (prev: u128, ok: bool) = unsafe {
421	ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
422	if detect::detect().has_cmpxchg16b() {
423	cmpxchg16b
424	} else {
425	// Use SeqCst because cmpxchg16b is always SeqCst.
426	fallback::atomic_compare_exchange_seqcst
427	}
428	})
429	};
430	if ok { Ok(prev) } else { Err(prev) }
431	}
432
433	// cmpxchg16b is always strong.
434	use self::atomic_compare_exchange as atomic_compare_exchange_weak;
435
436	// See cmpxchg16b() for target_feature(enable).
437	#[cfg_attr(
438	not(portable_atomic_no_cmpxchg16b_target_feature),
439	target_feature(enable = "cmpxchg16b")
440	)]
441	#[inline]
442	unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
443	debug_assert!(dst as usize % `16` == `0`);
444	debug_assert_cmpxchg16b!();
445
446	// SAFETY: the caller must guarantee that `dst` is valid for both writes and
447	// reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
448	// cfg guarantees that the CPU supports CMPXCHG16B.
449	//
450	// See cmpxchg16b function for more.
451	//
452	// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
453	// omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
454	//
455	// Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap.
456	unsafe {
457	// cmpxchg16b is always SeqCst.
458	let val = U128 { whole: val };
459	let (mut prev_lo, mut prev_hi);
460	macro_rules! cmpxchg16b {
461	($rdi:tt) => {
462	asm!(
463	"xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
464	// This is not single-copy atomic reads, but this is ok because subsequent
465	// CAS will check for consistency.
466	//
467	// This is based on the code generated for the first load in DW RMWs by LLVM.
468	//
469	// Note that the C++20 memory model does not allow mixed-sized atomic access,
470	// so we must use inline assembly to implement this.
471	// (i.e., byte-wise atomic based on the standard library's atomic types
472	// cannot be used here).
473	concat!("mov rax, qword ptr [", $rdi, "]"),
474	concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
475	"2:",
476	concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
477	"jne 2b",
478	"mov rbx, {rbx_tmp}", // restore rbx
479	rbx_tmp = inout(reg) val.pair.lo => _,
480	in("rcx") val.pair.hi,
481	out("rax") prev_lo,
482	out("rdx") prev_hi,
483	in($rdi) dst,
484	// Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
485	options(nostack),
486	)
487	};
488	}
489	#[cfg(target_pointer_width = "32")]
490	cmpxchg16b!("edi");
491	#[cfg(target_pointer_width = "64")]
492	cmpxchg16b!("rdi");
493	U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
494	}
495	}
496
/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// Defines a function `$name` that reads the current value, runs `$op` to
/// compute the new value, and retries `lock cmpxchg16b` until it succeeds.
/// The generated function returns the previous value.
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
// omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // This is not single-copy atomic reads, but this is ok because subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// Defines a function `$name` that reads the current value, runs `$op` to
/// compute the new value from it, and retries `lock cmpxchg16b` until it
/// succeeds. The generated function returns the previous value.
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
// omitting the storing of condition flags and avoid use of xchg to handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // This is not single-copy atomic reads, but this is ok because subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
633
// Arithmetic and bitwise RMW operations generated by atomic_rmw_cas_3!.
//
// In every sequence below the previous value (rax = low half, rdx = high half)
// is copied into rbx/rcx and then combined with the operand (rsi = low half,
// r8 = high half), leaving the new value in rbx (low) / rcx (high).

// new = prev + val; `adc` propagates the carry from the low half into the high half.
atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
// new = prev - val; `sbb` propagates the borrow from the low half into the high half.
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
// new = prev & val
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
// new = !(prev & val)
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
// new = prev | val
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
// new = prev ^ val
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}
678
// Unary RMW operations generated by atomic_rmw_cas_2!. The previous value is in
// rax (low half) / rdx (high half); the new value is built in rbx / rcx.

// new = !prev
atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
// new = 0 - prev: `neg` on the low half sets the carry flag when the low half
// is non-zero, and `sbb` then computes `0 - high - carry`. rcx is zeroed with
// `mov` (which leaves the flags alone), so the carry from `neg` survives.
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}
693
// Min/max RMW operations generated by atomic_rmw_cas_3!.
//
// Each sequence first performs a 128-bit `val - prev` (`cmp` on the low halves,
// `sbb` on the high halves into the scratch register rcx) purely to set the
// flags. It then loads `val` into rbx/rcx with `mov`, which does not modify
// the flags, and conditionally replaces it with `prev` (rax/rdx) via `cmov`.

// Signed max: keep `prev` when val < prev (signed).
atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
// Unsigned max: keep `prev` when val < prev (unsigned).
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
// Signed min: keep `prev` when val >= prev (signed).
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
// Unsigned min: keep `prev` when val >= prev (unsigned).
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}
734
// Defines the public-facing RMW entry point `$name` for one operation.
//
// - With CMPXCHG16B enabled at compile time, `$name` is simply an alias of the
//   CMPXCHG16B-based implementation.
// - Otherwise `$name` is a function that dispatches at run time between the
//   CMPXCHG16B-based implementation and the SeqCst function from `fallback`.
macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $asm_fn:ident;
        fallback = $fallback_fn:ident;
    ) => {
        // Compile-time path: CMPXCHG16B is always available, so use it directly.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$asm_fn as $name;
        // Run-time path: the CMPXCHG16B-based function may only be called once
        // detection has confirmed that the instruction exists.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                asm_fn_seqcst = $asm_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // The CMPXCHG16B-based function is only selected if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        asm_fn_seqcst
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$fallback_fn
                    }
                })
            }
        }
    };
}
776
// RMW entry points. Each invocation defines the function named on its
// `unsafe fn` line, backed by the listed CMPXCHG16B-based implementation and,
// when CMPXCHG16B is not enabled at compile time, by the listed function from
// the `fallback` module.

// Binary operations: take `dst` and `val`.
select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
// Signed (max/min) and unsigned (umax/umin) comparisons.
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
// Unary operations: take only `dst`.
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}
842
843	#[inline]
844	fn is_lock_free() -> bool {
845	#[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
846	{
847	// CMPXCHG16B is available at compile-time.
848	`true`
849	}
850	#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
851	{
852	detect::detect().has_cmpxchg16b()
853	}
854	}
// `true` exactly when CMPXCHG16B is enabled at compile time, either through the
// `cmpxchg16b` target feature or the `portable_atomic_target_feature` cfg.
const IS_ALWAYS_LOCK_FREE: bool = cfg!(target_feature = "cmpxchg16b")
    || cfg!(portable_atomic_target_feature = "cmpxchg16b");
857
// Instantiate the 128-bit atomic types via the `atomic128!` macro (not defined
// in this part of the file; presumably provided by the included macros.rs —
// verify). The signed type is given the signed max/min functions and the
// unsigned type the unsigned ones.
atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);
860
// Unit tests, compiled only for test builds. The test bodies come from the
// `test_atomic_int!` and `stress_test!` macros, which are defined outside this
// file.
#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}
873