1 | // SPDX-License-Identifier: Apache-2.0 OR MIT |
2 | |
3 | /* |
4 | 128-bit atomic implementation on x86_64 using CMPXCHG16B (DWCAS). |
5 | |
Note: On Miri and ThreadSanitizer, which do not support inline assembly, this module
is not used; intrinsics.rs is used instead.
8 | |
9 | Refs: |
10 | - x86 and amd64 instruction reference https://www.felixcloutier.com/x86 |
11 | - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit |
12 | |
13 | Generated asm: |
14 | - x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51 |
15 | */ |
16 | |
// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm
18 | |
include!("macros.rs");
20 | |
21 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
22 | #[path = "../fallback/outline_atomics.rs" ] |
23 | mod fallback; |
24 | |
25 | #[cfg (not(portable_atomic_no_outline_atomics))] |
26 | #[cfg (not(target_env = "sgx" ))] |
27 | #[cfg_attr ( |
28 | not(target_feature = "sse" ), |
29 | cfg(not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))) |
30 | )] |
31 | #[path = "../detect/x86_64.rs" ] |
32 | mod detect; |
33 | |
34 | #[cfg (not(portable_atomic_no_asm))] |
35 | use core::arch::asm; |
36 | use core::sync::atomic::Ordering; |
37 | |
38 | use crate::utils::{Pair, U128}; |
39 | |
40 | // Asserts that the function is called in the correct context. |
41 | macro_rules! debug_assert_cmpxchg16b { |
42 | () => { |
43 | #[cfg(not(any( |
44 | target_feature = "cmpxchg16b" , |
45 | portable_atomic_target_feature = "cmpxchg16b" , |
46 | )))] |
47 | { |
48 | debug_assert!(detect::detect().has_cmpxchg16b()); |
49 | } |
50 | }; |
51 | } |
52 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
53 | #[cfg (target_feature = "sse" )] |
54 | macro_rules! debug_assert_vmovdqa_atomic { |
55 | () => {{ |
56 | debug_assert_cmpxchg16b!(); |
57 | debug_assert!(detect::detect().has_vmovdqa_atomic()); |
58 | }}; |
59 | } |
60 | |
61 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
62 | #[cfg (target_feature = "sse" )] |
63 | #[cfg (target_pointer_width = "32" )] |
64 | macro_rules! ptr_modifier { |
65 | () => { |
66 | ":e" |
67 | }; |
68 | } |
69 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
70 | #[cfg (target_feature = "sse" )] |
71 | #[cfg (target_pointer_width = "64" )] |
72 | macro_rules! ptr_modifier { |
73 | () => { |
74 | "" |
75 | }; |
76 | } |
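// For illustration: with a pointer bound to an asm operand named `src`, the templates
// built via ptr_modifier!() expand as follows (the concrete registers shown are only
// examples):
//
//   64-bit pointers:        "vmovdqa {out}, xmmword ptr [{src}]"   // e.g. [rdi]
//   x32 (32-bit pointers):  "vmovdqa {out}, xmmword ptr [{src:e}]" // e.g. [edi]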
77 | |
// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements against the currently enabled target features. In fact, x86
// assembly has no directive for such a check, unlike Arm's .arch_extension,
// RISC-V's .option arch, PowerPC's .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend that checks for it is added in the future, or an option
// to check for it is added to the assembler.
85 | #[cfg_attr ( |
86 | not(portable_atomic_no_cmpxchg16b_target_feature), |
87 | target_feature(enable = "cmpxchg16b" ) |
88 | )] |
89 | #[inline ] |
90 | unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
91 | debug_assert!(dst as usize % 16 == 0); |
92 | debug_assert_cmpxchg16b!(); |
93 | |
94 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
95 | // reads, 16-byte aligned (required by CMPXCHG16B), that there are no |
96 | // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B. |
97 | // |
// If the value at `dst` (destination operand) and rdx:rax are equal, the
// 128-bit value in rcx:rbx is stored to `dst`; otherwise, the value at
// `dst` is loaded into rdx:rax.
101 | // |
102 | // The ZF flag is set if the value at `dst` and rdx:rax are equal, |
103 | // otherwise it is cleared. Other flags are unaffected. |
104 | // |
105 | // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b |
106 | unsafe { |
107 | // cmpxchg16b is always SeqCst. |
108 | let r: u8; |
109 | let old = U128 { whole: old }; |
110 | let new = U128 { whole: new }; |
111 | let (prev_lo, prev_hi); |
112 | macro_rules! cmpxchg16b { |
113 | ($rdi:tt) => { |
114 | asm!( |
115 | "xchg {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
116 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
117 | "sete cl" , |
118 | "mov rbx, {rbx_tmp}" , // restore rbx |
119 | rbx_tmp = inout(reg) new.pair.lo => _, |
120 | in("rcx" ) new.pair.hi, |
121 | inout("rax" ) old.pair.lo => prev_lo, |
122 | inout("rdx" ) old.pair.hi => prev_hi, |
123 | in($rdi) dst, |
124 | lateout("cl" ) r, |
125 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
126 | options(nostack), |
127 | ) |
128 | }; |
129 | } |
130 | #[cfg (target_pointer_width = "32" )] |
131 | cmpxchg16b!("edi" ); |
132 | #[cfg (target_pointer_width = "64" )] |
133 | cmpxchg16b!("rdi" ); |
134 | crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test |
135 | (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0) |
136 | } |
137 | } |
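// A minimal, test-only sketch of the step that `lock cmpxchg16b` performs as a single
// atomic operation in the function above; `cmpxchg16b_model` is a hypothetical name
// used only for illustration and is not part of this module's API.
#[cfg(test)]
#[allow(dead_code)]
fn cmpxchg16b_model(dst: &mut u128, old: u128, new: u128) -> (u128, bool) {
    let prev = *dst; // the destination is read (conceptually into rdx:rax)
    if prev == old {
        *dst = new; // equal: the rcx:rbx value is stored to the destination, ZF is set
        (prev, true)
    } else {
        (prev, false) // not equal: ZF is cleared and prev is left in rdx:rax
    }
}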
138 | |
139 | // VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX. |
140 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details. |
141 | // |
142 | // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 |
143 | // |
144 | // Use cfg(target_feature = "sse") here -- SSE is included in the x86_64 |
145 | // baseline and is always available, but the SSE target feature is disabled for |
146 | // use cases such as kernels and firmware that should not use vector registers. |
// So, do not use vector registers unless the SSE target feature is enabled.
148 | // See also https://github.com/rust-lang/rust/blob/1.80.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md. |
149 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
150 | #[cfg (target_feature = "sse" )] |
151 | #[target_feature (enable = "avx" )] |
152 | #[inline ] |
153 | unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 { |
154 | debug_assert!(src as usize % 16 == 0); |
155 | debug_assert_vmovdqa_atomic!(); |
156 | |
157 | // SAFETY: the caller must uphold the safety contract. |
158 | // |
159 | // atomic load by vmovdqa is always SeqCst. |
160 | unsafe { |
161 | let out: core::arch::x86_64::__m128i; |
162 | asm!( |
163 | concat!("vmovdqa {out}, xmmword ptr [{src" , ptr_modifier!( ) , "}]" ), |
164 | src = in(reg) src, |
165 | out = out(xmm_reg) out, |
166 | options(nostack, preserves_flags), |
167 | ); |
core::mem::transmute(out)
169 | } |
170 | } |
171 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
172 | #[cfg (target_feature = "sse" )] |
173 | #[target_feature (enable = "avx" )] |
174 | #[inline ] |
175 | unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) { |
176 | debug_assert!(dst as usize % 16 == 0); |
177 | debug_assert_vmovdqa_atomic!(); |
178 | |
179 | // SAFETY: the caller must uphold the safety contract. |
180 | unsafe { |
181 | let val: core::arch::x86_64::__m128i = core::mem::transmute(val); |
182 | match order { |
183 | // Relaxed and Release stores are equivalent. |
184 | Ordering::Relaxed | Ordering::Release => { |
185 | asm!( |
186 | concat!("vmovdqa xmmword ptr [{dst" , ptr_modifier!( ) , "}], {val}" ), |
187 | dst = in(reg) dst, |
188 | val = in(xmm_reg) val, |
189 | options(nostack, preserves_flags), |
190 | ); |
191 | } |
192 | Ordering::SeqCst => { |
193 | let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit()); |
194 | asm!( |
195 | concat!("vmovdqa xmmword ptr [{dst" , ptr_modifier!( ) , "}], {val}" ), |
196 | // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases. |
197 | // - https://github.com/taiki-e/portable-atomic/pull/156 |
198 | // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc |
199 | // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier |
200 | // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740 |
201 | // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22 |
202 | concat!("xchg qword ptr [{p" , ptr_modifier!( ) , "}], {tmp}" ), |
203 | dst = in(reg) dst, |
204 | val = in(xmm_reg) val, |
205 | p = inout(reg) p.get() => _, |
206 | tmp = lateout(reg) _, |
207 | options(nostack, preserves_flags), |
208 | ); |
209 | } |
210 | _ => unreachable!(), |
211 | } |
212 | } |
213 | } |
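// For reference only (a sketch, not used by this module): the more obvious SeqCst
// sequence would be a plain store followed by a full fence, i.e.
//
//     vmovdqa xmmword ptr [dst], val
//     mfence
//
// The xchg-to-stack sequence used in atomic_store_vmovdqa above was measured to be
// faster; see the links in that function for details.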
214 | |
215 | #[cfg (not(all( |
216 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
217 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
218 | )))] |
219 | macro_rules! load_store_detect { |
220 | ( |
221 | vmovdqa = $vmovdqa:ident |
222 | cmpxchg16b = $cmpxchg16b:ident |
223 | fallback = $fallback:ident |
224 | ) => {{ |
225 | let cpuid = detect::detect(); |
226 | #[cfg(not(any( |
227 | target_feature = "cmpxchg16b" , |
228 | portable_atomic_target_feature = "cmpxchg16b" , |
229 | )))] |
230 | { |
231 | // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access. |
232 | if cpuid.has_cmpxchg16b() { |
233 | // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more. |
234 | #[cfg(target_feature = "sse" )] |
235 | { |
236 | if cpuid.has_vmovdqa_atomic() { |
237 | $vmovdqa |
238 | } else { |
239 | $cmpxchg16b |
240 | } |
241 | } |
242 | #[cfg(not(target_feature = "sse" ))] |
243 | { |
244 | $cmpxchg16b |
245 | } |
246 | } else { |
247 | fallback::$fallback |
248 | } |
249 | } |
250 | #[cfg(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
251 | { |
252 | if cpuid.has_vmovdqa_atomic() { |
253 | $vmovdqa |
254 | } else { |
255 | $cmpxchg16b |
256 | } |
257 | } |
258 | }}; |
259 | } |
260 | |
261 | #[inline ] |
262 | unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { |
263 | // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more. |
264 | // SGX doesn't support CPUID. |
265 | #[cfg (all( |
266 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
267 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
268 | ))] |
269 | // SAFETY: the caller must uphold the safety contract. |
270 | // cfg guarantees that CMPXCHG16B is available at compile-time. |
271 | unsafe { |
272 | // cmpxchg16b is always SeqCst. |
273 | atomic_load_cmpxchg16b(src) |
274 | } |
275 | #[cfg (not(all( |
276 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
277 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
278 | )))] |
279 | // SAFETY: the caller must uphold the safety contract. |
280 | unsafe { |
281 | ifunc!(unsafe fn(src: *mut u128) -> u128 { |
282 | load_store_detect! { |
283 | vmovdqa = atomic_load_vmovdqa |
284 | cmpxchg16b = atomic_load_cmpxchg16b |
// Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
286 | fallback = atomic_load_seqcst |
287 | } |
288 | }) |
289 | } |
290 | } |
291 | // See cmpxchg16b() for target_feature(enable). |
292 | #[cfg_attr ( |
293 | not(portable_atomic_no_cmpxchg16b_target_feature), |
294 | target_feature(enable = "cmpxchg16b" ) |
295 | )] |
296 | #[inline ] |
297 | unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 { |
298 | debug_assert!(src as usize % 16 == 0); |
299 | debug_assert_cmpxchg16b!(); |
300 | |
301 | // SAFETY: the caller must guarantee that `src` is valid for both writes and |
302 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
303 | // cfg guarantees that the CPU supports CMPXCHG16B. |
304 | // |
305 | // See cmpxchg16b function for more. |
306 | // |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing the condition flags and to avoid using xchg to handle rbx.
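// Note: implementing a 128-bit load with CMPXCHG16B works because old == new (both
// zero here). If the destination happens to equal zero, zero is stored back (no
// visible change); otherwise the compare fails and the current value is returned in
// rdx:rax. Either way, rdx:rax ends up holding the loaded value.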
309 | unsafe { |
310 | // cmpxchg16b is always SeqCst. |
311 | let (out_lo, out_hi); |
312 | macro_rules! cmpxchg16b { |
313 | ($rdi:tt) => { |
314 | asm!( |
315 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
316 | "xor rbx, rbx" , // zeroed rbx |
317 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
318 | "mov rbx, {rbx_tmp}" , // restore rbx |
// set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
320 | rbx_tmp = out(reg) _, |
321 | in("rcx" ) 0_u64, |
322 | inout("rax" ) 0_u64 => out_lo, |
323 | inout("rdx" ) 0_u64 => out_hi, |
324 | in($rdi) src, |
325 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
326 | options(nostack), |
327 | ) |
328 | }; |
329 | } |
330 | #[cfg (target_pointer_width = "32" )] |
331 | cmpxchg16b!("edi" ); |
332 | #[cfg (target_pointer_width = "64" )] |
333 | cmpxchg16b!("rdi" ); |
334 | U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole |
335 | } |
336 | } |
337 | |
338 | #[inline ] |
339 | unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { |
340 | // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more. |
341 | // SGX doesn't support CPUID. |
342 | #[cfg (all( |
343 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
344 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
345 | ))] |
346 | // SAFETY: the caller must uphold the safety contract. |
347 | // cfg guarantees that CMPXCHG16B is available at compile-time. |
348 | unsafe { |
349 | // cmpxchg16b is always SeqCst. |
350 | let _ = order; |
351 | atomic_store_cmpxchg16b(dst, val); |
352 | } |
353 | #[cfg (not(all( |
354 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
355 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
356 | )))] |
357 | // SAFETY: the caller must uphold the safety contract. |
358 | unsafe { |
359 | #[cfg (target_feature = "sse" )] |
360 | fn_alias! { |
361 | #[target_feature (enable = "avx" )] |
362 | unsafe fn(dst: *mut u128, val: u128); |
363 | // atomic store by vmovdqa has at least release semantics. |
364 | atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release); |
365 | atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst); |
366 | } |
367 | match order { |
368 | // Relaxed and Release stores are equivalent in all implementations |
369 | // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback). |
// core::arch's cmpxchg16b is never called here.
371 | Ordering::Relaxed | Ordering::Release => { |
372 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { |
373 | load_store_detect! { |
374 | vmovdqa = atomic_store_vmovdqa_non_seqcst |
375 | cmpxchg16b = atomic_store_cmpxchg16b |
376 | fallback = atomic_store_non_seqcst |
377 | } |
378 | }); |
379 | } |
380 | Ordering::SeqCst => { |
381 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { |
382 | load_store_detect! { |
383 | vmovdqa = atomic_store_vmovdqa_seqcst |
384 | cmpxchg16b = atomic_store_cmpxchg16b |
385 | fallback = atomic_store_seqcst |
386 | } |
387 | }); |
388 | } |
389 | _ => unreachable!(), |
390 | } |
391 | } |
392 | } |
393 | // See cmpxchg16b() for target_feature(enable). |
394 | #[cfg_attr ( |
395 | not(portable_atomic_no_cmpxchg16b_target_feature), |
396 | target_feature(enable = "cmpxchg16b" ) |
397 | )] |
398 | #[inline ] |
399 | unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) { |
400 | // SAFETY: the caller must uphold the safety contract. |
401 | unsafe { |
402 | // cmpxchg16b is always SeqCst. |
atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
404 | } |
405 | } |
406 | |
407 | #[inline ] |
408 | unsafe fn atomic_compare_exchange( |
409 | dst: *mut u128, |
410 | old: u128, |
411 | new: u128, |
412 | _success: Ordering, |
413 | _failure: Ordering, |
414 | ) -> Result<u128, u128> { |
415 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
416 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
417 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, |
418 | // and cfg guarantees that CMPXCHG16B is available at compile-time. |
419 | let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) }; |
420 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
421 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
// reads, 16-byte aligned, and that there are no concurrent accesses of a different
// kind (e.g., non-atomic access).
423 | let (prev, ok) = unsafe { |
424 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
425 | if detect::detect().has_cmpxchg16b() { |
426 | cmpxchg16b |
427 | } else { |
428 | // Use SeqCst because cmpxchg16b is always SeqCst. |
429 | fallback::atomic_compare_exchange_seqcst |
430 | } |
431 | }) |
432 | }; |
433 | if ok { |
434 | Ok(prev) |
435 | } else { |
436 | Err(prev) |
437 | } |
438 | } |
439 | |
440 | // cmpxchg16b is always strong. |
441 | use self::atomic_compare_exchange as atomic_compare_exchange_weak; |
442 | |
443 | // See cmpxchg16b() for target_feature(enable). |
444 | #[cfg_attr ( |
445 | not(portable_atomic_no_cmpxchg16b_target_feature), |
446 | target_feature(enable = "cmpxchg16b" ) |
447 | )] |
448 | #[inline ] |
449 | unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 { |
450 | debug_assert!(dst as usize % 16 == 0); |
451 | debug_assert_cmpxchg16b!(); |
452 | |
453 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
454 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
455 | // cfg guarantees that the CPU supports CMPXCHG16B. |
456 | // |
457 | // See cmpxchg16b function for more. |
458 | // |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing/comparing the condition flags and to reduce the uses of
// xchg/mov needed to handle rbx.
461 | // |
462 | // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. |
463 | unsafe { |
464 | // cmpxchg16b is always SeqCst. |
465 | let val = U128 { whole: val }; |
466 | let (mut prev_lo, mut prev_hi); |
467 | macro_rules! cmpxchg16b { |
468 | ($rdi:tt) => { |
469 | asm!( |
470 | "xchg {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
// These are not single-copy atomic reads, but that is ok because the subsequent
// CAS will check for consistency.
473 | // |
474 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
475 | // |
476 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
477 | // so we must use inline assembly to implement this. |
478 | // (i.e., byte-wise atomic based on the standard library's atomic types |
479 | // cannot be used here). |
480 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
481 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
482 | "2:" , |
483 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
484 | "jne 2b" , |
485 | "mov rbx, {rbx_tmp}" , // restore rbx |
486 | rbx_tmp = inout(reg) val.pair.lo => _, |
487 | in("rcx" ) val.pair.hi, |
488 | out("rax" ) prev_lo, |
489 | out("rdx" ) prev_hi, |
490 | in($rdi) dst, |
491 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
492 | options(nostack), |
493 | ) |
494 | }; |
495 | } |
496 | #[cfg (target_pointer_width = "32" )] |
497 | cmpxchg16b!("edi" ); |
498 | #[cfg (target_pointer_width = "64" )] |
499 | cmpxchg16b!("rdi" ); |
500 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
501 | } |
502 | } |
503 | |
504 | /// Atomic RMW by CAS loop (3 arguments) |
505 | /// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;` |
506 | /// |
507 | /// `$op` can use the following registers: |
508 | /// - rsi/r8 pair: val argument (read-only for `$op`) |
509 | /// - rax/rdx pair: previous value loaded (read-only for `$op`) |
510 | /// - rbx/rcx pair: new value that will be stored |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing/comparing the condition flags and to reduce the uses of
// xchg/mov needed to handle rbx.
513 | macro_rules! atomic_rmw_cas_3 { |
514 | ($name:ident, $($op:tt)*) => { |
515 | // See cmpxchg16b() for target_feature(enable). |
516 | #[cfg_attr( |
517 | not(portable_atomic_no_cmpxchg16b_target_feature), |
518 | target_feature(enable = "cmpxchg16b" ) |
519 | )] |
520 | #[inline] |
521 | unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { |
522 | debug_assert!(dst as usize % 16 == 0); |
523 | debug_assert_cmpxchg16b!(); |
524 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
525 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
526 | // cfg guarantees that the CPU supports CMPXCHG16B. |
527 | // |
528 | // See cmpxchg16b function for more. |
529 | unsafe { |
530 | // cmpxchg16b is always SeqCst. |
531 | let val = U128 { whole: val }; |
532 | let (mut prev_lo, mut prev_hi); |
533 | macro_rules! cmpxchg16b { |
534 | ($rdi:tt) => { |
535 | asm!( |
536 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
// These are not single-copy atomic reads, but that is ok because the subsequent
// CAS will check for consistency.
539 | // |
540 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
541 | // |
542 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
543 | // so we must use inline assembly to implement this. |
544 | // (i.e., byte-wise atomic based on the standard library's atomic types |
545 | // cannot be used here). |
546 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
547 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
548 | "2:" , |
549 | $($op)* |
550 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
551 | "jne 2b" , |
552 | "mov rbx, {rbx_tmp}" , // restore rbx |
553 | rbx_tmp = out(reg) _, |
554 | out("rcx" ) _, |
555 | out("rax" ) prev_lo, |
556 | out("rdx" ) prev_hi, |
557 | in($rdi) dst, |
558 | in("rsi" ) val.pair.lo, |
559 | in("r8" ) val.pair.hi, |
560 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
561 | options(nostack), |
562 | ) |
563 | }; |
564 | } |
565 | #[cfg(target_pointer_width = "32" )] |
566 | cmpxchg16b!("edi" ); |
567 | #[cfg(target_pointer_width = "64" )] |
568 | cmpxchg16b!("rdi" ); |
569 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
570 | } |
571 | } |
572 | }; |
573 | } |
574 | /// Atomic RMW by CAS loop (2 arguments) |
575 | /// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;` |
576 | /// |
577 | /// `$op` can use the following registers: |
578 | /// - rax/rdx pair: previous value loaded (read-only for `$op`) |
579 | /// - rbx/rcx pair: new value that will be stored |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing the condition flags and to avoid using xchg to handle rbx.
582 | macro_rules! atomic_rmw_cas_2 { |
583 | ($name:ident, $($op:tt)*) => { |
584 | // See cmpxchg16b() for target_feature(enable). |
585 | #[cfg_attr( |
586 | not(portable_atomic_no_cmpxchg16b_target_feature), |
587 | target_feature(enable = "cmpxchg16b" ) |
588 | )] |
589 | #[inline] |
590 | unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { |
591 | debug_assert!(dst as usize % 16 == 0); |
592 | debug_assert_cmpxchg16b!(); |
593 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
594 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
595 | // cfg guarantees that the CPU supports CMPXCHG16B. |
596 | // |
597 | // See cmpxchg16b function for more. |
598 | unsafe { |
599 | // cmpxchg16b is always SeqCst. |
600 | let (mut prev_lo, mut prev_hi); |
601 | macro_rules! cmpxchg16b { |
602 | ($rdi:tt) => { |
603 | asm!( |
604 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
// These are not single-copy atomic reads, but that is ok because the subsequent
// CAS will check for consistency.
607 | // |
608 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
609 | // |
610 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
611 | // so we must use inline assembly to implement this. |
612 | // (i.e., byte-wise atomic based on the standard library's atomic types |
613 | // cannot be used here). |
614 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
615 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
616 | "2:" , |
617 | $($op)* |
618 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
619 | "jne 2b" , |
620 | "mov rbx, {rbx_tmp}" , // restore rbx |
621 | rbx_tmp = out(reg) _, |
622 | out("rcx" ) _, |
623 | out("rax" ) prev_lo, |
624 | out("rdx" ) prev_hi, |
625 | in($rdi) dst, |
626 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
627 | options(nostack), |
628 | ) |
629 | }; |
630 | } |
631 | #[cfg(target_pointer_width = "32" )] |
632 | cmpxchg16b!("edi" ); |
633 | #[cfg(target_pointer_width = "64" )] |
634 | cmpxchg16b!("rdi" ); |
635 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
636 | } |
637 | } |
638 | }; |
639 | } |
640 | |
641 | atomic_rmw_cas_3! { |
642 | atomic_add_cmpxchg16b, |
643 | "mov rbx, rax" , |
644 | "add rbx, rsi" , |
645 | "mov rcx, rdx" , |
646 | "adc rcx, r8" , |
647 | } |
648 | atomic_rmw_cas_3! { |
649 | atomic_sub_cmpxchg16b, |
650 | "mov rbx, rax" , |
651 | "sub rbx, rsi" , |
652 | "mov rcx, rdx" , |
653 | "sbb rcx, r8" , |
654 | } |
655 | atomic_rmw_cas_3! { |
656 | atomic_and_cmpxchg16b, |
657 | "mov rbx, rax" , |
658 | "and rbx, rsi" , |
659 | "mov rcx, rdx" , |
660 | "and rcx, r8" , |
661 | } |
662 | atomic_rmw_cas_3! { |
663 | atomic_nand_cmpxchg16b, |
664 | "mov rbx, rax" , |
665 | "and rbx, rsi" , |
666 | "not rbx" , |
667 | "mov rcx, rdx" , |
668 | "and rcx, r8" , |
669 | "not rcx" , |
670 | } |
671 | atomic_rmw_cas_3! { |
672 | atomic_or_cmpxchg16b, |
673 | "mov rbx, rax" , |
674 | "or rbx, rsi" , |
675 | "mov rcx, rdx" , |
676 | "or rcx, r8" , |
677 | } |
678 | atomic_rmw_cas_3! { |
679 | atomic_xor_cmpxchg16b, |
680 | "mov rbx, rax" , |
681 | "xor rbx, rsi" , |
682 | "mov rcx, rdx" , |
683 | "xor rcx, r8" , |
684 | } |
685 | |
686 | atomic_rmw_cas_2! { |
687 | atomic_not_cmpxchg16b, |
688 | "mov rbx, rax" , |
689 | "not rbx" , |
690 | "mov rcx, rdx" , |
691 | "not rcx" , |
692 | } |
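// The negation below computes the 128-bit two's complement in two halves:
// `neg rbx` negates the low half and sets the carry flag iff the low half is non-zero,
// then `mov rcx, 0` / `sbb rcx, rdx` computes 0 - high - borrow for the high half
// (mov does not modify flags, so the carry from `neg` is preserved).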
693 | atomic_rmw_cas_2! { |
694 | atomic_neg_cmpxchg16b, |
695 | "mov rbx, rax" , |
696 | "neg rbx" , |
697 | "mov rcx, 0" , |
698 | "sbb rcx, rdx" , |
699 | } |
700 | |
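// The min/max sequences below use a common technique for the 128-bit comparison: the
// `cmp rsi, rax` / `mov rcx, r8` / `sbb rcx, rdx` pair performs the 128-bit
// subtraction val - prev, discarding the result but keeping the flags, and the
// following cmov instructions then select either val or prev (signed for
// cmovl/cmovge, unsigned for cmovb/cmovae) as the value to be stored.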
701 | atomic_rmw_cas_3! { |
702 | atomic_max_cmpxchg16b, |
703 | "cmp rsi, rax" , |
704 | "mov rcx, r8" , |
705 | "sbb rcx, rdx" , |
706 | "mov rcx, r8" , |
707 | "cmovl rcx, rdx" , |
708 | "mov rbx, rsi" , |
709 | "cmovl rbx, rax" , |
710 | } |
711 | atomic_rmw_cas_3! { |
712 | atomic_umax_cmpxchg16b, |
713 | "cmp rsi, rax" , |
714 | "mov rcx, r8" , |
715 | "sbb rcx, rdx" , |
716 | "mov rcx, r8" , |
717 | "cmovb rcx, rdx" , |
718 | "mov rbx, rsi" , |
719 | "cmovb rbx, rax" , |
720 | } |
721 | atomic_rmw_cas_3! { |
722 | atomic_min_cmpxchg16b, |
723 | "cmp rsi, rax" , |
724 | "mov rcx, r8" , |
725 | "sbb rcx, rdx" , |
726 | "mov rcx, r8" , |
727 | "cmovge rcx, rdx" , |
728 | "mov rbx, rsi" , |
729 | "cmovge rbx, rax" , |
730 | } |
731 | atomic_rmw_cas_3! { |
732 | atomic_umin_cmpxchg16b, |
733 | "cmp rsi, rax" , |
734 | "mov rcx, r8" , |
735 | "sbb rcx, rdx" , |
736 | "mov rcx, r8" , |
737 | "cmovae rcx, rdx" , |
738 | "mov rbx, rsi" , |
739 | "cmovae rbx, rax" , |
740 | } |
741 | |
742 | macro_rules! select_atomic_rmw { |
743 | ( |
744 | unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?; |
745 | cmpxchg16b = $cmpxchg16b_fn:ident; |
746 | fallback = $seqcst_fallback_fn:ident; |
747 | ) => { |
748 | // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn. |
749 | #[cfg(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
750 | use self::$cmpxchg16b_fn as $name; |
751 | // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available. |
752 | #[cfg(not(any( |
753 | target_feature = "cmpxchg16b" , |
754 | portable_atomic_target_feature = "cmpxchg16b" , |
755 | )))] |
756 | #[inline] |
757 | unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? { |
758 | fn_alias! { |
759 | // See cmpxchg16b() for target_feature(enable). |
760 | #[cfg_attr( |
761 | not(portable_atomic_no_cmpxchg16b_target_feature), |
762 | target_feature(enable = "cmpxchg16b" ) |
763 | )] |
764 | unsafe fn($($arg)*) $(-> $ret_ty)?; |
765 | // cmpxchg16b is always SeqCst. |
766 | cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst); |
767 | } |
768 | // SAFETY: the caller must uphold the safety contract. |
// We only call cmpxchg16b_fn if cmpxchg16b is available.
770 | unsafe { |
771 | ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? { |
772 | if detect::detect().has_cmpxchg16b() { |
773 | cmpxchg16b_seqcst_fn |
774 | } else { |
775 | // Use SeqCst because cmpxchg16b is always SeqCst. |
776 | fallback::$seqcst_fallback_fn |
777 | } |
778 | }) |
779 | } |
780 | } |
781 | }; |
782 | } |
783 | |
784 | select_atomic_rmw! { |
785 | unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128; |
786 | cmpxchg16b = atomic_swap_cmpxchg16b; |
787 | fallback = atomic_swap_seqcst; |
788 | } |
789 | select_atomic_rmw! { |
790 | unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128; |
791 | cmpxchg16b = atomic_add_cmpxchg16b; |
792 | fallback = atomic_add_seqcst; |
793 | } |
794 | select_atomic_rmw! { |
795 | unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128; |
796 | cmpxchg16b = atomic_sub_cmpxchg16b; |
797 | fallback = atomic_sub_seqcst; |
798 | } |
799 | select_atomic_rmw! { |
800 | unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128; |
801 | cmpxchg16b = atomic_and_cmpxchg16b; |
802 | fallback = atomic_and_seqcst; |
803 | } |
804 | select_atomic_rmw! { |
805 | unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128; |
806 | cmpxchg16b = atomic_nand_cmpxchg16b; |
807 | fallback = atomic_nand_seqcst; |
808 | } |
809 | select_atomic_rmw! { |
810 | unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128; |
811 | cmpxchg16b = atomic_or_cmpxchg16b; |
812 | fallback = atomic_or_seqcst; |
813 | } |
814 | select_atomic_rmw! { |
815 | unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128; |
816 | cmpxchg16b = atomic_xor_cmpxchg16b; |
817 | fallback = atomic_xor_seqcst; |
818 | } |
819 | select_atomic_rmw! { |
820 | unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128; |
821 | cmpxchg16b = atomic_max_cmpxchg16b; |
822 | fallback = atomic_max_seqcst; |
823 | } |
824 | select_atomic_rmw! { |
825 | unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128; |
826 | cmpxchg16b = atomic_umax_cmpxchg16b; |
827 | fallback = atomic_umax_seqcst; |
828 | } |
829 | select_atomic_rmw! { |
830 | unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128; |
831 | cmpxchg16b = atomic_min_cmpxchg16b; |
832 | fallback = atomic_min_seqcst; |
833 | } |
834 | select_atomic_rmw! { |
835 | unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128; |
836 | cmpxchg16b = atomic_umin_cmpxchg16b; |
837 | fallback = atomic_umin_seqcst; |
838 | } |
839 | select_atomic_rmw! { |
840 | unsafe fn atomic_not(dst: *mut u128) -> u128; |
841 | cmpxchg16b = atomic_not_cmpxchg16b; |
842 | fallback = atomic_not_seqcst; |
843 | } |
844 | select_atomic_rmw! { |
845 | unsafe fn atomic_neg(dst: *mut u128) -> u128; |
846 | cmpxchg16b = atomic_neg_cmpxchg16b; |
847 | fallback = atomic_neg_seqcst; |
848 | } |
849 | |
850 | #[inline ] |
851 | fn is_lock_free() -> bool { |
852 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
853 | { |
854 | // CMPXCHG16B is available at compile-time. |
855 | true |
856 | } |
857 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
858 | { |
859 | detect::detect().has_cmpxchg16b() |
860 | } |
861 | } |
862 | const IS_ALWAYS_LOCK_FREE: bool = |
863 | cfg!(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )); |
864 | |
865 | atomic128!(AtomicI128, i128, atomic_max, atomic_min); |
866 | atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); |
867 | |
868 | #[allow (clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)] |
869 | #[cfg (test)] |
870 | mod tests { |
871 | use super::*; |
872 | |
873 | test_atomic_int!(i128); |
874 | test_atomic_int!(u128); |
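
    // A minimal sanity check (illustrative sketch): if CMPXCHG16B is statically known
    // to be available, run-time detection must agree that the operations are lock-free.
    #[test]
    fn always_lock_free_implies_lock_free() {
        if IS_ALWAYS_LOCK_FREE {
            assert!(is_lock_free());
        }
    }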
875 | |
876 | // load/store/swap implementation is not affected by signedness, so it is |
877 | // enough to test only unsigned types. |
878 | stress_test!(u128); |
879 | } |
880 | |