1 | // SPDX-License-Identifier: Apache-2.0 OR MIT |
2 | |
3 | /* |
4 | 128-bit atomic implementation on x86_64 using CMPXCHG16B (DWCAS). |
5 | |
Note: On Miri and ThreadSanitizer, which do not support inline assembly, this module
is not used; intrinsics.rs is used instead.
8 | |
9 | Refs: |
10 | - x86 and amd64 instruction reference https://www.felixcloutier.com/x86 |
11 | - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit |
12 | |
13 | Generated asm: |
14 | - x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51 |
15 | */ |
16 | |
// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm
18 | |
include!("macros.rs");
20 | |
21 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
22 | #[path = "../fallback/outline_atomics.rs" ] |
23 | mod fallback; |
24 | |
25 | #[cfg (not(portable_atomic_no_outline_atomics))] |
26 | #[cfg (not(target_env = "sgx" ))] |
27 | #[cfg_attr ( |
28 | not(target_feature = "sse" ), |
29 | cfg(not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))) |
30 | )] |
31 | #[path = "../detect/x86_64.rs" ] |
32 | mod detect; |
33 | |
34 | #[cfg (not(portable_atomic_no_asm))] |
35 | use core::arch::asm; |
36 | use core::sync::atomic::Ordering; |
37 | |
38 | use crate::utils::{Pair, U128}; |
39 | |
40 | // Asserts that the function is called in the correct context. |
41 | macro_rules! debug_assert_cmpxchg16b { |
42 | () => { |
43 | #[cfg(not(any( |
44 | target_feature = "cmpxchg16b" , |
45 | portable_atomic_target_feature = "cmpxchg16b" , |
46 | )))] |
47 | { |
48 | debug_assert!(detect::detect().has_cmpxchg16b()); |
49 | } |
50 | }; |
51 | } |
52 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
53 | #[cfg (target_feature = "sse" )] |
54 | macro_rules! debug_assert_vmovdqa_atomic { |
55 | () => {{ |
56 | debug_assert_cmpxchg16b!(); |
57 | debug_assert!(detect::detect().has_vmovdqa_atomic()); |
58 | }}; |
59 | } |
60 | |
61 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
62 | #[cfg (target_feature = "sse" )] |
63 | #[cfg (target_pointer_width = "32" )] |
64 | macro_rules! ptr_modifier { |
65 | () => { |
66 | ":e" |
67 | }; |
68 | } |
69 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
70 | #[cfg (target_feature = "sse" )] |
71 | #[cfg (target_pointer_width = "64" )] |
72 | macro_rules! ptr_modifier { |
73 | () => { |
74 | "" |
75 | }; |
76 | } |
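// For illustration: with a pointer bound to an asm operand named `src`, the templates
// built via ptr_modifier!() expand as follows (the concrete registers shown are only
// examples):
//
//   64-bit pointers:        "vmovdqa {out}, xmmword ptr [{src}]"   // e.g. [rdi]
//   x32 (32-bit pointers):  "vmovdqa {out}, xmmword ptr [{src:e}]" // e.g. [edi]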
77 | |
// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements against the currently enabled target features. In fact, x86
// assembly has no directive for such a check, unlike Arm's .arch_extension,
// RISC-V's .option arch, PowerPC's .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend that checks for it is added in the future, or an option
// to check for it is added to the assembler.
85 | #[cfg_attr ( |
86 | not(portable_atomic_no_cmpxchg16b_target_feature), |
87 | target_feature(enable = "cmpxchg16b" ) |
88 | )] |
89 | #[inline ] |
90 | unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
91 | debug_assert!(dst as usize % 16 == 0); |
92 | debug_assert_cmpxchg16b!(); |
93 | |
94 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
95 | // reads, 16-byte aligned (required by CMPXCHG16B), that there are no |
96 | // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B. |
97 | // |
// If the value at `dst` (destination operand) and rdx:rax are equal, the
// 128-bit value in rcx:rbx is stored to `dst`; otherwise, the value at
// `dst` is loaded into rdx:rax.
101 | // |
102 | // The ZF flag is set if the value at `dst` and rdx:rax are equal, |
103 | // otherwise it is cleared. Other flags are unaffected. |
104 | // |
105 | // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b |
106 | unsafe { |
107 | // cmpxchg16b is always SeqCst. |
108 | let r: u8; |
109 | let old = U128 { whole: old }; |
110 | let new = U128 { whole: new }; |
111 | let (prev_lo, prev_hi); |
112 | macro_rules! cmpxchg16b { |
113 | ($rdi:tt) => { |
114 | asm!( |
115 | "xchg {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
116 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
117 | "sete cl" , |
118 | "mov rbx, {rbx_tmp}" , // restore rbx |
119 | rbx_tmp = inout(reg) new.pair.lo => _, |
120 | in("rcx" ) new.pair.hi, |
121 | inout("rax" ) old.pair.lo => prev_lo, |
122 | inout("rdx" ) old.pair.hi => prev_hi, |
123 | in($rdi) dst, |
124 | lateout("cl" ) r, |
125 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
126 | options(nostack), |
127 | ) |
128 | }; |
129 | } |
130 | #[cfg (target_pointer_width = "32" )] |
131 | cmpxchg16b!("edi" ); |
132 | #[cfg (target_pointer_width = "64" )] |
133 | cmpxchg16b!("rdi" ); |
134 | crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test |
135 | (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0) |
136 | } |
137 | } |
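// A minimal, test-only sketch of the step that `lock cmpxchg16b` performs as a single
// atomic operation in the function above; `cmpxchg16b_model` is a hypothetical name
// used only for illustration and is not part of this module's API.
#[cfg(test)]
#[allow(dead_code)]
fn cmpxchg16b_model(dst: &mut u128, old: u128, new: u128) -> (u128, bool) {
    let prev = *dst; // the destination is read (conceptually into rdx:rax)
    if prev == old {
        *dst = new; // equal: the rcx:rbx value is stored to the destination, ZF is set
        (prev, true)
    } else {
        (prev, false) // not equal: ZF is cleared and prev is left in rdx:rax
    }
}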
138 | |
139 | // VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX. |
140 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details. |
141 | // |
142 | // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 |
143 | // |
144 | // Use cfg(target_feature = "sse") here -- SSE is included in the x86_64 |
145 | // baseline and is always available, but the SSE target feature is disabled for |
146 | // use cases such as kernels and firmware that should not use vector registers. |
// So, do not use vector registers unless the SSE target feature is enabled.
148 | // See also https://github.com/rust-lang/rust/blob/1.80.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md. |
149 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
150 | #[cfg (target_feature = "sse" )] |
151 | #[target_feature (enable = "avx" )] |
152 | #[inline ] |
153 | unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 { |
154 | debug_assert!(src as usize % 16 == 0); |
155 | debug_assert_vmovdqa_atomic!(); |
156 | |
157 | // SAFETY: the caller must uphold the safety contract. |
158 | // |
159 | // atomic load by vmovdqa is always SeqCst. |
160 | unsafe { |
161 | let out: core::arch::x86_64::__m128i; |
162 | asm!( |
163 | concat!("vmovdqa {out}, xmmword ptr [{src" , ptr_modifier!( ) , "}]" ), |
164 | src = in(reg) src, |
165 | out = out(xmm_reg) out, |
166 | options(nostack, preserves_flags), |
167 | ); |
core::mem::transmute(out)
169 | } |
170 | } |
171 | #[cfg (not(any(portable_atomic_no_outline_atomics, target_env = "sgx" )))] |
172 | #[cfg (target_feature = "sse" )] |
173 | #[target_feature (enable = "avx" )] |
174 | #[inline ] |
175 | unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) { |
176 | debug_assert!(dst as usize % 16 == 0); |
177 | debug_assert_vmovdqa_atomic!(); |
178 | |
179 | // SAFETY: the caller must uphold the safety contract. |
180 | unsafe { |
181 | let val: core::arch::x86_64::__m128i = core::mem::transmute(val); |
182 | match order { |
183 | // Relaxed and Release stores are equivalent. |
184 | Ordering::Relaxed | Ordering::Release => { |
185 | asm!( |
186 | concat!("vmovdqa xmmword ptr [{dst" , ptr_modifier!( ) , "}], {val}" ), |
187 | dst = in(reg) dst, |
188 | val = in(xmm_reg) val, |
189 | options(nostack, preserves_flags), |
190 | ); |
191 | } |
192 | Ordering::SeqCst => { |
193 | let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit()); |
194 | asm!( |
195 | concat!("vmovdqa xmmword ptr [{dst" , ptr_modifier!( ) , "}], {val}" ), |
196 | // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases. |
197 | // - https://github.com/taiki-e/portable-atomic/pull/156 |
198 | // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc |
199 | // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier |
200 | // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740 |
201 | // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22 |
202 | concat!("xchg qword ptr [{p" , ptr_modifier!( ) , "}], {tmp}" ), |
203 | dst = in(reg) dst, |
204 | val = in(xmm_reg) val, |
205 | p = inout(reg) p.get() => _, |
206 | tmp = lateout(reg) _, |
207 | options(nostack, preserves_flags), |
208 | ); |
209 | } |
210 | _ => unreachable!(), |
211 | } |
212 | } |
213 | } |
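// For reference only (a sketch, not used by this module): the more obvious SeqCst
// sequence would be a plain store followed by a full fence, i.e.
//
//     vmovdqa xmmword ptr [dst], val
//     mfence
//
// The xchg-to-stack sequence used in atomic_store_vmovdqa above was measured to be
// faster; see the links in that function for details.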
214 | |
215 | #[cfg (not(all( |
216 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
217 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
218 | )))] |
219 | macro_rules! load_store_detect { |
220 | ( |
221 | vmovdqa = $vmovdqa:ident |
222 | cmpxchg16b = $cmpxchg16b:ident |
223 | fallback = $fallback:ident |
224 | ) => {{ |
225 | let cpuid = detect::detect(); |
226 | #[cfg(not(any( |
227 | target_feature = "cmpxchg16b" , |
228 | portable_atomic_target_feature = "cmpxchg16b" , |
229 | )))] |
230 | { |
231 | // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access. |
232 | if cpuid.has_cmpxchg16b() { |
233 | // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more. |
234 | #[cfg(target_feature = "sse" )] |
235 | { |
236 | if cpuid.has_vmovdqa_atomic() { |
237 | $vmovdqa |
238 | } else { |
239 | $cmpxchg16b |
240 | } |
241 | } |
242 | #[cfg(not(target_feature = "sse" ))] |
243 | { |
244 | $cmpxchg16b |
245 | } |
246 | } else { |
247 | fallback::$fallback |
248 | } |
249 | } |
250 | #[cfg(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
251 | { |
252 | if cpuid.has_vmovdqa_atomic() { |
253 | $vmovdqa |
254 | } else { |
255 | $cmpxchg16b |
256 | } |
257 | } |
258 | }}; |
259 | } |
260 | |
261 | #[inline ] |
262 | unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { |
263 | // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more. |
264 | // SGX doesn't support CPUID. |
265 | #[cfg (all( |
266 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
267 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
268 | ))] |
269 | // SAFETY: the caller must uphold the safety contract. |
270 | // cfg guarantees that CMPXCHG16B is available at compile-time. |
271 | unsafe { |
272 | // cmpxchg16b is always SeqCst. |
273 | atomic_load_cmpxchg16b(src) |
274 | } |
275 | #[cfg (not(all( |
276 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
277 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
278 | )))] |
279 | // SAFETY: the caller must uphold the safety contract. |
280 | unsafe { |
281 | ifunc!(unsafe fn(src: *mut u128) -> u128 { |
282 | load_store_detect! { |
283 | vmovdqa = atomic_load_vmovdqa |
284 | cmpxchg16b = atomic_load_cmpxchg16b |
// Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
286 | fallback = atomic_load_seqcst |
287 | } |
288 | }) |
289 | } |
290 | } |
291 | // See cmpxchg16b() for target_feature(enable). |
292 | #[cfg_attr ( |
293 | not(portable_atomic_no_cmpxchg16b_target_feature), |
294 | target_feature(enable = "cmpxchg16b" ) |
295 | )] |
296 | #[inline ] |
297 | unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 { |
298 | debug_assert!(src as usize % 16 == 0); |
299 | debug_assert_cmpxchg16b!(); |
300 | |
301 | // SAFETY: the caller must guarantee that `src` is valid for both writes and |
302 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
303 | // cfg guarantees that the CPU supports CMPXCHG16B. |
304 | // |
305 | // See cmpxchg16b function for more. |
306 | // |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing the condition flags and to avoid using xchg to handle rbx.
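// Note: implementing a 128-bit load with CMPXCHG16B works because old == new (both
// zero here). If the destination happens to equal zero, zero is stored back (no
// visible change); otherwise the compare fails and the current value is returned in
// rdx:rax. Either way, rdx:rax ends up holding the loaded value.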
309 | unsafe { |
310 | // cmpxchg16b is always SeqCst. |
311 | let (out_lo, out_hi); |
312 | macro_rules! cmpxchg16b { |
313 | ($rdi:tt) => { |
314 | asm!( |
315 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
316 | "xor rbx, rbx" , // zeroed rbx |
317 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
318 | "mov rbx, {rbx_tmp}" , // restore rbx |
// set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
320 | rbx_tmp = out(reg) _, |
321 | in("rcx" ) 0_u64, |
322 | inout("rax" ) 0_u64 => out_lo, |
323 | inout("rdx" ) 0_u64 => out_hi, |
324 | in($rdi) src, |
325 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
326 | options(nostack), |
327 | ) |
328 | }; |
329 | } |
330 | #[cfg (target_pointer_width = "32" )] |
331 | cmpxchg16b!("edi" ); |
332 | #[cfg (target_pointer_width = "64" )] |
333 | cmpxchg16b!("rdi" ); |
334 | U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole |
335 | } |
336 | } |
337 | |
338 | #[inline ] |
339 | unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { |
340 | // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more. |
341 | // SGX doesn't support CPUID. |
342 | #[cfg (all( |
343 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
344 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
345 | ))] |
346 | // SAFETY: the caller must uphold the safety contract. |
347 | // cfg guarantees that CMPXCHG16B is available at compile-time. |
348 | unsafe { |
349 | // cmpxchg16b is always SeqCst. |
350 | let _ = order; |
351 | atomic_store_cmpxchg16b(dst, val); |
352 | } |
353 | #[cfg (not(all( |
354 | any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ), |
355 | any(portable_atomic_no_outline_atomics, target_env = "sgx" , not(target_feature = "sse" )), |
356 | )))] |
357 | // SAFETY: the caller must uphold the safety contract. |
358 | unsafe { |
359 | #[cfg (target_feature = "sse" )] |
360 | fn_alias! { |
361 | #[target_feature (enable = "avx" )] |
362 | unsafe fn(dst: *mut u128, val: u128); |
363 | // atomic store by vmovdqa has at least release semantics. |
364 | atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release); |
365 | atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst); |
366 | } |
367 | match order { |
368 | // Relaxed and Release stores are equivalent in all implementations |
369 | // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback). |
// core::arch's cmpxchg16b is never called here.
371 | Ordering::Relaxed | Ordering::Release => { |
372 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { |
373 | load_store_detect! { |
374 | vmovdqa = atomic_store_vmovdqa_non_seqcst |
375 | cmpxchg16b = atomic_store_cmpxchg16b |
376 | fallback = atomic_store_non_seqcst |
377 | } |
378 | }); |
379 | } |
380 | Ordering::SeqCst => { |
381 | ifunc!(unsafe fn(dst: *mut u128, val: u128) { |
382 | load_store_detect! { |
383 | vmovdqa = atomic_store_vmovdqa_seqcst |
384 | cmpxchg16b = atomic_store_cmpxchg16b |
385 | fallback = atomic_store_seqcst |
386 | } |
387 | }); |
388 | } |
389 | _ => unreachable!(), |
390 | } |
391 | } |
392 | } |
393 | // See cmpxchg16b() for target_feature(enable). |
394 | #[cfg_attr ( |
395 | not(portable_atomic_no_cmpxchg16b_target_feature), |
396 | target_feature(enable = "cmpxchg16b" ) |
397 | )] |
398 | #[inline ] |
399 | unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) { |
400 | // SAFETY: the caller must uphold the safety contract. |
401 | unsafe { |
402 | // cmpxchg16b is always SeqCst. |
atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
404 | } |
405 | } |
406 | |
407 | #[inline ] |
408 | unsafe fn atomic_compare_exchange( |
409 | dst: *mut u128, |
410 | old: u128, |
411 | new: u128, |
412 | _success: Ordering, |
413 | _failure: Ordering, |
414 | ) -> Result<u128, u128> { |
415 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
416 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
417 | // reads, 16-byte aligned, that there are no concurrent non-atomic operations, |
418 | // and cfg guarantees that CMPXCHG16B is available at compile-time. |
419 | let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) }; |
420 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
421 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
// reads, 16-byte aligned, and that there are no concurrent accesses of a different
// kind (e.g., non-atomic access).
423 | let (prev, ok) = unsafe { |
424 | ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { |
425 | if detect::detect().has_cmpxchg16b() { |
426 | cmpxchg16b |
427 | } else { |
428 | // Use SeqCst because cmpxchg16b is always SeqCst. |
429 | fallback::atomic_compare_exchange_seqcst |
430 | } |
431 | }) |
432 | }; |
433 | if ok { |
434 | Ok(prev) |
435 | } else { |
436 | Err(prev) |
437 | } |
438 | } |
439 | |
440 | // cmpxchg16b is always strong. |
441 | use self::atomic_compare_exchange as atomic_compare_exchange_weak; |
442 | |
443 | // See cmpxchg16b() for target_feature(enable). |
444 | #[cfg_attr ( |
445 | not(portable_atomic_no_cmpxchg16b_target_feature), |
446 | target_feature(enable = "cmpxchg16b" ) |
447 | )] |
448 | #[inline ] |
449 | unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 { |
450 | debug_assert!(dst as usize % 16 == 0); |
451 | debug_assert_cmpxchg16b!(); |
452 | |
453 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
454 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
455 | // cfg guarantees that the CPU supports CMPXCHG16B. |
456 | // |
457 | // See cmpxchg16b function for more. |
458 | // |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing/comparing the condition flags and to reduce the uses of
// xchg/mov needed to handle rbx.
461 | // |
462 | // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. |
463 | unsafe { |
464 | // cmpxchg16b is always SeqCst. |
465 | let val = U128 { whole: val }; |
466 | let (mut prev_lo, mut prev_hi); |
467 | macro_rules! cmpxchg16b { |
468 | ($rdi:tt) => { |
469 | asm!( |
470 | "xchg {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
// These are not single-copy atomic reads, but that is ok because the subsequent
// CAS will check for consistency.
473 | // |
474 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
475 | // |
476 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
477 | // so we must use inline assembly to implement this. |
478 | // (i.e., byte-wise atomic based on the standard library's atomic types |
479 | // cannot be used here). |
480 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
481 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
482 | "2:" , |
483 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
484 | "jne 2b" , |
485 | "mov rbx, {rbx_tmp}" , // restore rbx |
486 | rbx_tmp = inout(reg) val.pair.lo => _, |
487 | in("rcx" ) val.pair.hi, |
488 | out("rax" ) prev_lo, |
489 | out("rdx" ) prev_hi, |
490 | in($rdi) dst, |
491 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
492 | options(nostack), |
493 | ) |
494 | }; |
495 | } |
496 | #[cfg (target_pointer_width = "32" )] |
497 | cmpxchg16b!("edi" ); |
498 | #[cfg (target_pointer_width = "64" )] |
499 | cmpxchg16b!("rdi" ); |
500 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
501 | } |
502 | } |
503 | |
504 | /// Atomic RMW by CAS loop (3 arguments) |
505 | /// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;` |
506 | /// |
507 | /// `$op` can use the following registers: |
508 | /// - rsi/r8 pair: val argument (read-only for `$op`) |
509 | /// - rax/rdx pair: previous value loaded (read-only for `$op`) |
510 | /// - rbx/rcx pair: new value that will be stored |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing/comparing the condition flags and to reduce the uses of
// xchg/mov needed to handle rbx.
513 | macro_rules! atomic_rmw_cas_3 { |
514 | ($name:ident, $($op:tt)*) => { |
515 | // See cmpxchg16b() for target_feature(enable). |
516 | #[cfg_attr( |
517 | not(portable_atomic_no_cmpxchg16b_target_feature), |
518 | target_feature(enable = "cmpxchg16b" ) |
519 | )] |
520 | #[inline] |
521 | unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { |
522 | debug_assert!(dst as usize % 16 == 0); |
523 | debug_assert_cmpxchg16b!(); |
524 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
525 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
526 | // cfg guarantees that the CPU supports CMPXCHG16B. |
527 | // |
528 | // See cmpxchg16b function for more. |
529 | unsafe { |
530 | // cmpxchg16b is always SeqCst. |
531 | let val = U128 { whole: val }; |
532 | let (mut prev_lo, mut prev_hi); |
533 | macro_rules! cmpxchg16b { |
534 | ($rdi:tt) => { |
535 | asm!( |
536 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
// These are not single-copy atomic reads, but that is ok because the subsequent
// CAS will check for consistency.
539 | // |
540 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
541 | // |
542 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
543 | // so we must use inline assembly to implement this. |
544 | // (i.e., byte-wise atomic based on the standard library's atomic types |
545 | // cannot be used here). |
546 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
547 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
548 | "2:" , |
549 | $($op)* |
550 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
551 | "jne 2b" , |
552 | "mov rbx, {rbx_tmp}" , // restore rbx |
553 | rbx_tmp = out(reg) _, |
554 | out("rcx" ) _, |
555 | out("rax" ) prev_lo, |
556 | out("rdx" ) prev_hi, |
557 | in($rdi) dst, |
558 | in("rsi" ) val.pair.lo, |
559 | in("r8" ) val.pair.hi, |
560 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
561 | options(nostack), |
562 | ) |
563 | }; |
564 | } |
565 | #[cfg(target_pointer_width = "32" )] |
566 | cmpxchg16b!("edi" ); |
567 | #[cfg(target_pointer_width = "64" )] |
568 | cmpxchg16b!("rdi" ); |
569 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
570 | } |
571 | } |
572 | }; |
573 | } |
574 | /// Atomic RMW by CAS loop (2 arguments) |
575 | /// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;` |
576 | /// |
577 | /// `$op` can use the following registers: |
578 | /// - rax/rdx pair: previous value loaded (read-only for `$op`) |
579 | /// - rbx/rcx pair: new value that will be stored |
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows us to omit storing the condition flags and to avoid using xchg to handle rbx.
582 | macro_rules! atomic_rmw_cas_2 { |
583 | ($name:ident, $($op:tt)*) => { |
584 | // See cmpxchg16b() for target_feature(enable). |
585 | #[cfg_attr( |
586 | not(portable_atomic_no_cmpxchg16b_target_feature), |
587 | target_feature(enable = "cmpxchg16b" ) |
588 | )] |
589 | #[inline] |
590 | unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { |
591 | debug_assert!(dst as usize % 16 == 0); |
592 | debug_assert_cmpxchg16b!(); |
593 | // SAFETY: the caller must guarantee that `dst` is valid for both writes and |
594 | // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. |
595 | // cfg guarantees that the CPU supports CMPXCHG16B. |
596 | // |
597 | // See cmpxchg16b function for more. |
598 | unsafe { |
599 | // cmpxchg16b is always SeqCst. |
600 | let (mut prev_lo, mut prev_hi); |
601 | macro_rules! cmpxchg16b { |
602 | ($rdi:tt) => { |
603 | asm!( |
604 | "mov {rbx_tmp}, rbx" , // save rbx which is reserved by LLVM |
// These are not single-copy atomic reads, but that is ok because the subsequent
// CAS will check for consistency.
607 | // |
608 | // This is based on the code generated for the first load in DW RMWs by LLVM. |
609 | // |
610 | // Note that the C++20 memory model does not allow mixed-sized atomic access, |
611 | // so we must use inline assembly to implement this. |
612 | // (i.e., byte-wise atomic based on the standard library's atomic types |
613 | // cannot be used here). |
614 | concat!("mov rax, qword ptr [" , $rdi, "]" ), |
615 | concat!("mov rdx, qword ptr [" , $rdi, " + 8]" ), |
616 | "2:" , |
617 | $($op)* |
618 | concat!("lock cmpxchg16b xmmword ptr [" , $rdi, "]" ), |
619 | "jne 2b" , |
620 | "mov rbx, {rbx_tmp}" , // restore rbx |
621 | rbx_tmp = out(reg) _, |
622 | out("rcx" ) _, |
623 | out("rax" ) prev_lo, |
624 | out("rdx" ) prev_hi, |
625 | in($rdi) dst, |
626 | // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. |
627 | options(nostack), |
628 | ) |
629 | }; |
630 | } |
631 | #[cfg(target_pointer_width = "32" )] |
632 | cmpxchg16b!("edi" ); |
633 | #[cfg(target_pointer_width = "64" )] |
634 | cmpxchg16b!("rdi" ); |
635 | U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole |
636 | } |
637 | } |
638 | }; |
639 | } |
640 | |
641 | atomic_rmw_cas_3! { |
642 | atomic_add_cmpxchg16b, |
643 | "mov rbx, rax" , |
644 | "add rbx, rsi" , |
645 | "mov rcx, rdx" , |
646 | "adc rcx, r8" , |
647 | } |
648 | atomic_rmw_cas_3! { |
649 | atomic_sub_cmpxchg16b, |
650 | "mov rbx, rax" , |
651 | "sub rbx, rsi" , |
652 | "mov rcx, rdx" , |
653 | "sbb rcx, r8" , |
654 | } |
655 | atomic_rmw_cas_3! { |
656 | atomic_and_cmpxchg16b, |
657 | "mov rbx, rax" , |
658 | "and rbx, rsi" , |
659 | "mov rcx, rdx" , |
660 | "and rcx, r8" , |
661 | } |
662 | atomic_rmw_cas_3! { |
663 | atomic_nand_cmpxchg16b, |
664 | "mov rbx, rax" , |
665 | "and rbx, rsi" , |
666 | "not rbx" , |
667 | "mov rcx, rdx" , |
668 | "and rcx, r8" , |
669 | "not rcx" , |
670 | } |
671 | atomic_rmw_cas_3! { |
672 | atomic_or_cmpxchg16b, |
673 | "mov rbx, rax" , |
674 | "or rbx, rsi" , |
675 | "mov rcx, rdx" , |
676 | "or rcx, r8" , |
677 | } |
678 | atomic_rmw_cas_3! { |
679 | atomic_xor_cmpxchg16b, |
680 | "mov rbx, rax" , |
681 | "xor rbx, rsi" , |
682 | "mov rcx, rdx" , |
683 | "xor rcx, r8" , |
684 | } |
685 | |
686 | atomic_rmw_cas_2! { |
687 | atomic_not_cmpxchg16b, |
688 | "mov rbx, rax" , |
689 | "not rbx" , |
690 | "mov rcx, rdx" , |
691 | "not rcx" , |
692 | } |
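// The negation below computes the 128-bit two's complement in two halves:
// `neg rbx` negates the low half and sets the carry flag iff the low half is non-zero,
// then `mov rcx, 0` / `sbb rcx, rdx` computes 0 - high - borrow for the high half
// (mov does not modify flags, so the carry from `neg` is preserved).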
693 | atomic_rmw_cas_2! { |
694 | atomic_neg_cmpxchg16b, |
695 | "mov rbx, rax" , |
696 | "neg rbx" , |
697 | "mov rcx, 0" , |
698 | "sbb rcx, rdx" , |
699 | } |
700 | |
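// The min/max sequences below use a common technique for the 128-bit comparison: the
// `cmp rsi, rax` / `mov rcx, r8` / `sbb rcx, rdx` pair performs the 128-bit
// subtraction val - prev, discarding the result but keeping the flags, and the
// following cmov instructions then select either val or prev (signed for
// cmovl/cmovge, unsigned for cmovb/cmovae) as the value to be stored.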
701 | atomic_rmw_cas_3! { |
702 | atomic_max_cmpxchg16b, |
703 | "cmp rsi, rax" , |
704 | "mov rcx, r8" , |
705 | "sbb rcx, rdx" , |
706 | "mov rcx, r8" , |
707 | "cmovl rcx, rdx" , |
708 | "mov rbx, rsi" , |
709 | "cmovl rbx, rax" , |
710 | } |
711 | atomic_rmw_cas_3! { |
712 | atomic_umax_cmpxchg16b, |
713 | "cmp rsi, rax" , |
714 | "mov rcx, r8" , |
715 | "sbb rcx, rdx" , |
716 | "mov rcx, r8" , |
717 | "cmovb rcx, rdx" , |
718 | "mov rbx, rsi" , |
719 | "cmovb rbx, rax" , |
720 | } |
721 | atomic_rmw_cas_3! { |
722 | atomic_min_cmpxchg16b, |
723 | "cmp rsi, rax" , |
724 | "mov rcx, r8" , |
725 | "sbb rcx, rdx" , |
726 | "mov rcx, r8" , |
727 | "cmovge rcx, rdx" , |
728 | "mov rbx, rsi" , |
729 | "cmovge rbx, rax" , |
730 | } |
731 | atomic_rmw_cas_3! { |
732 | atomic_umin_cmpxchg16b, |
733 | "cmp rsi, rax" , |
734 | "mov rcx, r8" , |
735 | "sbb rcx, rdx" , |
736 | "mov rcx, r8" , |
737 | "cmovae rcx, rdx" , |
738 | "mov rbx, rsi" , |
739 | "cmovae rbx, rax" , |
740 | } |
741 | |
742 | macro_rules! select_atomic_rmw { |
743 | ( |
744 | unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?; |
745 | cmpxchg16b = $cmpxchg16b_fn:ident; |
746 | fallback = $seqcst_fallback_fn:ident; |
747 | ) => { |
748 | // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn. |
749 | #[cfg(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
750 | use self::$cmpxchg16b_fn as $name; |
751 | // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available. |
752 | #[cfg(not(any( |
753 | target_feature = "cmpxchg16b" , |
754 | portable_atomic_target_feature = "cmpxchg16b" , |
755 | )))] |
756 | #[inline] |
757 | unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? { |
758 | fn_alias! { |
759 | // See cmpxchg16b() for target_feature(enable). |
760 | #[cfg_attr( |
761 | not(portable_atomic_no_cmpxchg16b_target_feature), |
762 | target_feature(enable = "cmpxchg16b" ) |
763 | )] |
764 | unsafe fn($($arg)*) $(-> $ret_ty)?; |
765 | // cmpxchg16b is always SeqCst. |
766 | cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst); |
767 | } |
768 | // SAFETY: the caller must uphold the safety contract. |
// We only call cmpxchg16b_fn if cmpxchg16b is available.
770 | unsafe { |
771 | ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? { |
772 | if detect::detect().has_cmpxchg16b() { |
773 | cmpxchg16b_seqcst_fn |
774 | } else { |
775 | // Use SeqCst because cmpxchg16b is always SeqCst. |
776 | fallback::$seqcst_fallback_fn |
777 | } |
778 | }) |
779 | } |
780 | } |
781 | }; |
782 | } |
783 | |
784 | select_atomic_rmw! { |
785 | unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128; |
786 | cmpxchg16b = atomic_swap_cmpxchg16b; |
787 | fallback = atomic_swap_seqcst; |
788 | } |
789 | select_atomic_rmw! { |
790 | unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128; |
791 | cmpxchg16b = atomic_add_cmpxchg16b; |
792 | fallback = atomic_add_seqcst; |
793 | } |
794 | select_atomic_rmw! { |
795 | unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128; |
796 | cmpxchg16b = atomic_sub_cmpxchg16b; |
797 | fallback = atomic_sub_seqcst; |
798 | } |
799 | select_atomic_rmw! { |
800 | unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128; |
801 | cmpxchg16b = atomic_and_cmpxchg16b; |
802 | fallback = atomic_and_seqcst; |
803 | } |
804 | select_atomic_rmw! { |
805 | unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128; |
806 | cmpxchg16b = atomic_nand_cmpxchg16b; |
807 | fallback = atomic_nand_seqcst; |
808 | } |
809 | select_atomic_rmw! { |
810 | unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128; |
811 | cmpxchg16b = atomic_or_cmpxchg16b; |
812 | fallback = atomic_or_seqcst; |
813 | } |
814 | select_atomic_rmw! { |
815 | unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128; |
816 | cmpxchg16b = atomic_xor_cmpxchg16b; |
817 | fallback = atomic_xor_seqcst; |
818 | } |
819 | select_atomic_rmw! { |
820 | unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128; |
821 | cmpxchg16b = atomic_max_cmpxchg16b; |
822 | fallback = atomic_max_seqcst; |
823 | } |
824 | select_atomic_rmw! { |
825 | unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128; |
826 | cmpxchg16b = atomic_umax_cmpxchg16b; |
827 | fallback = atomic_umax_seqcst; |
828 | } |
829 | select_atomic_rmw! { |
830 | unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128; |
831 | cmpxchg16b = atomic_min_cmpxchg16b; |
832 | fallback = atomic_min_seqcst; |
833 | } |
834 | select_atomic_rmw! { |
835 | unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128; |
836 | cmpxchg16b = atomic_umin_cmpxchg16b; |
837 | fallback = atomic_umin_seqcst; |
838 | } |
839 | select_atomic_rmw! { |
840 | unsafe fn atomic_not(dst: *mut u128) -> u128; |
841 | cmpxchg16b = atomic_not_cmpxchg16b; |
842 | fallback = atomic_not_seqcst; |
843 | } |
844 | select_atomic_rmw! { |
845 | unsafe fn atomic_neg(dst: *mut u128) -> u128; |
846 | cmpxchg16b = atomic_neg_cmpxchg16b; |
847 | fallback = atomic_neg_seqcst; |
848 | } |
849 | |
850 | #[inline ] |
851 | fn is_lock_free() -> bool { |
852 | #[cfg (any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" ))] |
853 | { |
854 | // CMPXCHG16B is available at compile-time. |
855 | true |
856 | } |
857 | #[cfg (not(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )))] |
858 | { |
859 | detect::detect().has_cmpxchg16b() |
860 | } |
861 | } |
862 | const IS_ALWAYS_LOCK_FREE: bool = |
863 | cfg!(any(target_feature = "cmpxchg16b" , portable_atomic_target_feature = "cmpxchg16b" )); |
864 | |
865 | atomic128!(AtomicI128, i128, atomic_max, atomic_min); |
866 | atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); |
867 | |
868 | #[allow (clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)] |
869 | #[cfg (test)] |
870 | mod tests { |
871 | use super::*; |
872 | |
873 | test_atomic_int!(i128); |
874 | test_atomic_int!(u128); |
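
    // A minimal sanity check (illustrative sketch): if CMPXCHG16B is statically known
    // to be available, run-time detection must agree that the operations are lock-free.
    #[test]
    fn always_lock_free_implies_lock_free() {
        if IS_ALWAYS_LOCK_FREE {
            assert!(is_lock_free());
        }
    }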
875 | |
876 | // load/store/swap implementation is not affected by signedness, so it is |
877 | // enough to test only unsigned types. |
878 | stress_test!(u128); |
879 | } |
880 | |