// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
128-bit atomic implementation on x86_64.

This architecture provides the following 128-bit atomic instructions:

- CMPXCHG16B: CAS (requires the CMPXCHG16B CPU feature)
- VMOVDQA: load/store (Intel, AMD, or Zhaoxin CPU with AVX)

Note: On Miri and ThreadSanitizer, which do not support inline assembly, we don't use
this module and use intrinsics.rs instead.

Refs:
- x86 and amd64 instruction reference https://www.felixcloutier.com/x86
- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit

Generated asm:
- x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51
*/

// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm
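//
// A minimal sketch of what that could look like (not used here; assuming the
// intrinsic's `cmpxchg16b(dst, old, new, success, failure) -> u128` signature,
// and that it may only be called when the cmpxchg16b target feature is
// enabled):
//
//     unsafe fn cas(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
//         // SAFETY: the caller must uphold cmpxchg16b's safety contract
//         // (valid 16-byte aligned pointer, no concurrent non-atomic access).
//         let prev = unsafe {
//             core::arch::x86_64::cmpxchg16b(dst, old, new, Ordering::SeqCst, Ordering::SeqCst)
//         };
//         (prev, prev == old)
//     }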

include!("macros.rs");

#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
#[path = "../fallback/outline_atomics.rs"]
mod fallback;

#[cfg(not(portable_atomic_no_outline_atomics))]
#[cfg(not(target_env = "sgx"))]
#[cfg_attr(
    not(target_feature = "sse"),
    cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))
)]
#[path = "../detect/x86_64.rs"]
mod detect;

#[cfg(not(portable_atomic_no_asm))]
use core::arch::asm;
use core::sync::atomic::Ordering;

use crate::utils::{Pair, U128};

// Asserts that the function is called in the correct context.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            debug_assert!(detect::detect().has_cmpxchg16b());
        }
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}

#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}
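// Example: with `src = in(reg) src`, the asm template `[{src:e}]` renders the
// 32-bit register name (e.g. `[esi]`) instead of the 64-bit one (`[rsi]`),
// which is what pointer operands need on the x32 ABI
// (target_pointer_width = "32" on x86_64). With 64-bit pointers the modifier
// is empty and the default 64-bit register name is used.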

// Unlike AArch64 and RISC-V, the x86 assembler doesn't check instruction
// requirements against the currently enabled target features; x86 assembly has
// no directive for that purpose in the first place, unlike Arm's
// .arch_extension, RISC-V's .option arch, PowerPC's .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend that checks for it is added in the future, or an option
// to check for it is added to the assembler.
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
    // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
    //
    // If the value at `dst` (destination operand) and rdx:rax are equal, the
    // 128-bit value in rcx:rbx is stored in `dst`; otherwise the value at
    // `dst` is loaded into rdx:rax.
    //
    // The ZF flag is set if the value at `dst` and rdx:rax are equal,
    // otherwise it is cleared. Other flags are unaffected.
    //
    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
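    //
    // Pseudocode of the (locked, hence atomic) instruction semantics:
    //
    //     if [dst] == rdx:rax {
    //         ZF = 1;
    //         [dst] = rcx:rbx;
    //     } else {
    //         ZF = 0;
    //         rdx:rax = [dst];
    //     }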
    unsafe {
        // cmpxchg16b is always SeqCst.
        let r: u8;
        let old = U128 { whole: old };
        let new = U128 { whole: new };
        let (prev_lo, prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "sete cl",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) new.pair.lo => _,
                    in("rcx") new.pair.hi,
                    inout("rax") old.pair.lo => prev_lo,
                    inout("rdx") old.pair.hi => prev_hi,
                    in($rdi) dst,
                    lateout("cl") r,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test
        (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
    }
}

// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
//
// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
//
// Use cfg(target_feature = "sse") here -- SSE is included in the x86_64
// baseline and is always available, but the SSE target feature is disabled for
// use cases such as kernels and firmware that must not use vector registers.
// So, do not use vector registers unless the SSE target feature is enabled.
// See also https://github.com/rust-lang/rust/blob/1.84.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    //
    // atomic load by vmovdqa is always SeqCst.
    unsafe {
        let out: core::arch::x86_64::__m128i;
        asm!(
            concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
            src = in(reg) src,
            out = out(xmm_reg) out,
            options(nostack, preserves_flags),
        );
        core::mem::transmute(out)
    }
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
        match order {
            // Relaxed and Release stores are equivalent: every x86 store
            // already has release semantics.
            Ordering::Relaxed | Ordering::Release => {
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    options(nostack, preserves_flags),
                );
            }
            Ordering::SeqCst => {
                let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                    // - https://github.com/taiki-e/portable-atomic/pull/156
                    // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
                    // - Windows uses xchg for MemoryBarrier on x86_32 https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                    // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
                    // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    p = inout(reg) p.get() => _,
                    tmp = lateout(reg) _,
                    options(nostack, preserves_flags),
                );
            }
            _ => unreachable!(),
        }
    }
}
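// In instruction terms, the SeqCst arm above is the classic "store + full
// barrier" sequence; the xchg on a dead stack slot stands in for mfence:
//
//     vmovdqa xmmword ptr [dst], xmm0
//     mfence                            ; replaced by: xchg qword ptr [p], tmp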

#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() { $vmovdqa } else { $cmpxchg16b }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() { $vmovdqa } else { $cmpxchg16b }
        }
    }};
}

#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `src` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could implement this as a CAS loop via atomic_compare_exchange, but using inline
    // assembly allows omitting the store of condition flags and avoids using xchg to handle rbx.
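    //
    // How a CAS with old = new = 0 implements a load: if the value at `src` is
    // 0, the comparison succeeds and 0 is stored back, leaving the value
    // unchanged (it is still a write cycle, which is why `src` must also be
    // valid for writes); otherwise the comparison fails and the current value
    // is loaded into rdx:rax. Either way, rdx:rax ends up holding the loaded
    // value.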
    unsafe {
        // cmpxchg16b is always SeqCst.
        let (out_lo, out_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    "xor rbx, rbx", // zero rbx
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "mov rbx, {rbx_tmp}", // restore rbx
                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
                    rbx_tmp = out(reg) _,
                    in("rcx") 0_u64,
                    inout("rax") 0_u64 => out_lo,
                    inout("rdx") 0_u64 => out_hi,
                    in($rdi) src,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}

#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
    }
}

#[inline]
unsafe fn atomic_compare_exchange(
    dst: *mut u128,
    old: u128,
    new: u128,
    _success: Ordering,
    _failure: Ordering,
) -> Result<u128, u128> {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and cfg guarantees that CMPXCHG16B is available at compile-time.
    let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
    let (prev, ok) = unsafe {
        ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
            if detect::detect().has_cmpxchg16b() {
                cmpxchg16b
            } else {
                // Use SeqCst because cmpxchg16b is always SeqCst.
                fallback::atomic_compare_exchange_seqcst
            }
        })
    };
    if ok { Ok(prev) } else { Err(prev) }
}

// cmpxchg16b is always strong, and a strong CAS is a valid implementation of a
// weak CAS (a weak CAS is merely also allowed to fail spuriously).
use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could implement this as a CAS loop via atomic_compare_exchange, but using inline
    // assembly allows omitting the store/comparison of condition flags and reduces the
    // uses of xchg/mov needed to handle rbx.
    //
    // Do not use atomic_rmw_cas_3 because it needs an extra MOV to implement swap.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let val = U128 { whole: val };
        let (mut prev_lo, mut prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    // These are not single-copy atomic reads, but this is ok because the
                    // subsequent CAS will check for consistency.
                    //
                    // This is based on the code generated for the first load in DW RMWs by LLVM.
                    //
                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
                    // so we must use inline assembly to implement this.
                    // (i.e., byte-wise atomic access based on the standard library's atomic types
                    // cannot be used here).
                    concat!("mov rax, qword ptr [", $rdi, "]"),
                    concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                    "2:",
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "jne 2b",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) val.pair.lo => _,
                    in("rcx") val.pair.hi,
                    out("rax") prev_lo,
                    out("rdx") prev_hi,
                    in($rdi) dst,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}

/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could implement this as a CAS loop via atomic_compare_exchange, but using inline
// assembly allows omitting the store/comparison of condition flags and reduces the
// uses of xchg/mov needed to handle rbx.
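//
// For example, atomic_add_cmpxchg16b below passes the following as `$op`,
// computing rcx:rbx = rdx:rax + r8:rsi (i.e., new = prev + val):
//
//     mov rbx, rax
//     add rbx, rsi
//     mov rcx, rdx
//     adc rcx, r8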
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but this is ok because the
                            // subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic access based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                            $($op)*
                            concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                            "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could implement this as a CAS loop via atomic_compare_exchange, but using inline
// assembly allows omitting the store of condition flags and avoids using xchg to handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but this is ok because the
                            // subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic access based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                            $($op)*
                            concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                            "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}

atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
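// 128-bit add in two steps: `add` sums the low halves and sets CF on overflow,
// then `adc` sums the high halves plus CF. For example, adding 1 to a value
// whose low 64 bits are all ones wraps the low half to 0 with CF = 1, and adc
// carries that 1 into the high half. atomic_sub below is the same shape, with
// sub/sbb propagating the borrow instead.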
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}

atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}
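// The neg sequence above computes 0 - x over 128 bits: `neg rbx` computes
// 0 - lo and sets CF when lo != 0, then `mov rcx, 0` / `sbb rcx, rdx`
// computes 0 - hi - CF for the high half.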

atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}
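// The pattern shared by the four min/max ops above: `cmp rsi, rax` followed by
// `sbb rcx, rdx` (with rcx preloaded from r8) sets the flags of the 128-bit
// subtraction val - prev while discarding its result (rcx is reloaded from r8
// right after), so the cmovl/cmovb/cmovge/cmovae pair can then select either
// val (r8:rsi) or prev (rdx:rax) into rcx:rbx based on the signed or unsigned
// comparison.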

macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}

select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}

#[inline]
fn is_lock_free() -> bool {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    {
        // CMPXCHG16B is available at compile-time.
        true
    }
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    {
        detect::detect().has_cmpxchg16b()
    }
}
const IS_ALWAYS_LOCK_FREE: bool =
    cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}
