1// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
2#![allow(unused_unsafe)]
3// The functions are complex with many branches, and explicit
4// `return`s makes it clear where function exit points are
5#![allow(clippy::needless_return)]
6#![allow(clippy::comparison_chain)]
7// Clippy is confused by the complex configuration
8#![allow(clippy::if_same_then_else)]
9#![allow(clippy::needless_bool)]
10
11//! This `specialized_div_rem` module is originally from version 1.0.0 of the
12//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
13//! module, since unoptimized compilation may generate references to `memcpy`.
14//!
//! The purpose of these macros is to easily change both the division algorithm used
16//! for a given integer size and the half division used by that algorithm. The way
17//! functions call each other is also constructed such that linkers will find the chain of
18//! software and hardware divisions needed for every size of signed and unsigned division.
19//! For example, most target compilations do the following:
20//!
21//! - Many 128 bit division functions like `u128::wrapping_div` use
22//! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
23//! is not a 128 bit by 128 bit hardware division function in most architectures.
//! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//! one function to calculate both the quotient and remainder). If configuration flags
27//! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
28//! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
29//! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
30//! reduced to those instructions. Note that we do not specify the half size division
31//! directly to be `__udivdi3`, because hardware division would never be introduced.
32//! - If the architecture does not supply a 64 bit hardware division instruction, u64
33//! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
34//! which is defined by `impl_delegate!`. The half division for this algorithm is
35//! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
36//! software division algorithms.
37//! - If the architecture does not supply a 32 bit hardware instruction, linkers will
38//! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
39//! division, so the chain of calls ends here.
40//!
41//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
42//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
43//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
44
45// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
46// ridiculously complex
47
48#[macro_use]
49mod norm_shift;
50
51#[macro_use]
52mod binary_long;
53
54#[macro_use]
55mod delegate;
56
57// used on SPARC
58#[allow(unused_imports)]
59#[cfg(not(feature = "public-test-deps"))]
60pub(crate) use self::delegate::u128_divide_sparc;
61
62#[cfg(feature = "public-test-deps")]
63pub use self::delegate::u128_divide_sparc;
64
65#[macro_use]
66mod trifecta;
67
68#[macro_use]
69mod asymmetric;
70
/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    // Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be used
    // here because it involves non-`inline` functions
    // (https://github.com/rust-lang/compiler-builtins/issues/491).
    //
    // SAFETY: per the contracts of the division functions in this module, reaching this
    // point (a division by zero) is already undefined behavior for the caller, so telling
    // the compiler this point is unreachable introduces no new unsoundness.
    unsafe { core::intrinsics::unreachable() }
}
80
/// Whether the target has a usable count-leading-zeros instruction. This selects which
/// normalization-shift implementation the division macros below will generate.
const USE_LZ: bool = {
    if cfg!(all(target_arch = "arm", target_feature = "thumb-mode")) {
        // Thumb-mode ARM gained CLZ with the ARMv6T2 instruction set. Testing "v6t2" is
        // what correctly distinguishes targets such as `thumbv8.base` from `thumbv8.main`.
        cfg!(target_feature = "v6t2")
    } else if cfg!(target_arch = "arm") {
        // Non-thumb ARM has CLZ from ARMv5TE onward. ARMv5T was technically the first
        // with CLZ, but the "v5t" target feature does not seem to work, so "v5te" is
        // tested instead.
        cfg!(target_feature = "v5te")
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // SPARC only gained LZD/LZCNT with the VIS 3 extension.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // On RISC-V, a CLZ instruction exists exactly when the 'Zbb' basic
        // bit-manipulation extension is available.
        cfg!(target_feature = "zbb")
    } else {
        // Every other common target supported by Rust has a CLZ instruction.
        true
    }
};
106
// Instantiate the normalization-shift helpers used by the `delegate` and `binary_long`
// algorithms below. NOTE(review): `impl_normalization_shift!` lives in the `norm_shift`
// module (not visible here); presumably it generates a function that computes how far a
// divisor must be left-shifted to align with the dividend, with `USE_LZ` selecting a
// CLZ-based implementation — confirm against `norm_shift.rs`.
impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);
123
124/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
125/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
126/// dependencies.
127#[inline]
128fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
129 if let Some(quo: u64) = duo.checked_div(div) {
130 if let Some(rem: u64) = duo.checked_rem(div) {
131 return (quo, rem);
132 }
133 }
134 zero_div_fn()
135}
136
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64. Note that this
// implementation is additionally included on WebAssembly despite the typical
// pointer width there being 32 because it's typically run on a 64-bit machine
// that has access to faster 64-bit operations.
//
// This `cfg` is disjoint from the `impl_delegate!` and `impl_asymmetric!` configurations
// below, so at most one definition of `u128_div_rem` is ever emitted.
#[cfg(all(
    any(
        target_family = "wasm",
        not(any(target_pointer_width = "16", target_pointer_width = "32")),
    ),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
160
// If the pointer width is less than 64 and this isn't wasm, then the target
// architecture almost certainly does not have the fast 64 to 128 bit widening
// multiplication needed for `trifecta` to be faster. `delegate` instead reduces the
// 128 bit division to half-size divisions, bottoming out at `u64_by_u64_div_rem`
// (see the module docs).
#[cfg(all(
    not(any(
        target_family = "wasm",
        not(any(target_pointer_width = "16", target_pointer_width = "32")),
    )),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);
183
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo: u64 = duo as u64;
    let duo_hi: u64 = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // SAFETY: the caller upholds the contract documented above (`div != 0` and a
    // quotient that fits in `u64`, i.e. `duo_hi < div`), so the hardware `div`
    // instruction cannot fault.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
211
// use `asymmetric` instead of `trifecta` on x86_64: the hardware 128-by-64 bit division
// wrapped by `u128_by_u64_div_rem` lets `asymmetric` extend it to a full 128 by 128 bit
// division (see the module docs).
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
224
225/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
226/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
227/// dependencies.
228#[inline]
229#[allow(dead_code)]
230fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
231 if let Some(quo: u32) = duo.checked_div(div) {
232 if let Some(rem: u32) = duo.checked_rem(div) {
233 return (quo, rem);
234 }
235 }
236 zero_div_fn()
237}
238
// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
// than register size. The half division it bottoms out at is `u32_by_u32_div_rem`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);
256
// When not on x86 and the pointer width is 64, use `binary_long`. The division is at most
// register-sized here, and binary long division requires no half division (see module docs).
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);
270
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    // SAFETY: the caller upholds the contract documented above (`div != 0` and a
    // quotient that fits in `u32`, i.e. `duo_hi < div`), so the hardware `div`
    // instruction cannot fault.
    unsafe {
        // divides the combined registers edx:eax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in eax and the remainder in edx.
        // This target is 32-bit x86, so the 32-bit register names `eax`/`edx` must be
        // used here; `rax`/`rdx` do not exist on this target and `asm!` rejects them.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("eax") duo_lo => quo,
            inlateout("edx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
298
// use `asymmetric` instead of `delegate` on x86: the hardware 64-by-32 bit division
// wrapped by `u64_by_u32_div_rem` lets `asymmetric` extend it to a full 64 by 64 bit
// division.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);
311
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
// here — it needs no half division, so the chain of software divisions terminates (see module docs).
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32,
    allow(dead_code)
);
322