mod.rs source code [crates/compiler-builtins/src/int/specialized_div_rem/mod.rs]

1	// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
2	#![allow(unused_unsafe)]
3	// The functions are complex with many branches, and explicit
4	// `return`s make it clear where function exit points are
5	#![allow(clippy::needless_return)]
6	#![allow(clippy::comparison_chain)]
7	// Clippy is confused by the complex configuration
8	#![allow(clippy::if_same_then_else)]
9	#![allow(clippy::needless_bool)]
10
11	//! This `specialized_div_rem` module is originally from version 1.0.0 of the
12	//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
13	//! module, since unoptimized compilation may generate references to `memcpy`.
14	//!
15	//! The purpose of these macros is to easily change both the division algorithm used
16	//! for a given integer size and the half division used by that algorithm. The way
17	//! functions call each other is also constructed such that linkers will find the chain of
18	//! software and hardware divisions needed for every size of signed and unsigned division.
19	//! For example, most target compilations do the following:
20	//!
21	//! - Many 128 bit division functions like `u128::wrapping_div` use
22	//! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
23	//! is not a 128 bit by 128 bit hardware division function in most architectures.
24	//! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
25	//! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
26	//! one function to calculate both the quotient and remainder). If configuration flags
27	//! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
28	//! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
29	//! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
30	//! reduced to those instructions. Note that we do not specify the half size division
31	//! directly to be `__udivdi3`, because hardware division would never be introduced.
32	//! - If the architecture does not supply a 64 bit hardware division instruction, u64
33	//! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
34	//! which is defined by `impl_delegate!`. The half division for this algorithm is
35	//! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
36	//! software division algorithms.
37	//! - If the architecture does not supply a 32 bit hardware instruction, linkers will
38	//! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
39	//! division, so the chain of calls ends here.
40	//!
41	//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
42	//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
43	//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
44
45	// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
46	// ridiculously complex
47
48	#[macro_use]
49	mod norm_shift;
50
51	#[macro_use]
52	mod binary_long;
53
54	#[macro_use]
55	mod delegate;
56
57	// used on SPARC
58	#[allow(unused_imports)]
59	#[cfg(not(feature = "unstable-public-internals"))]
60	pub(crate) use self::delegate::u128_divide_sparc;
61	#[cfg(feature = "unstable-public-internals")]
62	pub use self::delegate::u128_divide_sparc;
63
64	#[macro_use]
65	mod trifecta;
66
67	#[macro_use]
68	mod asymmetric;
69
70	/// The behavior of all divisions by zero is controlled by this function. This function should be
71	/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
72	/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
73	fn zero_div_fn() -> ! {
74	// Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be used
75	// here because it involves non-`inline` functions
76	// (https://github.com/rust-lang/compiler-builtins/issues/491).
77	unsafe { core::intrinsics::unreachable() }
78	}
79
/// Compile-time flag telling whether the target is expected to have a count-leading-zeros
/// instruction. It is handed to the `impl_normalization_shift!` instantiations in this module.
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The 'Zbb' Basic Bit-Manipulation extension on RISC-V
        // determines if a CLZ assembly instruction exists
        cfg!(target_feature = "zbb")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};
105
106	impl_normalization_shift!(
107	u32_normalization_shift,
108	USE_LZ,
109	`32`,
110	u32,
111	i32,
112	allow(dead_code)
113	);
114	impl_normalization_shift!(
115	u64_normalization_shift,
116	USE_LZ,
117	`64`,
118	u64,
119	i64,
120	allow(dead_code)
121	);
122
123	/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
124	/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
125	/// dependencies.
126	#[inline]
127	fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
128	if let Some(quo: u64) = duo.checked_div(div)
129	&& let Some(rem: u64) = duo.checked_rem(div)
130	{
131	return (quo, rem);
132	}
133	zero_div_fn()
134	}
135
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64. Note that this
// implementation is additionally included on WebAssembly despite the typical
// pointer width there being 32 because it's typically run on a 64-bit machine
// that has access to faster 64-bit operations.
//
// x86_64 with asm enabled and SPARC are excluded here by the `cfg` below.
#[cfg(all(
    any(
        target_family = "wasm",
        not(any(target_pointer_width = "16", target_pointer_width = "32")),
    ),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
159
// If the pointer width is less than 64 and this isn't wasm, then the target
// architecture almost certainly does not have the fast 64 to 128 bit widening
// multiplication needed for `trifecta` to be faster, so `delegate` is used instead.
//
// x86_64 with asm enabled and SPARC are excluded here by the `cfg` below.
#[cfg(all(
    not(any(
        target_family = "wasm",
        not(any(target_pointer_width = "16", target_pointer_width = "32")),
    )),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);
182
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    // Split the dividend into the two 64 bit halves that the `div` instruction consumes.
    let duo_lo: u64 = duo as u64;
    let duo_hi: u64 = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // SAFETY: the caller guarantees that `div != 0` and that the quotient fits in a `u64`,
    // as required by the `# Safety` section above.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
210
211	// use `asymmetric` instead of `trifecta` on x86_64
212	#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
213	impl_asymmetric!(
214	u128_div_rem,
215	zero_div_fn,
216	u64_by_u64_div_rem,
217	u128_by_u64_div_rem,
218	`32`,
219	u32,
220	u64,
221	u128
222	);
223
224	/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
225	/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
226	/// dependencies.
227	#[inline]
228	#[allow(dead_code)]
229	fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
230	if let Some(quo: u32) = duo.checked_div(div)
231	&& let Some(rem: u32) = duo.checked_rem(div)
232	{
233	return (quo, rem);
234	}
235	zero_div_fn()
236	}
237
// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
// than register size. The half sized division is `u32_by_u32_div_rem`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);
255
256	// When not on x86 and the pointer width is 64, use `binary_long`.
257	#[cfg(all(
258	not(all(not(feature = "no-asm"), target_arch = "x86")),
259	target_pointer_width = "64"
260	))]
261	impl_binary_long!(
262	u64_div_rem,
263	zero_div_fn,
264	u64_normalization_shift,
265	`64`,
266	u64,
267	i64
268	);
269
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    // Split the dividend into the two 32 bit halves that the `div` instruction consumes.
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    // SAFETY: the caller guarantees that `div != 0` and that the quotient fits in a `u32`,
    // as required by the `# Safety` section above.
    unsafe {
        // divides the combined register pair holding `duo` (high half in the register named
        // `rdx` below, low half in the one named `rax`; `duo` is split into two 32 bit parts to
        // do this) by `div`. The quotient is stored in the low register and the remainder in the
        // high register.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
297
// use `asymmetric` instead of `delegate` on x86, building the full 64 by 64 bit division on top
// of the 64 by 32 bit `u64_by_u32_div_rem`
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);
310
311	// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
312	impl_binary_long!(
313	u32_div_rem,
314	zero_div_fn,
315	u32_normalization_shift,
316	`32`,
317	u32,
318	i32,
319	allow(dead_code)
320	);
321