mod.rs source code [crates/compiler_builtins/src/int/specialized_div_rem/mod.rs]

1	// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
2	#![allow(unused_unsafe)]
3	// The functions are complex with many branches, and explicit
4	// `return`s makes it clear where function exit points are
5	#![allow(clippy::needless_return)]
6	#![allow(clippy::comparison_chain)]
7	// Clippy is confused by the complex configuration
8	#![allow(clippy::if_same_then_else)]
9	#![allow(clippy::needless_bool)]
10
11	//! This `specialized_div_rem` module is originally from version 1.0.0 of the
12	//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
13	//! module, since unoptimized compilation may generate references to `memcpy`.
14	//!
15	//! The purpose of these macros is to easily change the both the division algorithm used
16	//! for a given integer size and the half division used by that algorithm. The way
17	//! functions call each other is also constructed such that linkers will find the chain of
18	//! software and hardware divisions needed for every size of signed and unsigned division.
19	//! For example, most target compilations do the following:
20	//!
21	//! - Many 128 bit division functions like `u128::wrapping_div` use
22	//! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
23	//! is not a 128 bit by 128 bit hardware division function in most architectures.
24	//! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
25	//! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
26	//! one function to calculate both the quotient and remainder. If configuration flags
27	//! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
28	//! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
29	//! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
30	//! reduced to those instructions. Note that we do not specify the half size division
31	//! directly to be `__udivdi3`, because hardware division would never be introduced.
32	//! - If the architecture does not supply a 64 bit hardware division instruction, u64
33	//! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
34	//! which is defined by `impl_delegate!`. The half division for this algorithm is
35	//! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
36	//! software division algorithms.
37	//! - If the architecture does not supply a 32 bit hardware instruction, linkers will
38	//! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
39	//! division, so the chain of calls ends here.
40	//!
41	//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
42	//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
43	//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
44
45	// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
46	// ridiculously complex
47
48	#[macro_use]
49	mod norm_shift;
50
51	#[macro_use]
52	mod binary_long;
53
54	#[macro_use]
55	mod delegate;
56
57	// used on SPARC
58	#[allow(unused_imports)]
59	#[cfg(not(feature = "public-test-deps"))]
60	pub(crate) use self::delegate::u128_divide_sparc;
61
62	#[cfg(feature = "public-test-deps")]
63	pub use self::delegate::u128_divide_sparc;
64
65	#[macro_use]
66	mod trifecta;
67
68	#[macro_use]
69	mod asymmetric;
70
71	/// The behavior of all divisions by zero is controlled by this function. This function should be
72	/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
73	/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
74	fn zero_div_fn() -> ! {
75	// Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be used
76	// here because it involves non-`inline` functions
77	// (https://github.com/rust-lang/compiler-builtins/issues/491).
78	unsafe { core::intrinsics::unreachable() }
79	}
80
81	const USE_LZ: bool = {
82	if cfg!(target_arch = "arm") {
83	if cfg!(target_feature = "thumb-mode") {
84	// ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
85	// supported. This is needed to successfully differentiate between targets like
86	// `thumbv8.base` and `thumbv8.main`.
87	cfg!(target_feature = "v6t2")
88	} else {
89	// Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
90	// supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
91	// feature does not seem to work.
92	cfg!(target_feature = "v5te")
93	}
94	} else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
95	// LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
96	cfg!(target_feature = "vis3")
97	} else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
98	// The 'Zbb' Basic Bit-Manipulation extension on RISC-V
99	// determines if a CLZ assembly instruction exists
100	cfg!(target_feature = "zbb")
101	} else {
102	// All other common targets Rust supports should have CLZ instructions
103	`true`
104	}
105	};
106
107	impl_normalization_shift!(
108	u32_normalization_shift,
109	USE_LZ,
110	`32`,
111	u32,
112	i32,
113	allow(dead_code)
114	);
115	impl_normalization_shift!(
116	u64_normalization_shift,
117	USE_LZ,
118	`64`,
119	u64,
120	i64,
121	allow(dead_code)
122	);
123
124	/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
125	/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
126	/// dependencies.
127	#[inline]
128	fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
129	if let Some(quo: u64) = duo.checked_div(div) {
130	if let Some(rem: u64) = duo.checked_rem(div) {
131	return (quo, rem);
132	}
133	}
134	zero_div_fn()
135	}
136
137	// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
138	// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
139	// faster if the target pointer width is at least 64. Note that this
140	// implementation is additionally included on WebAssembly despite the typical
141	// pointer width there being 32 because it's typically run on a 64-bit machine
142	// that has access to faster 64-bit operations.
143	#[cfg(all(
144	any(
145	target_family = "wasm",
146	not(any(target_pointer_width = "16", target_pointer_width = "32")),
147	),
148	not(all(not(feature = "no-asm"), target_arch = "x86_64")),
149	not(any(target_arch = "sparc", target_arch = "sparc64"))
150	))]
151	impl_trifecta!(
152	u128_div_rem,
153	zero_div_fn,
154	u64_by_u64_div_rem,
155	`32`,
156	u32,
157	u64,
158	u128
159	);
160
161	// If the pointer width less than 64 and this isn't wasm, then the target
162	// architecture almost certainly does not have the fast 64 to 128 bit widening
163	// multiplication needed for `trifecta` to be faster.
164	#[cfg(all(
165	not(any(
166	target_family = "wasm",
167	not(any(target_pointer_width = "16", target_pointer_width = "32")),
168	)),
169	not(all(not(feature = "no-asm"), target_arch = "x86_64")),
170	not(any(target_arch = "sparc", target_arch = "sparc64"))
171	))]
172	impl_delegate!(
173	u128_div_rem,
174	zero_div_fn,
175	u64_normalization_shift,
176	u64_by_u64_div_rem,
177	`32`,
178	u32,
179	u64,
180	u128,
181	i128
182	);
183
184	/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
185	///
186	/// # Safety
187	///
188	/// If the quotient does not fit in a `u64`, a floating point exception occurs.
189	/// If `div == 0`, then a division by zero exception occurs.
190	#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
191	#[inline]
192	unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
193	let duo_lo: u64 = duo as u64;
194	let duo_hi: u64 = (duo >> `64`) as u64;
195	let quo: u64;
196	let rem: u64;
197	unsafe {
198	// divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
199	// by `div`. The quotient is stored in rax and the remainder in rdx.
200	// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
201	core::arch::asm!(
202	"div {0}",
203	in(reg) div,
204	inlateout("rax") duo_lo => quo,
205	inlateout("rdx") duo_hi => rem,
206	options(att_syntax, pure, nomem, nostack)
207	);
208	}
209	(quo, rem)
210	}
211
212	// use `asymmetric` instead of `trifecta` on x86_64
213	#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
214	impl_asymmetric!(
215	u128_div_rem,
216	zero_div_fn,
217	u64_by_u64_div_rem,
218	u128_by_u64_div_rem,
219	`32`,
220	u32,
221	u64,
222	u128
223	);
224
225	/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
226	/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
227	/// dependencies.
228	#[inline]
229	#[allow(dead_code)]
230	fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
231	if let Some(quo: u32) = duo.checked_div(div) {
232	if let Some(rem: u32) = duo.checked_rem(div) {
233	return (quo, rem);
234	}
235	}
236	zero_div_fn()
237	}
238
239	// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
240	// than register size.
241	#[cfg(all(
242	not(all(not(feature = "no-asm"), target_arch = "x86")),
243	not(target_pointer_width = "64")
244	))]
245	impl_delegate!(
246	u64_div_rem,
247	zero_div_fn,
248	u32_normalization_shift,
249	u32_by_u32_div_rem,
250	`16`,
251	u16,
252	u32,
253	u64,
254	i64
255	);
256
257	// When not on x86 and the pointer width is 64, use `binary_long`.
258	#[cfg(all(
259	not(all(not(feature = "no-asm"), target_arch = "x86")),
260	target_pointer_width = "64"
261	))]
262	impl_binary_long!(
263	u64_div_rem,
264	zero_div_fn,
265	u64_normalization_shift,
266	`64`,
267	u64,
268	i64
269	);
270
271	/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
272	///
273	/// # Safety
274	///
275	/// If the quotient does not fit in a `u32`, a floating point exception occurs.
276	/// If `div == 0`, then a division by zero exception occurs.
277	#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
278	#[inline]
279	unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
280	let duo_lo = duo as u32;
281	let duo_hi = (duo >> `32`) as u32;
282	let quo: u32;
283	let rem: u32;
284	unsafe {
285	// divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
286	// by `div`. The quotient is stored in rax and the remainder in rdx.
287	// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
288	core::arch::asm!(
289	"div {0}",
290	in(reg) div,
291	inlateout("rax") duo_lo => quo,
292	inlateout("rdx") duo_hi => rem,
293	options(att_syntax, pure, nomem, nostack)
294	);
295	}
296	(quo, rem)
297	}
298
299	// use `asymmetric` instead of `delegate` on x86
300	#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
301	impl_asymmetric!(
302	u64_div_rem,
303	zero_div_fn,
304	u32_by_u32_div_rem,
305	u64_by_u32_div_rem,
306	`16`,
307	u16,
308	u32,
309	u64
310	);
311
312	// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
313	impl_binary_long!(
314	u32_div_rem,
315	zero_div_fn,
316	u32_normalization_shift,
317	`32`,
318	u32,
319	i32,
320	allow(dead_code)
321	);
322