binary_long.rs source code [crates/compiler_builtins/src/int/specialized_div_rem/binary_long.rs]

1	/// Creates an unsigned division function that uses binary long division, designed for
2	/// computer architectures without division instructions. These functions have good performance for
3	/// microarchitectures with large branch miss penalties and architectures without the ability to
4	/// predicate instructions. For architectures with predicated instructions, one of the algorithms
5	/// described in the documentation of these functions probably has higher performance, and a custom
6	/// assembly routine should be used instead.
7	#[allow(unused_macros)]
8	macro_rules! impl_binary_long {
9	(
10	$fn:ident, // name of the unsigned division function
11	$zero_div_fn:ident, // function called when division by zero is attempted
12	$normalization_shift:ident, // function for finding the normalization shift
13	$n:tt, // the number of bits in a $iX or $uX
14	$uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
15	$iX:ident // signed integer type with same bitwidth as `$uX`
16	$(, $fun_attr:meta)* // attributes for the function
17	) => {
18	/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
19	/// tuple.
20	$(
21	#[$fun_attr]
22	)*
23	pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
24	let mut duo = duo;
25	// handle edge cases before calling `$normalization_shift`
26	if div == `0` {
27	$zero_div_fn()
28	}
29	if duo < div {
30	return (`0`, duo);
31	}
32
33	// There are many variations of binary division algorithm that could be used. This
34	// documentation gives a tour of different methods so that future readers wanting to
35	// optimize further do not have to painstakingly derive them. The SWAR variation is
36	// especially hard to understand without reading the less convoluted methods first.
37
38	// You may notice that a `duo < div_original` check is included in many these
39	// algorithms. A critical optimization that many algorithms miss is handling of
40	// quotients that will turn out to have many trailing zeros or many leading zeros. This
41	// happens in cases of exact or close-to-exact divisions, divisions by power of two, and
42	// in cases where the quotient is small. The `duo < div_original` check handles these
43	// cases of early returns and ends up replacing other kinds of mundane checks that
44	// normally terminate a binary division algorithm.
45	//
46	// Something you may see in other algorithms that is not special-cased here is checks
47	// for division by powers of two. The `duo < div_original` check handles this case and
48	// more, however it can be checked up front before the bisection using the
49	// `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
50	// compilers should handle most cases where divisions by power of two occur, and we do
51	// not want to add on a few cycles for every division operation just to save a few
52	// cycles rarely.
53
54	// The following example is the most straightforward translation from the way binary
55	// long division is typically visualized:
56	// Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
57	// the result from `$normalization_shift(duo, div, false)`.
58	//
59	// Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
60	// and `duo` is kept unchanged.
61	// duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
62	//
63	// Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
64	// duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
65	//
66	// Step 2: Continue based on `sub`. The `quo` bits start accumulating.
67	// duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
68	// duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
69	// duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
70	// duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
71	// The `duo < div_original` check terminates the algorithm with the correct quotient of
72	// 29u8 and remainder of 4u8
73	/*
74	let div_original = div;
75	let mut shl = $normalization_shift(duo, div, false);
76	let mut quo = 0;
77	loop {
78	let div_shifted = div << shl;
79	let sub = duo.wrapping_sub(div_shifted);
80	// it is recommended to use `println!`s like this if functionality is unclear
81	/*
82	println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
83	duo,
84	div_shifted,
85	sub,
86	quo,
87	shl
88	);
89	*/
90	if 0 <= (sub as $iX) {
91	duo = sub;
92	quo += 1 << shl;
93	if duo < div_original {
94	// this branch is optional
95	return (quo, duo)
96	}
97	}
98	if shl == 0 {
99	return (quo, duo)
100	}
101	shl -= 1;
102	}
103	*/
104
105	// This restoring binary long division algorithm reduces the number of operations
106	// overall via:
107	// - `pow` can be shifted right instead of recalculating from `shl`
108	// - starting `div` shifted left and shifting it right for each step instead of
109	// recalculating from `shl`
110	// - The `duo < div_original` branch is used to terminate the algorithm instead of the
111	// `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
112	// `div` from being shifted off the end. This check also only occurs on half of steps
113	// on average, since it is behind the `(sub as $iX) >= 0` branch.
114	// - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
115	// being updated between steps
116	//
117	// There are many variations of this algorithm, but this encompases the largest number
118	// of architectures and does not rely on carry flags, add-with-carry, or SWAR
119	// complications to be decently fast.
120	/*
121	let div_original = div;
122	let shl = $normalization_shift(duo, div, false);
123	let mut div: $uX = div << shl;
124	let mut pow: $uX = 1 << shl;
125	let mut quo: $uX = 0;
126	loop {
127	let sub = duo.wrapping_sub(div);
128	if 0 <= (sub as $iX) {
129	duo = sub;
130	quo \|= pow;
131	if duo < div_original {
132	return (quo, duo)
133	}
134	}
135	div >>= 1;
136	pow >>= 1;
137	}
138	*/
139
140	// If the architecture has flags and predicated arithmetic instructions, it is possible
141	// to do binary long division without branching and in only 3 or 4 instructions. This is
142	// a variation of a 3 instruction central loop from
143	// http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
144	//
145	// What allows doing division in only 3 instructions is realizing that instead of
146	// keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
147	// place and `duo` can be shifted left. This means `div` does not have to be updated,
148	// but causes edge case problems and makes `duo < div_original` tests harder. Some
149	// architectures have an option to shift an argument in an arithmetic operation, which
150	// means `duo` can be shifted left and subtracted from in one instruction. The other two
151	// instructions are updating `quo` and undoing the subtraction if it turns out things
152	// were not normalized.
153
154	/*
155	// Perform one binary long division step on the already normalized arguments, because
156	// the main. Note that this does a full normalization since the central loop needs
157	// `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
158	// variation only did normalization to the nearest 4 steps, but this makes handling edge
159	// cases much harder. We do a full normalization and perform a binary long division
160	// step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
161	// of `duo`, then the edge case handler shifts `div` right and does another long
162	// division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
163	let div_original = div;
164	let mut shl = $normalization_shift(duo, div, true);
165	let mut div: $uX = (div << shl);
166	let mut quo: $uX = 1;
167	duo = duo.wrapping_sub(div);
168	if duo < div_original {
169	return (1 << shl, duo);
170	}
171	let div_neg: $uX;
172	if (div as $iX) < 0 {
173	// A very ugly edge case where the most significant bit of `div` is set (after
174	// shifting to match `duo` when its most significant bit is at the sign bit), which
175	// leads to the sign bit of `div_neg` being cut off and carries not happening when
176	// they should. This branch performs a long division step that keeps `duo` in place
177	// and shifts `div` down.
178	div >>= 1;
179	div_neg = div.wrapping_neg();
180	let (sub, carry) = duo.overflowing_add(div_neg);
181	duo = sub;
182	quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
183	if !carry {
184	duo = duo.wrapping_add(div);
185	}
186	shl -= 1;
187	} else {
188	div_neg = div.wrapping_neg();
189	}
190	// The add-with-carry that updates `quo` needs to have the carry set when a normalized
191	// subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
192	// subtraction generates a carry when an unnormalized subtract happens, which is the
193	// opposite of what we want. Instead, we use
194	// `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
195	let mut i = shl;
196	loop {
197	if i == 0 {
198	break;
199	}
200	i -= 1;
201	// `ADDS duo, div, duo, LSL #1`
202	// (add `div` to `duo << 1` and set flags)
203	let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
204	duo = sub;
205	// `ADC quo, quo, quo`
206	// (add with carry). Effectively shifts `quo` left by 1 and sets the least
207	// significant bit to the carry.
208	quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
209	// `ADDCC duo, duo, div`
210	// (add if carry clear). Undoes the subtraction if no carry was generated.
211	if !carry {
212	duo = duo.wrapping_add(div);
213	}
214	}
215	return (quo, duo >> shl);
216	*/
217
218	// This is the SWAR (SIMD within in a register) restoring division algorithm.
219	// This combines several ideas of the above algorithms:
220	// - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
221	// restoring division algorithm, some architectures can do the shifting and
222	// subtraction step in one instruction.
223	// - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
224	// and adding one.
225	// - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
226	// LSB, so what if we use those bits to store `quo`?
227	// Through a complex setup, it is possible to manage `duo` and `quo` in the same
228	// register, and perform one step with 2 or 3 instructions. The only major downsides are
229	// that there is significant setup (it is only saves instructions if `shl` is
230	// approximately more than 4), `duo < div_original` checks are impractical once SWAR is
231	// initiated, and the number of division steps taken has to be exact (we cannot do more
232	// division steps than `shl`, because it introduces edge cases where quotient bits in
233	// `duo` start to collide with the real part of `div`.
234	/*
235	// first step. The quotient bit is stored in `quo` for now
236	let div_original = div;
237	let mut shl = $normalization_shift(duo, div, true);
238	let mut div: $uX = (div << shl);
239	duo = duo.wrapping_sub(div);
240	let mut quo: $uX = 1 << shl;
241	if duo < div_original {
242	return (quo, duo);
243	}
244
245	let mask: $uX;
246	if (div as $iX) < 0 {
247	// deal with same edge case as the 3 instruction restoring division algorithm, but
248	// the quotient bit from this step also has to be stored in `quo`
249	div >>= 1;
250	shl -= 1;
251	let tmp = 1 << shl;
252	mask = tmp - 1;
253	let sub = duo.wrapping_sub(div);
254	if (sub as $iX) >= 0 {
255	// restore
256	duo = sub;
257	quo \|= tmp;
258	}
259	if duo < div_original {
260	return (quo, duo);
261	}
262	} else {
263	mask = quo - 1;
264	}
265	// There is now room for quotient bits in `duo`.
266
267	// Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
268	// `div` and end up with the subset of `shl` bits being all being set. This subset acts
269	// just like a two's complement negative one. The subset of `div` containing the divisor
270	// had 1 subtracted from it, but a carry will always be generated from the `shl` subset
271	// as long as the quotient stays positive.
272	//
273	// When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
274	// adds a quotient bit to the least significant bit.
275	// For example, 89 (0b01011001) divided by 3 (0b11):
276	//
277	// shl:4, div:0b00110000
278	// first step:
279	// duo:0b01011001
280	// + div_neg:0b11010000
281	// ____________________
282	// 0b00101001
283	// quo is set to 0b00010000 and mask is set to 0b00001111 for later
284	//
285	// 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
286	// quotient part of `duo` with `^`s.
287	// chars.
288	// div:0b00110000
289	// ^^^^
290	// + 0b11111111
291	// ________________
292	// 0b00101111
293	// ^^^^
294	// div_neg:0b11010001
295	//
296	// first SWAR step:
297	// duo_shl1:0b01010010
298	// ^
299	// + div_neg:0b11010001
300	// ____________________
301	// 0b00100011
302	// ^
303	// second:
304	// duo_shl1:0b01000110
305	// ^^
306	// + div_neg:0b11010001
307	// ____________________
308	// 0b00010111
309	// ^^
310	// third:
311	// duo_shl1:0b00101110
312	// ^^^
313	// + div_neg:0b11010001
314	// ____________________
315	// 0b11111111
316	// ^^^
317	// 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
318	// part of `duo` is negative and the third step was an unnormalized step. The restore
319	// branch then restores `duo`. Note that the restore branch does not shift `duo` left.
320	//
321	// duo:0b11111111
322	// ^^^
323	// + div:0b00101111
324	// ^^^^
325	// ________________
326	// 0b00101110
327	// ^^^
328	// `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
329	// unset quotient bit.
330	//
331	// final step (`shl` was 4, so exactly 4 steps must be taken)
332	// duo_shl1:0b01011100
333	// ^^^^
334	// + div_neg:0b11010001
335	// ____________________
336	// 0b00101101
337	// ^^^^
338	// The quotient includes the `^` bits added with the `quo` bits from the beginning that
339	// contained the first step and potential edge case step,
340	// `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
341	// The remainder is the bits remaining in `duo` that are not part of the quotient bits,
342	// `duo:0b00101101 >> shl == 0b0010 == 2u8`.
343	let div: $uX = div.wrapping_sub(1);
344	let mut i = shl;
345	loop {
346	if i == 0 {
347	break;
348	}
349	i -= 1;
350	duo = duo.wrapping_shl(1).wrapping_sub(div);
351	if (duo as $iX) < 0 {
352	// restore
353	duo = duo.wrapping_add(div);
354	}
355	}
356	// unpack the results of SWAR
357	return ((duo & mask) \| quo, duo >> shl);
358	*/
359
360	// The problem with the conditional restoring SWAR algorithm above is that, in practice,
361	// it requires assembly code to bring out its full unrolled potential (It seems that
362	// LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
363	// that my algorithm intends. On architectures without predicated instructions, the code
364	// gen is especially bad. We need a default software division algorithm that is
365	// guaranteed to get decent code gen for the central loop.
366
367	// For non-SWAR algorithms, there is a way to do binary long division without
368	// predication or even branching. This involves creating a mask from the sign bit and
369	// performing different kinds of steps using that.
370	/*
371	let shl = $normalization_shift(duo, div, true);
372	let mut div: $uX = div << shl;
373	let mut pow: $uX = 1 << shl;
374	let mut quo: $uX = 0;
375	loop {
376	let sub = duo.wrapping_sub(div);
377	let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
378	duo -= div & sign_mask;
379	quo \|= pow & sign_mask;
380	div >>= 1;
381	pow >>= 1;
382	if pow == 0 {
383	break;
384	}
385	}
386	return (quo, duo);
387	*/
388	// However, it requires about 4 extra operations (smearing the sign bit, negating the
389	// mask, and applying the mask twice) on top of the operations done by the actual
390	// algorithm. With SWAR however, just 2 extra operations are needed, making it
391	// practical and even the most optimal algorithm for some architectures.
392
393	// What we do is use custom assembly for predicated architectures that need software
394	// division, and for the default algorithm use a mask based restoring SWAR algorithm
395	// without conditionals or branches. On almost all architectures, this Rust code is
396	// guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
397	// will unroll it in a decent way.
398
399	// standard opening for SWAR algorithm with first step and edge case handling
400	let div_original = div;
401	let mut shl = $normalization_shift(duo, div, `true`);
402	let mut div: $uX = (div << shl);
403	duo = duo.wrapping_sub(div);
404	let mut quo: $uX = `1` << shl;
405	if duo < div_original {
406	return (quo, duo);
407	}
408	let mask: $uX;
409	if (div as $iX) < `0` {
410	div >>= `1`;
411	shl -= `1`;
412	let tmp = `1` << shl;
413	mask = tmp - `1`;
414	let sub = duo.wrapping_sub(div);
415	if (sub as $iX) >= `0` {
416	duo = sub;
417	quo \|= tmp;
418	}
419	if duo < div_original {
420	return (quo, duo);
421	}
422	} else {
423	mask = quo - `1`;
424	}
425
426	// central loop
427	div = div.wrapping_sub(`1`);
428	let mut i = shl;
429	loop {
430	if i == `0` {
431	break;
432	}
433	i -= `1`;
434	// shift left 1 and subtract
435	duo = duo.wrapping_shl(`1`).wrapping_sub(div);
436	// create mask
437	let mask = (duo as $iX).wrapping_shr($n - `1`) as $uX;
438	// restore
439	duo = duo.wrapping_add(div & mask);
440	}
441	// unpack
442	return ((duo & mask) \| quo, duo >> shl);
443
444	// miscellanious binary long division algorithms that might be better for specific
445	// architectures
446
447	// Another kind of long division uses an interesting fact that `div` and `pow` can be
448	// negated when `duo` is negative to perform a "negated" division step that works in
449	// place of any normalization mechanism. This is a non-restoring division algorithm that
450	// is very similar to the non-restoring division algorithms that can be found on the
451	// internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
452	// be viewed as shifting the least significant set bit right (e.x. if we enter a series
453	// of negated binary long division steps starting with `quo == 0b1011_0000` and
454	// `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
455	// 0b1010_0010, 0b1010_0001).
456	/*
457	let div_original = div;
458	let shl = $normalization_shift(duo, div, true);
459	let mut div: $uX = (div << shl);
460	let mut pow: $uX = 1 << shl;
461	let mut quo: $uX = pow;
462	duo = duo.wrapping_sub(div);
463	if duo < div_original {
464	return (quo, duo);
465	}
466	div >>= 1;
467	pow >>= 1;
468	loop {
469	if (duo as $iX) < 0 {
470	// Negated binary long division step.
471	duo = duo.wrapping_add(div);
472	quo = quo.wrapping_sub(pow);
473	} else {
474	// Normal long division step.
475	if duo < div_original {
476	return (quo, duo)
477	}
478	duo = duo.wrapping_sub(div);
479	quo = quo.wrapping_add(pow);
480	}
481	pow >>= 1;
482	div >>= 1;
483	}
484	*/
485
486	// This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
487	// SWAR techniques that makes the only difference between steps be negation of `div`.
488	// If there was an architecture with an instruction that negated inputs to an adder
489	// based on conditionals, and in place shifting (or a three input addition operation
490	// that can have `duo` as two of the inputs to effectively shift it left by 1), then a
491	// single instruction central loop is possible. Microarchitectures often have inputs to
492	// their ALU that can invert the arguments and carry in of adders, but the architectures
493	// unfortunately do not have an instruction to dynamically invert this input based on
494	// conditionals.
495	/*
496	// SWAR opening
497	let div_original = div;
498	let mut shl = $normalization_shift(duo, div, true);
499	let mut div: $uX = (div << shl);
500	duo = duo.wrapping_sub(div);
501	let mut quo: $uX = 1 << shl;
502	if duo < div_original {
503	return (quo, duo);
504	}
505	let mask: $uX;
506	if (div as $iX) < 0 {
507	div >>= 1;
508	shl -= 1;
509	let tmp = 1 << shl;
510	let sub = duo.wrapping_sub(div);
511	if (sub as $iX) >= 0 {
512	// restore
513	duo = sub;
514	quo \|= tmp;
515	}
516	if duo < div_original {
517	return (quo, duo);
518	}
519	mask = tmp - 1;
520	} else {
521	mask = quo - 1;
522	}
523
524	// central loop
525	let div: $uX = div.wrapping_sub(1);
526	let mut i = shl;
527	loop {
528	if i == 0 {
529	break;
530	}
531	i -= 1;
532	// note: the `wrapping_shl(1)` can be factored out, but would require another
533	// restoring division step to prevent `(duo as $iX)` from overflowing
534	if (duo as $iX) < 0 {
535	// Negated binary long division step.
536	duo = duo.wrapping_shl(1).wrapping_add(div);
537	} else {
538	// Normal long division step.
539	duo = duo.wrapping_shl(1).wrapping_sub(div);
540	}
541	}
542	if (duo as $iX) < 0 {
543	// Restore. This was not needed in the original nonrestoring algorithm because of
544	// the `duo < div_original` checks.
545	duo = duo.wrapping_add(div);
546	}
547	// unpack
548	return ((duo & mask) \| quo, duo >> shl);
549	*/
550	}
551	};
552	}
553