1/// Creates an unsigned division function that uses binary long division, designed for
2/// computer architectures without division instructions. These functions have good performance for
3/// microarchitectures with large branch miss penalties and architectures without the ability to
4/// predicate instructions. For architectures with predicated instructions, one of the algorithms
5/// described in the documentation of these functions probably has higher performance, and a custom
6/// assembly routine should be used instead.
7#[allow(unused_macros)]
8macro_rules! impl_binary_long {
9 (
10 $fn:ident, // name of the unsigned division function
11 $zero_div_fn:ident, // function called when division by zero is attempted
12 $normalization_shift:ident, // function for finding the normalization shift
13 $n:tt, // the number of bits in a $iX or $uX
14 $uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
15 $iX:ident // signed integer type with same bitwidth as `$uX`
16 $(, $fun_attr:meta)* // attributes for the function
17 ) => {
18 /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
19 /// tuple.
20 $(
21 #[$fun_attr]
22 )*
23 pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
24 let mut duo = duo;
25 // handle edge cases before calling `$normalization_shift`
26 if div == 0 {
27 $zero_div_fn()
28 }
29 if duo < div {
30 return (0, duo);
31 }
32
33 // There are many variations of binary division algorithm that could be used. This
34 // documentation gives a tour of different methods so that future readers wanting to
35 // optimize further do not have to painstakingly derive them. The SWAR variation is
36 // especially hard to understand without reading the less convoluted methods first.
37
38 // You may notice that a `duo < div_original` check is included in many these
39 // algorithms. A critical optimization that many algorithms miss is handling of
40 // quotients that will turn out to have many trailing zeros or many leading zeros. This
41 // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
42 // in cases where the quotient is small. The `duo < div_original` check handles these
43 // cases of early returns and ends up replacing other kinds of mundane checks that
44 // normally terminate a binary division algorithm.
45 //
46 // Something you may see in other algorithms that is not special-cased here is checks
47 // for division by powers of two. The `duo < div_original` check handles this case and
48 // more, however it can be checked up front before the bisection using the
49 // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
50 // compilers should handle most cases where divisions by power of two occur, and we do
51 // not want to add on a few cycles for every division operation just to save a few
52 // cycles rarely.
53
54 // The following example is the most straightforward translation from the way binary
55 // long division is typically visualized:
56 // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
57 // the result from `$normalization_shift(duo, div, false)`.
58 //
59 // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
60 // and `duo` is kept unchanged.
61 // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
62 //
63 // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
64 // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
65 //
66 // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
67 // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
68 // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
69 // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
70 // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
71 // The `duo < div_original` check terminates the algorithm with the correct quotient of
72 // 29u8 and remainder of 4u8
73 /*
74 let div_original = div;
75 let mut shl = $normalization_shift(duo, div, false);
76 let mut quo = 0;
77 loop {
78 let div_shifted = div << shl;
79 let sub = duo.wrapping_sub(div_shifted);
80 // it is recommended to use `println!`s like this if functionality is unclear
81 /*
82 println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
83 duo,
84 div_shifted,
85 sub,
86 quo,
87 shl
88 );
89 */
90 if 0 <= (sub as $iX) {
91 duo = sub;
92 quo += 1 << shl;
93 if duo < div_original {
94 // this branch is optional
95 return (quo, duo)
96 }
97 }
98 if shl == 0 {
99 return (quo, duo)
100 }
101 shl -= 1;
102 }
103 */
104
105 // This restoring binary long division algorithm reduces the number of operations
106 // overall via:
107 // - `pow` can be shifted right instead of recalculating from `shl`
108 // - starting `div` shifted left and shifting it right for each step instead of
109 // recalculating from `shl`
110 // - The `duo < div_original` branch is used to terminate the algorithm instead of the
111 // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
112 // `div` from being shifted off the end. This check also only occurs on half of steps
113 // on average, since it is behind the `(sub as $iX) >= 0` branch.
114 // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
115 // being updated between steps
116 //
117 // There are many variations of this algorithm, but this encompases the largest number
118 // of architectures and does not rely on carry flags, add-with-carry, or SWAR
119 // complications to be decently fast.
120 /*
121 let div_original = div;
122 let shl = $normalization_shift(duo, div, false);
123 let mut div: $uX = div << shl;
124 let mut pow: $uX = 1 << shl;
125 let mut quo: $uX = 0;
126 loop {
127 let sub = duo.wrapping_sub(div);
128 if 0 <= (sub as $iX) {
129 duo = sub;
130 quo |= pow;
131 if duo < div_original {
132 return (quo, duo)
133 }
134 }
135 div >>= 1;
136 pow >>= 1;
137 }
138 */
139
140 // If the architecture has flags and predicated arithmetic instructions, it is possible
141 // to do binary long division without branching and in only 3 or 4 instructions. This is
142 // a variation of a 3 instruction central loop from
143 // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
144 //
145 // What allows doing division in only 3 instructions is realizing that instead of
146 // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
147 // place and `duo` can be shifted left. This means `div` does not have to be updated,
148 // but causes edge case problems and makes `duo < div_original` tests harder. Some
149 // architectures have an option to shift an argument in an arithmetic operation, which
150 // means `duo` can be shifted left and subtracted from in one instruction. The other two
151 // instructions are updating `quo` and undoing the subtraction if it turns out things
152 // were not normalized.
153
154 /*
155 // Perform one binary long division step on the already normalized arguments, because
156 // the main. Note that this does a full normalization since the central loop needs
157 // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
158 // variation only did normalization to the nearest 4 steps, but this makes handling edge
159 // cases much harder. We do a full normalization and perform a binary long division
160 // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
161 // of `duo`, then the edge case handler shifts `div` right and does another long
162 // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
163 let div_original = div;
164 let mut shl = $normalization_shift(duo, div, true);
165 let mut div: $uX = (div << shl);
166 let mut quo: $uX = 1;
167 duo = duo.wrapping_sub(div);
168 if duo < div_original {
169 return (1 << shl, duo);
170 }
171 let div_neg: $uX;
172 if (div as $iX) < 0 {
173 // A very ugly edge case where the most significant bit of `div` is set (after
174 // shifting to match `duo` when its most significant bit is at the sign bit), which
175 // leads to the sign bit of `div_neg` being cut off and carries not happening when
176 // they should. This branch performs a long division step that keeps `duo` in place
177 // and shifts `div` down.
178 div >>= 1;
179 div_neg = div.wrapping_neg();
180 let (sub, carry) = duo.overflowing_add(div_neg);
181 duo = sub;
182 quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
183 if !carry {
184 duo = duo.wrapping_add(div);
185 }
186 shl -= 1;
187 } else {
188 div_neg = div.wrapping_neg();
189 }
190 // The add-with-carry that updates `quo` needs to have the carry set when a normalized
191 // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
192 // subtraction generates a carry when an unnormalized subtract happens, which is the
193 // opposite of what we want. Instead, we use
194 // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
195 let mut i = shl;
196 loop {
197 if i == 0 {
198 break;
199 }
200 i -= 1;
201 // `ADDS duo, div, duo, LSL #1`
202 // (add `div` to `duo << 1` and set flags)
203 let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
204 duo = sub;
205 // `ADC quo, quo, quo`
206 // (add with carry). Effectively shifts `quo` left by 1 and sets the least
207 // significant bit to the carry.
208 quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
209 // `ADDCC duo, duo, div`
210 // (add if carry clear). Undoes the subtraction if no carry was generated.
211 if !carry {
212 duo = duo.wrapping_add(div);
213 }
214 }
215 return (quo, duo >> shl);
216 */
217
218 // This is the SWAR (SIMD within in a register) restoring division algorithm.
219 // This combines several ideas of the above algorithms:
220 // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
221 // restoring division algorithm, some architectures can do the shifting and
222 // subtraction step in one instruction.
223 // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
224 // and adding one.
225 // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
226 // LSB, so what if we use those bits to store `quo`?
227 // Through a complex setup, it is possible to manage `duo` and `quo` in the same
228 // register, and perform one step with 2 or 3 instructions. The only major downsides are
229 // that there is significant setup (it is only saves instructions if `shl` is
230 // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
231 // initiated, and the number of division steps taken has to be exact (we cannot do more
232 // division steps than `shl`, because it introduces edge cases where quotient bits in
233 // `duo` start to collide with the real part of `div`.
234 /*
235 // first step. The quotient bit is stored in `quo` for now
236 let div_original = div;
237 let mut shl = $normalization_shift(duo, div, true);
238 let mut div: $uX = (div << shl);
239 duo = duo.wrapping_sub(div);
240 let mut quo: $uX = 1 << shl;
241 if duo < div_original {
242 return (quo, duo);
243 }
244
245 let mask: $uX;
246 if (div as $iX) < 0 {
247 // deal with same edge case as the 3 instruction restoring division algorithm, but
248 // the quotient bit from this step also has to be stored in `quo`
249 div >>= 1;
250 shl -= 1;
251 let tmp = 1 << shl;
252 mask = tmp - 1;
253 let sub = duo.wrapping_sub(div);
254 if (sub as $iX) >= 0 {
255 // restore
256 duo = sub;
257 quo |= tmp;
258 }
259 if duo < div_original {
260 return (quo, duo);
261 }
262 } else {
263 mask = quo - 1;
264 }
265 // There is now room for quotient bits in `duo`.
266
267 // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
268 // `div` and end up with the subset of `shl` bits being all being set. This subset acts
269 // just like a two's complement negative one. The subset of `div` containing the divisor
270 // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
271 // as long as the quotient stays positive.
272 //
273 // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
274 // adds a quotient bit to the least significant bit.
275 // For example, 89 (0b01011001) divided by 3 (0b11):
276 //
277 // shl:4, div:0b00110000
278 // first step:
279 // duo:0b01011001
280 // + div_neg:0b11010000
281 // ____________________
282 // 0b00101001
283 // quo is set to 0b00010000 and mask is set to 0b00001111 for later
284 //
285 // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
286 // quotient part of `duo` with `^`s.
287 // chars.
288 // div:0b00110000
289 // ^^^^
290 // + 0b11111111
291 // ________________
292 // 0b00101111
293 // ^^^^
294 // div_neg:0b11010001
295 //
296 // first SWAR step:
297 // duo_shl1:0b01010010
298 // ^
299 // + div_neg:0b11010001
300 // ____________________
301 // 0b00100011
302 // ^
303 // second:
304 // duo_shl1:0b01000110
305 // ^^
306 // + div_neg:0b11010001
307 // ____________________
308 // 0b00010111
309 // ^^
310 // third:
311 // duo_shl1:0b00101110
312 // ^^^
313 // + div_neg:0b11010001
314 // ____________________
315 // 0b11111111
316 // ^^^
317 // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
318 // part of `duo` is negative and the third step was an unnormalized step. The restore
319 // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
320 //
321 // duo:0b11111111
322 // ^^^
323 // + div:0b00101111
324 // ^^^^
325 // ________________
326 // 0b00101110
327 // ^^^
328 // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
329 // unset quotient bit.
330 //
331 // final step (`shl` was 4, so exactly 4 steps must be taken)
332 // duo_shl1:0b01011100
333 // ^^^^
334 // + div_neg:0b11010001
335 // ____________________
336 // 0b00101101
337 // ^^^^
338 // The quotient includes the `^` bits added with the `quo` bits from the beginning that
339 // contained the first step and potential edge case step,
340 // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
341 // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
342 // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
343 let div: $uX = div.wrapping_sub(1);
344 let mut i = shl;
345 loop {
346 if i == 0 {
347 break;
348 }
349 i -= 1;
350 duo = duo.wrapping_shl(1).wrapping_sub(div);
351 if (duo as $iX) < 0 {
352 // restore
353 duo = duo.wrapping_add(div);
354 }
355 }
356 // unpack the results of SWAR
357 return ((duo & mask) | quo, duo >> shl);
358 */
359
360 // The problem with the conditional restoring SWAR algorithm above is that, in practice,
361 // it requires assembly code to bring out its full unrolled potential (It seems that
362 // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
363 // that my algorithm intends. On architectures without predicated instructions, the code
364 // gen is especially bad. We need a default software division algorithm that is
365 // guaranteed to get decent code gen for the central loop.
366
367 // For non-SWAR algorithms, there is a way to do binary long division without
368 // predication or even branching. This involves creating a mask from the sign bit and
369 // performing different kinds of steps using that.
370 /*
371 let shl = $normalization_shift(duo, div, true);
372 let mut div: $uX = div << shl;
373 let mut pow: $uX = 1 << shl;
374 let mut quo: $uX = 0;
375 loop {
376 let sub = duo.wrapping_sub(div);
377 let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
378 duo -= div & sign_mask;
379 quo |= pow & sign_mask;
380 div >>= 1;
381 pow >>= 1;
382 if pow == 0 {
383 break;
384 }
385 }
386 return (quo, duo);
387 */
388 // However, it requires about 4 extra operations (smearing the sign bit, negating the
389 // mask, and applying the mask twice) on top of the operations done by the actual
390 // algorithm. With SWAR however, just 2 extra operations are needed, making it
391 // practical and even the most optimal algorithm for some architectures.
392
393 // What we do is use custom assembly for predicated architectures that need software
394 // division, and for the default algorithm use a mask based restoring SWAR algorithm
395 // without conditionals or branches. On almost all architectures, this Rust code is
396 // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
397 // will unroll it in a decent way.
398
399 // standard opening for SWAR algorithm with first step and edge case handling
400 let div_original = div;
401 let mut shl = $normalization_shift(duo, div, true);
402 let mut div: $uX = (div << shl);
403 duo = duo.wrapping_sub(div);
404 let mut quo: $uX = 1 << shl;
405 if duo < div_original {
406 return (quo, duo);
407 }
408 let mask: $uX;
409 if (div as $iX) < 0 {
410 div >>= 1;
411 shl -= 1;
412 let tmp = 1 << shl;
413 mask = tmp - 1;
414 let sub = duo.wrapping_sub(div);
415 if (sub as $iX) >= 0 {
416 duo = sub;
417 quo |= tmp;
418 }
419 if duo < div_original {
420 return (quo, duo);
421 }
422 } else {
423 mask = quo - 1;
424 }
425
426 // central loop
427 div = div.wrapping_sub(1);
428 let mut i = shl;
429 loop {
430 if i == 0 {
431 break;
432 }
433 i -= 1;
434 // shift left 1 and subtract
435 duo = duo.wrapping_shl(1).wrapping_sub(div);
436 // create mask
437 let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
438 // restore
439 duo = duo.wrapping_add(div & mask);
440 }
441 // unpack
442 return ((duo & mask) | quo, duo >> shl);
443
444 // miscellanious binary long division algorithms that might be better for specific
445 // architectures
446
447 // Another kind of long division uses an interesting fact that `div` and `pow` can be
448 // negated when `duo` is negative to perform a "negated" division step that works in
449 // place of any normalization mechanism. This is a non-restoring division algorithm that
450 // is very similar to the non-restoring division algorithms that can be found on the
451 // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
452 // be viewed as shifting the least significant set bit right (e.x. if we enter a series
453 // of negated binary long division steps starting with `quo == 0b1011_0000` and
454 // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
455 // 0b1010_0010, 0b1010_0001).
456 /*
457 let div_original = div;
458 let shl = $normalization_shift(duo, div, true);
459 let mut div: $uX = (div << shl);
460 let mut pow: $uX = 1 << shl;
461 let mut quo: $uX = pow;
462 duo = duo.wrapping_sub(div);
463 if duo < div_original {
464 return (quo, duo);
465 }
466 div >>= 1;
467 pow >>= 1;
468 loop {
469 if (duo as $iX) < 0 {
470 // Negated binary long division step.
471 duo = duo.wrapping_add(div);
472 quo = quo.wrapping_sub(pow);
473 } else {
474 // Normal long division step.
475 if duo < div_original {
476 return (quo, duo)
477 }
478 duo = duo.wrapping_sub(div);
479 quo = quo.wrapping_add(pow);
480 }
481 pow >>= 1;
482 div >>= 1;
483 }
484 */
485
486 // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
487 // SWAR techniques that makes the only difference between steps be negation of `div`.
488 // If there was an architecture with an instruction that negated inputs to an adder
489 // based on conditionals, and in place shifting (or a three input addition operation
490 // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
491 // single instruction central loop is possible. Microarchitectures often have inputs to
492 // their ALU that can invert the arguments and carry in of adders, but the architectures
493 // unfortunately do not have an instruction to dynamically invert this input based on
494 // conditionals.
495 /*
496 // SWAR opening
497 let div_original = div;
498 let mut shl = $normalization_shift(duo, div, true);
499 let mut div: $uX = (div << shl);
500 duo = duo.wrapping_sub(div);
501 let mut quo: $uX = 1 << shl;
502 if duo < div_original {
503 return (quo, duo);
504 }
505 let mask: $uX;
506 if (div as $iX) < 0 {
507 div >>= 1;
508 shl -= 1;
509 let tmp = 1 << shl;
510 let sub = duo.wrapping_sub(div);
511 if (sub as $iX) >= 0 {
512 // restore
513 duo = sub;
514 quo |= tmp;
515 }
516 if duo < div_original {
517 return (quo, duo);
518 }
519 mask = tmp - 1;
520 } else {
521 mask = quo - 1;
522 }
523
524 // central loop
525 let div: $uX = div.wrapping_sub(1);
526 let mut i = shl;
527 loop {
528 if i == 0 {
529 break;
530 }
531 i -= 1;
532 // note: the `wrapping_shl(1)` can be factored out, but would require another
533 // restoring division step to prevent `(duo as $iX)` from overflowing
534 if (duo as $iX) < 0 {
535 // Negated binary long division step.
536 duo = duo.wrapping_shl(1).wrapping_add(div);
537 } else {
538 // Normal long division step.
539 duo = duo.wrapping_shl(1).wrapping_sub(div);
540 }
541 }
542 if (duo as $iX) < 0 {
543 // Restore. This was not needed in the original nonrestoring algorithm because of
544 // the `duo < div_original` checks.
545 duo = duo.wrapping_add(div);
546 }
547 // unpack
548 return ((duo & mask) | quo, duo >> shl);
549 */
550 }
551 };
552}
553