| 1 | /// Creates an unsigned division function that uses binary long division, designed for |
| 2 | /// computer architectures without division instructions. These functions have good performance for |
| 3 | /// microarchitectures with large branch miss penalties and architectures without the ability to |
| 4 | /// predicate instructions. For architectures with predicated instructions, one of the algorithms |
| 5 | /// described in the documentation of these functions probably has higher performance, and a custom |
| 6 | /// assembly routine should be used instead. |
| 7 | #[allow (unused_macros)] |
| 8 | macro_rules! impl_binary_long { |
| 9 | ( |
| 10 | $fn:ident, // name of the unsigned division function |
| 11 | $zero_div_fn:ident, // function called when division by zero is attempted |
| 12 | $normalization_shift:ident, // function for finding the normalization shift |
| 13 | $n:tt, // the number of bits in a $iX or $uX |
| 14 | $uX:ident, // unsigned integer type for the inputs and outputs of `$fn` |
| 15 | $iX:ident // signed integer type with same bitwidth as `$uX` |
| 16 | $(, $fun_attr:meta)* // attributes for the function |
| 17 | ) => { |
| 18 | /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a |
| 19 | /// tuple. |
| 20 | $( |
| 21 | #[$fun_attr] |
| 22 | )* |
| 23 | pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) { |
| 24 | let mut duo = duo; |
| 25 | // handle edge cases before calling `$normalization_shift` |
| 26 | if div == 0 { |
| 27 | $zero_div_fn() |
| 28 | } |
| 29 | if duo < div { |
| 30 | return (0, duo); |
| 31 | } |
| 32 | |
| 33 | // There are many variations of binary division algorithm that could be used. This |
| 34 | // documentation gives a tour of different methods so that future readers wanting to |
| 35 | // optimize further do not have to painstakingly derive them. The SWAR variation is |
| 36 | // especially hard to understand without reading the less convoluted methods first. |
| 37 | |
| 38 | // You may notice that a `duo < div_original` check is included in many these |
| 39 | // algorithms. A critical optimization that many algorithms miss is handling of |
| 40 | // quotients that will turn out to have many trailing zeros or many leading zeros. This |
| 41 | // happens in cases of exact or close-to-exact divisions, divisions by power of two, and |
| 42 | // in cases where the quotient is small. The `duo < div_original` check handles these |
| 43 | // cases of early returns and ends up replacing other kinds of mundane checks that |
| 44 | // normally terminate a binary division algorithm. |
| 45 | // |
| 46 | // Something you may see in other algorithms that is not special-cased here is checks |
| 47 | // for division by powers of two. The `duo < div_original` check handles this case and |
| 48 | // more, however it can be checked up front before the bisection using the |
| 49 | // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because |
| 50 | // compilers should handle most cases where divisions by power of two occur, and we do |
| 51 | // not want to add on a few cycles for every division operation just to save a few |
| 52 | // cycles rarely. |
| 53 | |
| 54 | // The following example is the most straightforward translation from the way binary |
| 55 | // long division is typically visualized: |
| 56 | // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to |
| 57 | // the result from `$normalization_shift(duo, div, false)`. |
| 58 | // |
| 59 | // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set |
| 60 | // and `duo` is kept unchanged. |
| 61 | // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5 |
| 62 | // |
| 63 | // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step. |
| 64 | // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4 |
| 65 | // |
| 66 | // Step 2: Continue based on `sub`. The `quo` bits start accumulating. |
| 67 | // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3 |
| 68 | // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2 |
| 69 | // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1 |
| 70 | // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0 |
| 71 | // The `duo < div_original` check terminates the algorithm with the correct quotient of |
| 72 | // 29u8 and remainder of 4u8 |
| 73 | /* |
| 74 | let div_original = div; |
| 75 | let mut shl = $normalization_shift(duo, div, false); |
| 76 | let mut quo = 0; |
| 77 | loop { |
| 78 | let div_shifted = div << shl; |
| 79 | let sub = duo.wrapping_sub(div_shifted); |
| 80 | // it is recommended to use `println!`s like this if functionality is unclear |
| 81 | /* |
| 82 | println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}", |
| 83 | duo, |
| 84 | div_shifted, |
| 85 | sub, |
| 86 | quo, |
| 87 | shl |
| 88 | ); |
| 89 | */ |
| 90 | if 0 <= (sub as $iX) { |
| 91 | duo = sub; |
| 92 | quo += 1 << shl; |
| 93 | if duo < div_original { |
| 94 | // this branch is optional |
| 95 | return (quo, duo) |
| 96 | } |
| 97 | } |
| 98 | if shl == 0 { |
| 99 | return (quo, duo) |
| 100 | } |
| 101 | shl -= 1; |
| 102 | } |
| 103 | */ |
| 104 | |
| 105 | // This restoring binary long division algorithm reduces the number of operations |
| 106 | // overall via: |
| 107 | // - `pow` can be shifted right instead of recalculating from `shl` |
| 108 | // - starting `div` shifted left and shifting it right for each step instead of |
| 109 | // recalculating from `shl` |
| 110 | // - The `duo < div_original` branch is used to terminate the algorithm instead of the |
| 111 | // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and |
| 112 | // `div` from being shifted off the end. This check also only occurs on half of steps |
| 113 | // on average, since it is behind the `(sub as $iX) >= 0` branch. |
| 114 | // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are |
| 115 | // being updated between steps |
| 116 | // |
| 117 | // There are many variations of this algorithm, but this encompases the largest number |
| 118 | // of architectures and does not rely on carry flags, add-with-carry, or SWAR |
| 119 | // complications to be decently fast. |
| 120 | /* |
| 121 | let div_original = div; |
| 122 | let shl = $normalization_shift(duo, div, false); |
| 123 | let mut div: $uX = div << shl; |
| 124 | let mut pow: $uX = 1 << shl; |
| 125 | let mut quo: $uX = 0; |
| 126 | loop { |
| 127 | let sub = duo.wrapping_sub(div); |
| 128 | if 0 <= (sub as $iX) { |
| 129 | duo = sub; |
| 130 | quo |= pow; |
| 131 | if duo < div_original { |
| 132 | return (quo, duo) |
| 133 | } |
| 134 | } |
| 135 | div >>= 1; |
| 136 | pow >>= 1; |
| 137 | } |
| 138 | */ |
| 139 | |
| 140 | // If the architecture has flags and predicated arithmetic instructions, it is possible |
| 141 | // to do binary long division without branching and in only 3 or 4 instructions. This is |
| 142 | // a variation of a 3 instruction central loop from |
| 143 | // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt. |
| 144 | // |
| 145 | // What allows doing division in only 3 instructions is realizing that instead of |
| 146 | // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in |
| 147 | // place and `duo` can be shifted left. This means `div` does not have to be updated, |
| 148 | // but causes edge case problems and makes `duo < div_original` tests harder. Some |
| 149 | // architectures have an option to shift an argument in an arithmetic operation, which |
| 150 | // means `duo` can be shifted left and subtracted from in one instruction. The other two |
| 151 | // instructions are updating `quo` and undoing the subtraction if it turns out things |
| 152 | // were not normalized. |
| 153 | |
| 154 | /* |
| 155 | // Perform one binary long division step on the already normalized arguments, because |
| 156 | // the main. Note that this does a full normalization since the central loop needs |
| 157 | // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original |
| 158 | // variation only did normalization to the nearest 4 steps, but this makes handling edge |
| 159 | // cases much harder. We do a full normalization and perform a binary long division |
| 160 | // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb |
| 161 | // of `duo`, then the edge case handler shifts `div` right and does another long |
| 162 | // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`. |
| 163 | let div_original = div; |
| 164 | let mut shl = $normalization_shift(duo, div, true); |
| 165 | let mut div: $uX = (div << shl); |
| 166 | let mut quo: $uX = 1; |
| 167 | duo = duo.wrapping_sub(div); |
| 168 | if duo < div_original { |
| 169 | return (1 << shl, duo); |
| 170 | } |
| 171 | let div_neg: $uX; |
| 172 | if (div as $iX) < 0 { |
| 173 | // A very ugly edge case where the most significant bit of `div` is set (after |
| 174 | // shifting to match `duo` when its most significant bit is at the sign bit), which |
| 175 | // leads to the sign bit of `div_neg` being cut off and carries not happening when |
| 176 | // they should. This branch performs a long division step that keeps `duo` in place |
| 177 | // and shifts `div` down. |
| 178 | div >>= 1; |
| 179 | div_neg = div.wrapping_neg(); |
| 180 | let (sub, carry) = duo.overflowing_add(div_neg); |
| 181 | duo = sub; |
| 182 | quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); |
| 183 | if !carry { |
| 184 | duo = duo.wrapping_add(div); |
| 185 | } |
| 186 | shl -= 1; |
| 187 | } else { |
| 188 | div_neg = div.wrapping_neg(); |
| 189 | } |
| 190 | // The add-with-carry that updates `quo` needs to have the carry set when a normalized |
| 191 | // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the |
| 192 | // subtraction generates a carry when an unnormalized subtract happens, which is the |
| 193 | // opposite of what we want. Instead, we use |
| 194 | // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`. |
| 195 | let mut i = shl; |
| 196 | loop { |
| 197 | if i == 0 { |
| 198 | break; |
| 199 | } |
| 200 | i -= 1; |
| 201 | // `ADDS duo, div, duo, LSL #1` |
| 202 | // (add `div` to `duo << 1` and set flags) |
| 203 | let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg); |
| 204 | duo = sub; |
| 205 | // `ADC quo, quo, quo` |
| 206 | // (add with carry). Effectively shifts `quo` left by 1 and sets the least |
| 207 | // significant bit to the carry. |
| 208 | quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); |
| 209 | // `ADDCC duo, duo, div` |
| 210 | // (add if carry clear). Undoes the subtraction if no carry was generated. |
| 211 | if !carry { |
| 212 | duo = duo.wrapping_add(div); |
| 213 | } |
| 214 | } |
| 215 | return (quo, duo >> shl); |
| 216 | */ |
| 217 | |
| 218 | // This is the SWAR (SIMD within in a register) restoring division algorithm. |
| 219 | // This combines several ideas of the above algorithms: |
| 220 | // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction |
| 221 | // restoring division algorithm, some architectures can do the shifting and |
| 222 | // subtraction step in one instruction. |
| 223 | // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one |
| 224 | // and adding one. |
| 225 | // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the |
| 226 | // LSB, so what if we use those bits to store `quo`? |
| 227 | // Through a complex setup, it is possible to manage `duo` and `quo` in the same |
| 228 | // register, and perform one step with 2 or 3 instructions. The only major downsides are |
| 229 | // that there is significant setup (it is only saves instructions if `shl` is |
| 230 | // approximately more than 4), `duo < div_original` checks are impractical once SWAR is |
| 231 | // initiated, and the number of division steps taken has to be exact (we cannot do more |
| 232 | // division steps than `shl`, because it introduces edge cases where quotient bits in |
| 233 | // `duo` start to collide with the real part of `div`. |
| 234 | /* |
| 235 | // first step. The quotient bit is stored in `quo` for now |
| 236 | let div_original = div; |
| 237 | let mut shl = $normalization_shift(duo, div, true); |
| 238 | let mut div: $uX = (div << shl); |
| 239 | duo = duo.wrapping_sub(div); |
| 240 | let mut quo: $uX = 1 << shl; |
| 241 | if duo < div_original { |
| 242 | return (quo, duo); |
| 243 | } |
| 244 | |
| 245 | let mask: $uX; |
| 246 | if (div as $iX) < 0 { |
| 247 | // deal with same edge case as the 3 instruction restoring division algorithm, but |
| 248 | // the quotient bit from this step also has to be stored in `quo` |
| 249 | div >>= 1; |
| 250 | shl -= 1; |
| 251 | let tmp = 1 << shl; |
| 252 | mask = tmp - 1; |
| 253 | let sub = duo.wrapping_sub(div); |
| 254 | if (sub as $iX) >= 0 { |
| 255 | // restore |
| 256 | duo = sub; |
| 257 | quo |= tmp; |
| 258 | } |
| 259 | if duo < div_original { |
| 260 | return (quo, duo); |
| 261 | } |
| 262 | } else { |
| 263 | mask = quo - 1; |
| 264 | } |
| 265 | // There is now room for quotient bits in `duo`. |
| 266 | |
| 267 | // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from |
| 268 | // `div` and end up with the subset of `shl` bits being all being set. This subset acts |
| 269 | // just like a two's complement negative one. The subset of `div` containing the divisor |
| 270 | // had 1 subtracted from it, but a carry will always be generated from the `shl` subset |
| 271 | // as long as the quotient stays positive. |
| 272 | // |
| 273 | // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset |
| 274 | // adds a quotient bit to the least significant bit. |
| 275 | // For example, 89 (0b01011001) divided by 3 (0b11): |
| 276 | // |
| 277 | // shl:4, div:0b00110000 |
| 278 | // first step: |
| 279 | // duo:0b01011001 |
| 280 | // + div_neg:0b11010000 |
| 281 | // ____________________ |
| 282 | // 0b00101001 |
| 283 | // quo is set to 0b00010000 and mask is set to 0b00001111 for later |
| 284 | // |
| 285 | // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the |
| 286 | // quotient part of `duo` with `^`s. |
| 287 | // chars. |
| 288 | // div:0b00110000 |
| 289 | // ^^^^ |
| 290 | // + 0b11111111 |
| 291 | // ________________ |
| 292 | // 0b00101111 |
| 293 | // ^^^^ |
| 294 | // div_neg:0b11010001 |
| 295 | // |
| 296 | // first SWAR step: |
| 297 | // duo_shl1:0b01010010 |
| 298 | // ^ |
| 299 | // + div_neg:0b11010001 |
| 300 | // ____________________ |
| 301 | // 0b00100011 |
| 302 | // ^ |
| 303 | // second: |
| 304 | // duo_shl1:0b01000110 |
| 305 | // ^^ |
| 306 | // + div_neg:0b11010001 |
| 307 | // ____________________ |
| 308 | // 0b00010111 |
| 309 | // ^^ |
| 310 | // third: |
| 311 | // duo_shl1:0b00101110 |
| 312 | // ^^^ |
| 313 | // + div_neg:0b11010001 |
| 314 | // ____________________ |
| 315 | // 0b11111111 |
| 316 | // ^^^ |
| 317 | // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real |
| 318 | // part of `duo` is negative and the third step was an unnormalized step. The restore |
| 319 | // branch then restores `duo`. Note that the restore branch does not shift `duo` left. |
| 320 | // |
| 321 | // duo:0b11111111 |
| 322 | // ^^^ |
| 323 | // + div:0b00101111 |
| 324 | // ^^^^ |
| 325 | // ________________ |
| 326 | // 0b00101110 |
| 327 | // ^^^ |
| 328 | // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an |
| 329 | // unset quotient bit. |
| 330 | // |
| 331 | // final step (`shl` was 4, so exactly 4 steps must be taken) |
| 332 | // duo_shl1:0b01011100 |
| 333 | // ^^^^ |
| 334 | // + div_neg:0b11010001 |
| 335 | // ____________________ |
| 336 | // 0b00101101 |
| 337 | // ^^^^ |
| 338 | // The quotient includes the `^` bits added with the `quo` bits from the beginning that |
| 339 | // contained the first step and potential edge case step, |
| 340 | // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`. |
| 341 | // The remainder is the bits remaining in `duo` that are not part of the quotient bits, |
| 342 | // `duo:0b00101101 >> shl == 0b0010 == 2u8`. |
| 343 | let div: $uX = div.wrapping_sub(1); |
| 344 | let mut i = shl; |
| 345 | loop { |
| 346 | if i == 0 { |
| 347 | break; |
| 348 | } |
| 349 | i -= 1; |
| 350 | duo = duo.wrapping_shl(1).wrapping_sub(div); |
| 351 | if (duo as $iX) < 0 { |
| 352 | // restore |
| 353 | duo = duo.wrapping_add(div); |
| 354 | } |
| 355 | } |
| 356 | // unpack the results of SWAR |
| 357 | return ((duo & mask) | quo, duo >> shl); |
| 358 | */ |
| 359 | |
| 360 | // The problem with the conditional restoring SWAR algorithm above is that, in practice, |
| 361 | // it requires assembly code to bring out its full unrolled potential (It seems that |
| 362 | // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit |
| 363 | // that my algorithm intends. On architectures without predicated instructions, the code |
| 364 | // gen is especially bad. We need a default software division algorithm that is |
| 365 | // guaranteed to get decent code gen for the central loop. |
| 366 | |
| 367 | // For non-SWAR algorithms, there is a way to do binary long division without |
| 368 | // predication or even branching. This involves creating a mask from the sign bit and |
| 369 | // performing different kinds of steps using that. |
| 370 | /* |
| 371 | let shl = $normalization_shift(duo, div, true); |
| 372 | let mut div: $uX = div << shl; |
| 373 | let mut pow: $uX = 1 << shl; |
| 374 | let mut quo: $uX = 0; |
| 375 | loop { |
| 376 | let sub = duo.wrapping_sub(div); |
| 377 | let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX); |
| 378 | duo -= div & sign_mask; |
| 379 | quo |= pow & sign_mask; |
| 380 | div >>= 1; |
| 381 | pow >>= 1; |
| 382 | if pow == 0 { |
| 383 | break; |
| 384 | } |
| 385 | } |
| 386 | return (quo, duo); |
| 387 | */ |
| 388 | // However, it requires about 4 extra operations (smearing the sign bit, negating the |
| 389 | // mask, and applying the mask twice) on top of the operations done by the actual |
| 390 | // algorithm. With SWAR however, just 2 extra operations are needed, making it |
| 391 | // practical and even the most optimal algorithm for some architectures. |
| 392 | |
| 393 | // What we do is use custom assembly for predicated architectures that need software |
| 394 | // division, and for the default algorithm use a mask based restoring SWAR algorithm |
| 395 | // without conditionals or branches. On almost all architectures, this Rust code is |
| 396 | // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM |
| 397 | // will unroll it in a decent way. |
| 398 | |
| 399 | // standard opening for SWAR algorithm with first step and edge case handling |
| 400 | let div_original = div; |
| 401 | let mut shl = $normalization_shift(duo, div, true); |
| 402 | let mut div: $uX = (div << shl); |
| 403 | duo = duo.wrapping_sub(div); |
| 404 | let mut quo: $uX = 1 << shl; |
| 405 | if duo < div_original { |
| 406 | return (quo, duo); |
| 407 | } |
| 408 | let mask: $uX; |
| 409 | if (div as $iX) < 0 { |
| 410 | div >>= 1; |
| 411 | shl -= 1; |
| 412 | let tmp = 1 << shl; |
| 413 | mask = tmp - 1; |
| 414 | let sub = duo.wrapping_sub(div); |
| 415 | if (sub as $iX) >= 0 { |
| 416 | duo = sub; |
| 417 | quo |= tmp; |
| 418 | } |
| 419 | if duo < div_original { |
| 420 | return (quo, duo); |
| 421 | } |
| 422 | } else { |
| 423 | mask = quo - 1; |
| 424 | } |
| 425 | |
| 426 | // central loop |
| 427 | div = div.wrapping_sub(1); |
| 428 | let mut i = shl; |
| 429 | loop { |
| 430 | if i == 0 { |
| 431 | break; |
| 432 | } |
| 433 | i -= 1; |
| 434 | // shift left 1 and subtract |
| 435 | duo = duo.wrapping_shl(1).wrapping_sub(div); |
| 436 | // create mask |
| 437 | let mask = (duo as $iX).wrapping_shr($n - 1) as $uX; |
| 438 | // restore |
| 439 | duo = duo.wrapping_add(div & mask); |
| 440 | } |
| 441 | // unpack |
| 442 | return ((duo & mask) | quo, duo >> shl); |
| 443 | |
| 444 | // miscellanious binary long division algorithms that might be better for specific |
| 445 | // architectures |
| 446 | |
| 447 | // Another kind of long division uses an interesting fact that `div` and `pow` can be |
| 448 | // negated when `duo` is negative to perform a "negated" division step that works in |
| 449 | // place of any normalization mechanism. This is a non-restoring division algorithm that |
| 450 | // is very similar to the non-restoring division algorithms that can be found on the |
| 451 | // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can |
| 452 | // be viewed as shifting the least significant set bit right (e.x. if we enter a series |
| 453 | // of negated binary long division steps starting with `quo == 0b1011_0000` and |
| 454 | // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100, |
| 455 | // 0b1010_0010, 0b1010_0001). |
| 456 | /* |
| 457 | let div_original = div; |
| 458 | let shl = $normalization_shift(duo, div, true); |
| 459 | let mut div: $uX = (div << shl); |
| 460 | let mut pow: $uX = 1 << shl; |
| 461 | let mut quo: $uX = pow; |
| 462 | duo = duo.wrapping_sub(div); |
| 463 | if duo < div_original { |
| 464 | return (quo, duo); |
| 465 | } |
| 466 | div >>= 1; |
| 467 | pow >>= 1; |
| 468 | loop { |
| 469 | if (duo as $iX) < 0 { |
| 470 | // Negated binary long division step. |
| 471 | duo = duo.wrapping_add(div); |
| 472 | quo = quo.wrapping_sub(pow); |
| 473 | } else { |
| 474 | // Normal long division step. |
| 475 | if duo < div_original { |
| 476 | return (quo, duo) |
| 477 | } |
| 478 | duo = duo.wrapping_sub(div); |
| 479 | quo = quo.wrapping_add(pow); |
| 480 | } |
| 481 | pow >>= 1; |
| 482 | div >>= 1; |
| 483 | } |
| 484 | */ |
| 485 | |
| 486 | // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with |
| 487 | // SWAR techniques that makes the only difference between steps be negation of `div`. |
| 488 | // If there was an architecture with an instruction that negated inputs to an adder |
| 489 | // based on conditionals, and in place shifting (or a three input addition operation |
| 490 | // that can have `duo` as two of the inputs to effectively shift it left by 1), then a |
| 491 | // single instruction central loop is possible. Microarchitectures often have inputs to |
| 492 | // their ALU that can invert the arguments and carry in of adders, but the architectures |
| 493 | // unfortunately do not have an instruction to dynamically invert this input based on |
| 494 | // conditionals. |
| 495 | /* |
| 496 | // SWAR opening |
| 497 | let div_original = div; |
| 498 | let mut shl = $normalization_shift(duo, div, true); |
| 499 | let mut div: $uX = (div << shl); |
| 500 | duo = duo.wrapping_sub(div); |
| 501 | let mut quo: $uX = 1 << shl; |
| 502 | if duo < div_original { |
| 503 | return (quo, duo); |
| 504 | } |
| 505 | let mask: $uX; |
| 506 | if (div as $iX) < 0 { |
| 507 | div >>= 1; |
| 508 | shl -= 1; |
| 509 | let tmp = 1 << shl; |
| 510 | let sub = duo.wrapping_sub(div); |
| 511 | if (sub as $iX) >= 0 { |
| 512 | // restore |
| 513 | duo = sub; |
| 514 | quo |= tmp; |
| 515 | } |
| 516 | if duo < div_original { |
| 517 | return (quo, duo); |
| 518 | } |
| 519 | mask = tmp - 1; |
| 520 | } else { |
| 521 | mask = quo - 1; |
| 522 | } |
| 523 | |
| 524 | // central loop |
| 525 | let div: $uX = div.wrapping_sub(1); |
| 526 | let mut i = shl; |
| 527 | loop { |
| 528 | if i == 0 { |
| 529 | break; |
| 530 | } |
| 531 | i -= 1; |
| 532 | // note: the `wrapping_shl(1)` can be factored out, but would require another |
| 533 | // restoring division step to prevent `(duo as $iX)` from overflowing |
| 534 | if (duo as $iX) < 0 { |
| 535 | // Negated binary long division step. |
| 536 | duo = duo.wrapping_shl(1).wrapping_add(div); |
| 537 | } else { |
| 538 | // Normal long division step. |
| 539 | duo = duo.wrapping_shl(1).wrapping_sub(div); |
| 540 | } |
| 541 | } |
| 542 | if (duo as $iX) < 0 { |
| 543 | // Restore. This was not needed in the original nonrestoring algorithm because of |
| 544 | // the `duo < div_original` checks. |
| 545 | duo = duo.wrapping_add(div); |
| 546 | } |
| 547 | // unpack |
| 548 | return ((duo & mask) | quo, duo >> shl); |
| 549 | */ |
| 550 | } |
| 551 | }; |
| 552 | } |
| 553 | |