/// Creates an unsigned division function that uses a combination of hardware division and
/// binary long division to divide integers larger than what hardware division by itself can
/// handle. This function is intended for microarchitectures that have division hardware, but
/// whose multiplication hardware is not fast enough for `impl_trifecta` to be faster.
#[allow(unused_macros)]
macro_rules! impl_delegate {
    (
        $fn:ident, // name of the unsigned division function
        $zero_div_fn:ident, // function called when division by zero is attempted
        $half_normalization_shift:ident, // function for finding the normalization shift of $uX
        $half_division:ident, // function for division of a $uX by a $uX
        $n_h:expr, // the number of bits in $iH or $uH
        $uH:ident, // unsigned integer with half the bit width of $uX
        $uX:ident, // unsigned integer with half the bit width of $uD
        $uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
        $iD:ident // signed integer type with the same bit width as `$uD`
    ) => {
        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
        /// tuple.
        pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
            // The two-possibility algorithm, the undersubtracting long division algorithm, and
            // any kind of reciprocal-based algorithm will not be fastest here, because they
            // involve large multiplications that we assume are not fast enough relative to the
            // divisions to outweigh their setup times.

            // the number of bits in a $uX
            let n = $n_h * 2;

            let duo_lo = duo as $uX;
            let duo_hi = (duo >> n) as $uX;
            let div_lo = div as $uX;
            let div_hi = (div >> n) as $uX;

            match (div_lo == 0, div_hi == 0, duo_hi == 0) {
                (true, true, _) => $zero_div_fn(),
                (_, false, true) => {
                    // `duo` < `div`
                    return (0, duo);
                }
                (false, true, true) => {
                    // delegate to smaller division
                    let tmp = $half_division(duo_lo, div_lo);
                    return (tmp.0 as $uD, tmp.1 as $uD);
                }
                (false, true, false) => {
                    if duo_hi < div_lo {
                        // `quo_hi` will always be 0. This performs a binary long division algorithm
                        // to zero `duo_hi` followed by a half division.

                        // We can calculate the normalization shift using only `$uX` size
                        // functions. If we calculated the normalization shift using
                        // `$half_normalization_shift(duo_hi, div_lo, false)`, it would break
                        // that function's assumption that its first argument is larger than
                        // its second. With the arguments swapped the assumption holds, since
                        // `duo_hi < div_lo`.
                        let norm_shift = $half_normalization_shift(div_lo, duo_hi, false);
                        let shl = if norm_shift == 0 {
                            // Consider what happens if the msbs of `duo_hi` and `div_lo` align
                            // with no shifting. The normalization shift will always return
                            // `norm_shift == 0` regardless of whether it is fully normalized,
                            // because `duo_hi < div_lo`. In that edge case, `n - norm_shift`
                            // would result in shift overflow down the line. For the edge case,
                            // because both `duo_hi < div_lo` and we are comparing all the
                            // significant bits of `duo_hi` and `div`, we can make `shl = n - 1`.
                            n - 1
                        } else {
                            // We also cannot just use `shl = n - norm_shift - 1` in the general
                            // case, because when we are not in the edge case comparing all of
                            // the significant bits, the full `duo < div` condition may not
                            // hold, which would break the division algorithm.
                            n - norm_shift
                        };
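                        // As a concrete illustration (assuming `$half_normalization_shift`
                        // returns the distance between the msbs of its arguments, and taking
                        // `n = 64`): `duo_hi = 5` and `div_lo = 9` give `norm_shift == 1` and
                        // `shl == 63`, which aligns the msb of `div << shl` with the msb of
                        // `duo`. With `div_lo = 7` the msbs already align, `norm_shift == 0`
                        // is returned, and `shl = n` would make `1 << shl` overflow `pow_lo`,
                        // hence the `shl = n - 1` edge case above.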

                        // The 3 variable restoring division algorithm (see binary_long.rs) is ideal
                        // for this task, since `pow` and `quo` can be `$uX` and the delegation
                        // check is simple.
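                        // An illustrative invariant for the loop below: writing `duo_start`
                        // for `duo` as it is here and `d` for the unshifted divisor (the
                        // `div` argument of `$fn`), each iteration preserves
                        // `duo_start == d * quo_lo + duo`, since the shifted `div` and
                        // `pow_lo` always represent `d << shift` and `1 << shift` for the
                        // same `shift`. Once `duo_hi` is 0, one half division finishes the
                        // quotient.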
                        let mut div: $uD = div << shl;
                        let mut pow_lo: $uX = 1 << shl;
                        let mut quo_lo: $uX = 0;
                        let mut duo = duo;
                        loop {
                            let sub = duo.wrapping_sub(div);
                            if 0 <= (sub as $iD) {
                                duo = sub;
                                quo_lo |= pow_lo;
                                let duo_hi = (duo >> n) as $uX;
                                if duo_hi == 0 {
                                    // Delegate to get the rest of the quotient. Note that the
                                    // `div_lo` here is the original unshifted `div`.
                                    let tmp = $half_division(duo as $uX, div_lo);
                                    return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD);
                                }
                            }
                            div >>= 1;
                            pow_lo >>= 1;
                        }
                    } else if duo_hi == div_lo {
                        // `quo_hi == 1`. This branch is cheap and helps with edge cases.
                        let tmp = $half_division(duo as $uX, div as $uX);
                        return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD);
                    } else {
                        // `div_lo < duo_hi`
                        // `rem_hi == 0`
                        if (div_lo >> $n_h) == 0 {
                            // Short division of $uD by a $uH, using $uX by $uX division
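                            // Viewing `duo` as four base-2^$n_h digits `[d3, d2, d1, d0]`,
                            // this is schoolbook short division: the two-digit head (`duo_hi`)
                            // is divided first, and each remainder is carried in front of the
                            // next digit. Every remainder is less than `div_0 < 2^$n_h`, so
                            // `(rem << $n_h) | digit` cannot overflow a `$uX`.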
                            let div_0 = div_lo as $uH as $uX;
                            let (quo_hi, rem_3) = $half_division(duo_hi, div_0);

                            let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
                            let (quo_1, rem_2) = $half_division(duo_mid, div_0);

                            let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
                            let (quo_0, rem_1) = $half_division(duo_lo, div_0);

                            return (
                                (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
                                rem_1 as $uD,
                            );
                        }

                        // This is basically a short division composed of a half division for the hi
                        // part, specialized 3 variable binary long division in the middle, and
                        // another half division for the lo part.
                        let duo_lo = duo as $uX;
                        let tmp = $half_division(duo_hi, div_lo);
                        let quo_hi = tmp.0;
                        let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n);
                        // The long division below requires `duo >= div`, so this check is
                        // required to avoid breaking it.
                        if duo < div {
                            return ((quo_hi as $uD) << n, duo);
                        }

                        // The half division handled all shift alignments down to `n`, so this
                        // division can continue with a shift of `n - 1`. (The remainder from
                        // the half division is less than `div`, so `duo < div << n` holds
                        // here.)
                        let mut div: $uD = div << (n - 1);
                        let mut pow_lo: $uX = 1 << (n - 1);
                        let mut quo_lo: $uX = 0;
                        loop {
                            let sub = duo.wrapping_sub(div);
                            if 0 <= (sub as $iD) {
                                duo = sub;
                                quo_lo |= pow_lo;
                                let duo_hi = (duo >> n) as $uX;
                                if duo_hi == 0 {
                                    // Delegate to get the rest of the quotient. Note that the
                                    // `div_lo` here is the original unshifted `div`.
                                    let tmp = $half_division(duo as $uX, div_lo);
                                    return (
                                        (tmp.0 as $uD) | (quo_lo as $uD) | ((quo_hi as $uD) << n),
                                        tmp.1 as $uD,
                                    );
                                }
                            }
                            div >>= 1;
                            pow_lo >>= 1;
                        }
                    }
                }
                (_, false, false) => {
                    // Full $uD by $uD binary long division. `quo_hi` will always be 0, since
                    // `div_hi != 0` implies `div >= (1 << n)` and thus `duo / div < (1 << n)`.
                    if duo < div {
                        return (0, duo);
                    }
                    let div_original = div;
                    let shl = $half_normalization_shift(duo_hi, div_hi, false);
                    let mut duo = duo;
                    let mut div: $uD = div << shl;
                    let mut pow_lo: $uX = 1 << shl;
                    let mut quo_lo: $uX = 0;
                    loop {
                        let sub = duo.wrapping_sub(div);
                        if 0 <= (sub as $iD) {
                            duo = sub;
                            quo_lo |= pow_lo;
                            if duo < div_original {
                                return (quo_lo as $uD, duo);
                            }
                        }
                        div >>= 1;
                        pow_lo >>= 1;
                    }
                }
            }
        }
    };
}
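
// As an illustration, a hypothetical invocation that builds `u128` division on top of
// `u64` hardware division (reusing the helper functions that `u128_divide_sparc` below
// relies on) could look like:
//
// impl_delegate!(
//     u128_div_rem_delegate, // hypothetical name for the generated function
//     zero_div_fn,
//     u64_normalization_shift,
//     u64_by_u64_div_rem,
//     32,   // $n_h: the number of bits in a u32
//     u32,  // $uH
//     u64,  // $uX
//     u128, // $uD
//     i128  // $iD
// );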

public_test_dep! {
    /// Returns `n / d` and sets `*rem = n % d`.
    ///
    /// This specialization exists because:
    /// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
    ///   so we have to use an old-fashioned `&mut u128` argument to return the remainder.
    /// - 64-bit SPARC does not have `u64 * u64 => u128` widening multiplication, which makes the
    ///   delegate algorithm strategy the only reasonably fast way to perform `u128` division.
    // used on SPARC
    #[allow(dead_code)]
    pub(crate) fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
        use super::*;
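        // What follows is effectively the body of the division generated by
        // `impl_delegate` above, manually specialized to `$uH = u32`, `$uX = u64`,
        // `$uD = u128`, and `$n_h = 32`; see the comments in the macro for an
        // explanation of each branch.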
        let duo_lo = duo as u64;
        let duo_hi = (duo >> 64) as u64;
        let div_lo = div as u64;
        let div_hi = (div >> 64) as u64;

        match (div_lo == 0, div_hi == 0, duo_hi == 0) {
            (true, true, _) => zero_div_fn(),
            (_, false, true) => {
                *rem = duo;
                return 0;
            }
            (false, true, true) => {
                let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
                *rem = tmp.1 as u128;
                return tmp.0 as u128;
            }
            (false, true, false) => {
                if duo_hi < div_lo {
                    let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
                    let shl = if norm_shift == 0 {
                        64 - 1
                    } else {
                        64 - norm_shift
                    };

                    let mut div: u128 = div << shl;
                    let mut pow_lo: u64 = 1 << shl;
                    let mut quo_lo: u64 = 0;
                    let mut duo = duo;
                    loop {
                        let sub = duo.wrapping_sub(div);
                        if 0 <= (sub as i128) {
                            duo = sub;
                            quo_lo |= pow_lo;
                            let duo_hi = (duo >> 64) as u64;
                            if duo_hi == 0 {
                                let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
                                *rem = tmp.1 as u128;
                                return (quo_lo | tmp.0) as u128;
                            }
                        }
                        div >>= 1;
                        pow_lo >>= 1;
                    }
                } else if duo_hi == div_lo {
                    let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
                    *rem = tmp.1 as u128;
                    return (1 << 64) | (tmp.0 as u128);
                } else {
                    if (div_lo >> 32) == 0 {
                        let div_0 = div_lo as u32 as u64;
                        let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);

                        let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
                        let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);

                        let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
                        let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);

                        *rem = rem_1 as u128;
                        return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
                    }

                    let duo_lo = duo as u64;
                    let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
                    let quo_hi = tmp.0;
                    let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
                    if duo < div {
                        *rem = duo;
                        return (quo_hi as u128) << 64;
                    }

                    let mut div: u128 = div << (64 - 1);
                    let mut pow_lo: u64 = 1 << (64 - 1);
                    let mut quo_lo: u64 = 0;
                    loop {
                        let sub = duo.wrapping_sub(div);
                        if 0 <= (sub as i128) {
                            duo = sub;
                            quo_lo |= pow_lo;
                            let duo_hi = (duo >> 64) as u64;
                            if duo_hi == 0 {
                                let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
                                *rem = tmp.1 as u128;
                                return (tmp.0 as u128) | (quo_lo as u128) | ((quo_hi as u128) << 64);
                            }
                        }
                        div >>= 1;
                        pow_lo >>= 1;
                    }
                }
            }
            (_, false, false) => {
                if duo < div {
                    *rem = duo;
                    return 0;
                }
                let div_original = div;
                let shl = u64_normalization_shift(duo_hi, div_hi, false);
                let mut duo = duo;
                let mut div: u128 = div << shl;
                let mut pow_lo: u64 = 1 << shl;
                let mut quo_lo: u64 = 0;
                loop {
                    let sub = duo.wrapping_sub(div);
                    if 0 <= (sub as i128) {
                        duo = sub;
                        quo_lo |= pow_lo;
                        if duo < div_original {
                            *rem = duo;
                            return quo_lo as u128;
                        }
                    }
                    div >>= 1;
                    pow_lo >>= 1;
                }
            }
        }
    }
}
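
// A minimal usage sketch of `u128_divide_sparc` (illustrative values):
//
// let mut rem: u128 = 0;
// let quo = u128_divide_sparc(u128::MAX, 3, &mut rem);
// assert_eq!(quo, u128::MAX / 3);
// assert_eq!(rem, u128::MAX % 3);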