1 | // Adapted from https://github.com/Alexhuszagh/rust-lexical. |
2 | |
3 | //! Building-blocks for arbitrary-precision math. |
4 | //! |
5 | //! These algorithms assume little-endian order for the large integer |
6 | //! buffers, so for a `vec![0, 1, 2, 3]`, `3` is the most significant limb, |
7 | //! and `0` is the least significant limb. |
8 | |
9 | use super::large_powers; |
10 | use super::num::*; |
11 | use super::small_powers::*; |
12 | use alloc::vec::Vec; |
13 | use core::{cmp, iter, mem}; |
14 | |
15 | // ALIASES |
16 | // ------- |
17 | |
18 | // Type for a single limb of the big integer. |
19 | // |
// A limb is analogous to a digit in base 10, except it stores 32-bit
// or 64-bit numbers instead.
//
// This should cover all known 64-bit platforms supported by Rust.
24 | // https://forge.rust-lang.org/platform-support.html |
25 | // |
// Platforms where native 128-bit multiplication is explicitly supported:
// - x86_64 (Supported via `MUL`).
// - mips64 (Supported via `DMULTU`, from which `HI` and `LO` can be read).
//
// Platforms where native 64-bit multiplication is supported and
// the hi-lo parts of a 64-bit multiplication can be extracted:
// - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits).
// - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits).
//
// Platforms where native 128-bit multiplication is not supported,
// requiring software emulation:
// - sparc64 (`UMUL` only supports double-word arguments).
38 | |
39 | // 32-BIT LIMB |
40 | #[cfg (limb_width_32)] |
41 | pub type Limb = u32; |
42 | |
43 | #[cfg (limb_width_32)] |
44 | pub const POW5_LIMB: &[Limb] = &POW5_32; |
45 | |
46 | #[cfg (limb_width_32)] |
47 | pub const POW10_LIMB: &[Limb] = &POW10_32; |
48 | |
49 | #[cfg (limb_width_32)] |
50 | type Wide = u64; |
51 | |
52 | // 64-BIT LIMB |
53 | #[cfg (limb_width_64)] |
54 | pub type Limb = u64; |
55 | |
56 | #[cfg (limb_width_64)] |
57 | pub const POW5_LIMB: &[Limb] = &POW5_64; |
58 | |
59 | #[cfg (limb_width_64)] |
60 | pub const POW10_LIMB: &[Limb] = &POW10_64; |
61 | |
62 | #[cfg (limb_width_64)] |
63 | type Wide = u128; |
64 | |
65 | /// Cast to limb type. |
66 | #[inline ] |
67 | pub(crate) fn as_limb<T: Integer>(t: T) -> Limb { |
68 | Limb::as_cast(t) |
69 | } |
70 | |
71 | /// Cast to wide type. |
72 | #[inline ] |
73 | fn as_wide<T: Integer>(t: T) -> Wide { |
74 | Wide::as_cast(t) |
75 | } |
76 | |
77 | // SPLIT |
78 | // ----- |
79 | |
80 | /// Split u64 into limbs, in little-endian order. |
81 | #[inline ] |
82 | #[cfg (limb_width_32)] |
83 | fn split_u64(x: u64) -> [Limb; 2] { |
84 | [as_limb(x), as_limb(x >> 32)] |
85 | } |
86 | |
87 | /// Split u64 into limbs, in little-endian order. |
88 | #[inline ] |
89 | #[cfg (limb_width_64)] |
90 | fn split_u64(x: u64) -> [Limb; 1] { |
91 | [as_limb(x)] |
92 | } |
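
// Editor's illustrative sketch (`split_u64_sketch` is not part of the
// upstream code): recombining the limbs from `split_u64` must reproduce the
// original value, whichever limb width is configured.
#[cfg(test)]
#[test]
fn split_u64_sketch() {
    let x: u64 = 0x0123_4567_89AB_CDEF;
    let bits = mem::size_of::<Limb>() * 8;
    let mut v: u64 = 0;
    for (i, &limb) in split_u64(x).iter().enumerate() {
        // Limbs are little-endian: limb `i` contributes at bit offset `i * bits`.
        v |= (limb as u64) << (i * bits);
    }
    assert_eq!(v, x);
}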
93 | |
94 | // HI64 |
95 | // ---- |
96 | |
97 | // NONZERO |
98 | |
/// Check if any of the limbs, excluding the top `rindex` limbs, are non-zero.
100 | #[inline ] |
101 | pub fn nonzero<T: Integer>(x: &[T], rindex: usize) -> bool { |
102 | let len = x.len(); |
103 | let slc = &x[..len - rindex]; |
104 | slc.iter().rev().any(|&x| x != T::ZERO) |
105 | } |
106 | |
107 | /// Shift 64-bit integer to high 64-bits. |
108 | #[inline ] |
109 | fn u64_to_hi64_1(r0: u64) -> (u64, bool) { |
110 | debug_assert!(r0 != 0); |
111 | let ls = r0.leading_zeros(); |
112 | (r0 << ls, false) |
113 | } |
114 | |
115 | /// Shift 2 64-bit integers to high 64-bits. |
116 | #[inline ] |
117 | fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) { |
118 | debug_assert!(r0 != 0); |
119 | let ls = r0.leading_zeros(); |
120 | let rs = 64 - ls; |
121 | let v = match ls { |
122 | 0 => r0, |
123 | _ => (r0 << ls) | (r1 >> rs), |
124 | }; |
125 | let n = r1 << ls != 0; |
126 | (v, n) |
127 | } |
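
// Editor's illustrative worked example (`u64_to_hi64_2_sketch` is not part
// of the upstream code): `2` has 62 leading zeros, so the result is
// `(2 << 62) | (1 >> 2)`, and the truncation flag is set because non-zero
// low bits of `r1` were shifted out.
#[cfg(test)]
#[test]
fn u64_to_hi64_2_sketch() {
    assert_eq!(u64_to_hi64_2(2, 1), (0x8000_0000_0000_0000, true));
}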
128 | |
129 | /// Trait to export the high 64-bits from a little-endian slice. |
130 | trait Hi64<T>: AsRef<[T]> { |
131 | /// Get the hi64 bits from a 1-limb slice. |
132 | fn hi64_1(&self) -> (u64, bool); |
133 | |
134 | /// Get the hi64 bits from a 2-limb slice. |
135 | fn hi64_2(&self) -> (u64, bool); |
136 | |
137 | /// Get the hi64 bits from a 3-limb slice. |
138 | fn hi64_3(&self) -> (u64, bool); |
139 | |
140 | /// High-level exporter to extract the high 64 bits from a little-endian slice. |
141 | #[inline ] |
142 | fn hi64(&self) -> (u64, bool) { |
143 | match self.as_ref().len() { |
144 | 0 => (0, false), |
145 | 1 => self.hi64_1(), |
146 | 2 => self.hi64_2(), |
147 | _ => self.hi64_3(), |
148 | } |
149 | } |
150 | } |
151 | |
152 | impl Hi64<u32> for [u32] { |
153 | #[inline ] |
154 | fn hi64_1(&self) -> (u64, bool) { |
155 | debug_assert!(self.len() == 1); |
156 | let r0 = self[0] as u64; |
157 | u64_to_hi64_1(r0) |
158 | } |
159 | |
160 | #[inline ] |
161 | fn hi64_2(&self) -> (u64, bool) { |
162 | debug_assert!(self.len() == 2); |
163 | let r0 = (self[1] as u64) << 32; |
164 | let r1 = self[0] as u64; |
165 | u64_to_hi64_1(r0 | r1) |
166 | } |
167 | |
168 | #[inline ] |
169 | fn hi64_3(&self) -> (u64, bool) { |
170 | debug_assert!(self.len() >= 3); |
171 | let r0 = self[self.len() - 1] as u64; |
172 | let r1 = (self[self.len() - 2] as u64) << 32; |
173 | let r2 = self[self.len() - 3] as u64; |
174 | let (v, n) = u64_to_hi64_2(r0, r1 | r2); |
175 | (v, n || nonzero(self, 3)) |
176 | } |
177 | } |
178 | |
179 | impl Hi64<u64> for [u64] { |
180 | #[inline ] |
181 | fn hi64_1(&self) -> (u64, bool) { |
182 | debug_assert!(self.len() == 1); |
183 | let r0 = self[0]; |
184 | u64_to_hi64_1(r0) |
185 | } |
186 | |
187 | #[inline ] |
188 | fn hi64_2(&self) -> (u64, bool) { |
189 | debug_assert!(self.len() >= 2); |
190 | let r0 = self[self.len() - 1]; |
191 | let r1 = self[self.len() - 2]; |
192 | let (v, n) = u64_to_hi64_2(r0, r1); |
193 | (v, n || nonzero(self, 2)) |
194 | } |
195 | |
196 | #[inline ] |
197 | fn hi64_3(&self) -> (u64, bool) { |
198 | self.hi64_2() |
199 | } |
200 | } |
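
// Editor's illustrative sketch (`hi64_sketch` is not part of the upstream
// code): the high 64 bits come from the most-significant limbs of the
// little-endian slice, and the `bool` reports whether any non-zero lower
// bits were truncated.
#[cfg(test)]
#[test]
fn hi64_sketch() {
    // `[1, 2]` is 2 * 2^64 + 1; the trailing `1` is truncated out of the
    // high 64 bits, so the flag is set.
    let x: &[u64] = &[1, 2];
    assert_eq!(x.hi64(), (0x8000_0000_0000_0000, true));

    // A single limb is normalized so its leading bit lands at bit 63.
    let y: &[u64] = &[1];
    assert_eq!(y.hi64(), (0x8000_0000_0000_0000, false));

    // 32-bit limbs: the value is 1 * 2^96 + 5; the `5` is far below the
    // high 64 bits but still sets the truncation flag.
    let z: &[u32] = &[5, 0, 0, 1];
    assert_eq!(z.hi64(), (0x8000_0000_0000_0000, true));
}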
201 | |
202 | // SCALAR |
203 | // ------ |
204 | |
// Scalar-to-scalar operations, used as building blocks for
// arbitrary-precision operations.
207 | |
208 | mod scalar { |
209 | use super::*; |
210 | |
211 | // ADDITION |
212 | |
/// Add two small integers, returning the value and whether overflow occurred.
214 | #[inline ] |
215 | pub fn add(x: Limb, y: Limb) -> (Limb, bool) { |
216 | x.overflowing_add(y) |
217 | } |
218 | |
/// AddAssign two small integers, returning whether overflow occurred.
220 | #[inline ] |
221 | pub fn iadd(x: &mut Limb, y: Limb) -> bool { |
222 | let t = add(*x, y); |
223 | *x = t.0; |
224 | t.1 |
225 | } |
226 | |
227 | // SUBTRACTION |
228 | |
/// Subtract two small integers, returning the value and whether overflow occurred.
230 | #[inline ] |
231 | pub fn sub(x: Limb, y: Limb) -> (Limb, bool) { |
232 | x.overflowing_sub(y) |
233 | } |
234 | |
/// SubAssign two small integers, returning whether overflow occurred.
236 | #[inline ] |
237 | pub fn isub(x: &mut Limb, y: Limb) -> bool { |
238 | let t = sub(*x, y); |
239 | *x = t.0; |
240 | t.1 |
241 | } |
242 | |
243 | // MULTIPLICATION |
244 | |
/// Multiply two small integers with carry.
///
/// Returns the (low, high) components of the product.
248 | #[inline ] |
249 | pub fn mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) { |
250 | // Cannot overflow, as long as wide is 2x as wide. This is because |
251 | // the following is always true: |
252 | // `Wide::max_value() - (Narrow::max_value() * Narrow::max_value()) >= Narrow::max_value()` |
253 | let z: Wide = as_wide(x) * as_wide(y) + as_wide(carry); |
254 | let bits = mem::size_of::<Limb>() * 8; |
255 | (as_limb(z), as_limb(z >> bits)) |
256 | } |
257 | |
/// MulAssign two small integers with carry, returning the carry-out limb.
259 | #[inline ] |
260 | pub fn imul(x: &mut Limb, y: Limb, carry: Limb) -> Limb { |
261 | let t = mul(*x, y, carry); |
262 | *x = t.0; |
263 | t.1 |
264 | } |
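
// Editor's illustrative sketch (`scalar_sketch` is not part of the upstream
// code): `mul` returns the wide product split into (low, high) limbs, while
// `iadd`/`isub` report the carry/borrow as a bool.
#[cfg(test)]
#[test]
fn scalar_sketch() {
    // Limb::max_value() * 2 + 1 == (2^BITS - 1) + 1 * 2^BITS.
    assert_eq!(mul(Limb::max_value(), 2, 1), (Limb::max_value(), 1));

    let mut x: Limb = Limb::max_value();
    assert!(iadd(&mut x, 1)); // Wraps to 0 with a carry out.
    assert_eq!(x, 0);
    assert!(isub(&mut x, 1)); // Wraps back with a borrow.
    assert_eq!(x, Limb::max_value());
}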
265 | } // scalar |
266 | |
267 | // SMALL |
268 | // ----- |
269 | |
270 | // Large-to-small operations, to modify a big integer from a native scalar. |
271 | |
272 | mod small { |
273 | use super::*; |
274 | |
// ADDITION
278 | |
279 | /// Implied AddAssign implementation for adding a small integer to bigint. |
280 | /// |
281 | /// Allows us to choose a start-index in x to store, to allow incrementing |
282 | /// from a non-zero start. |
283 | #[inline ] |
284 | pub fn iadd_impl(x: &mut Vec<Limb>, y: Limb, xstart: usize) { |
285 | if x.len() <= xstart { |
286 | x.push(y); |
287 | } else { |
288 | // Initial add |
289 | let mut carry = scalar::iadd(&mut x[xstart], y); |
290 | |
291 | // Increment until overflow stops occurring. |
292 | let mut size = xstart + 1; |
293 | while carry && size < x.len() { |
294 | carry = scalar::iadd(&mut x[size], 1); |
295 | size += 1; |
296 | } |
297 | |
298 | // If we overflowed the buffer entirely, need to add 1 to the end |
299 | // of the buffer. |
300 | if carry { |
301 | x.push(1); |
302 | } |
303 | } |
304 | } |
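
// Editor's illustrative sketch (`iadd_impl_sketch` is not part of the
// upstream code): adding 1 to a buffer of all-ones limbs carries through
// every limb and grows the buffer by one.
#[cfg(test)]
#[test]
fn iadd_impl_sketch() {
    let mut x: Vec<Limb> = [Limb::max_value(), Limb::max_value()].to_vec();
    iadd_impl(&mut x, 1, 0);
    let expected: [Limb; 3] = [0, 0, 1];
    assert_eq!(x.as_slice(), &expected[..]);
}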
305 | |
306 | /// AddAssign small integer to bigint. |
307 | #[inline ] |
308 | pub fn iadd(x: &mut Vec<Limb>, y: Limb) { |
309 | iadd_impl(x, y, 0); |
310 | } |
311 | |
312 | // SUBTRACTION |
313 | |
314 | /// SubAssign small integer to bigint. |
315 | /// Does not do overflowing subtraction. |
316 | #[inline ] |
317 | pub fn isub_impl(x: &mut Vec<Limb>, y: Limb, xstart: usize) { |
318 | debug_assert!(x.len() > xstart && (x[xstart] >= y || x.len() > xstart + 1)); |
319 | |
320 | // Initial subtraction |
321 | let mut carry = scalar::isub(&mut x[xstart], y); |
322 | |
// Propagate the borrow until it stops occurring.
324 | let mut size = xstart + 1; |
325 | while carry && size < x.len() { |
326 | carry = scalar::isub(&mut x[size], 1); |
327 | size += 1; |
328 | } |
329 | normalize(x); |
330 | } |
331 | |
332 | // MULTIPLICATION |
333 | |
334 | /// MulAssign small integer to bigint. |
335 | #[inline ] |
336 | pub fn imul(x: &mut Vec<Limb>, y: Limb) { |
337 | // Multiply iteratively over all elements, adding the carry each time. |
338 | let mut carry: Limb = 0; |
339 | for xi in &mut *x { |
340 | carry = scalar::imul(xi, y, carry); |
341 | } |
342 | |
343 | // Overflow of value, add to end. |
344 | if carry != 0 { |
345 | x.push(carry); |
346 | } |
347 | } |
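
// Editor's illustrative sketch (`small_imul_sketch` is not part of the
// upstream code): the carry out of the most-significant limb is pushed as a
// new limb.
#[cfg(test)]
#[test]
fn small_imul_sketch() {
    let mut x: Vec<Limb> = [Limb::max_value()].to_vec();
    imul(&mut x, 3);
    // Limb::max_value() * 3 == (Limb::max_value() - 2) + 2 * 2^BITS.
    let expected: [Limb; 2] = [Limb::max_value() - 2, 2];
    assert_eq!(x.as_slice(), &expected[..]);
}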
348 | |
349 | /// Mul small integer to bigint. |
350 | #[inline ] |
351 | pub fn mul(x: &[Limb], y: Limb) -> Vec<Limb> { |
352 | let mut z = Vec::<Limb>::default(); |
353 | z.extend_from_slice(x); |
354 | imul(&mut z, y); |
355 | z |
356 | } |
357 | |
358 | /// MulAssign by a power. |
359 | /// |
360 | /// Theoretically... |
361 | /// |
362 | /// Use an exponentiation by squaring method, since it reduces the time |
363 | /// complexity of the multiplication to ~`O(log(n))` for the squaring, |
364 | /// and `O(n*m)` for the result. Since `m` is typically a lower-order |
365 | /// factor, this significantly reduces the number of multiplications |
366 | /// we need to do. Iteratively multiplying by small powers follows |
367 | /// the nth triangular number series, which scales as `O(p^2)`, but |
368 | /// where `p` is `n+m`. In short, it scales very poorly. |
369 | /// |
370 | /// Practically.... |
371 | /// |
372 | /// Exponentiation by Squaring: |
373 | /// running 2 tests |
374 | /// test bigcomp_f32_lexical ... bench: 1,018 ns/iter (+/- 78) |
375 | /// test bigcomp_f64_lexical ... bench: 3,639 ns/iter (+/- 1,007) |
376 | /// |
377 | /// Exponentiation by Iterative Small Powers: |
378 | /// running 2 tests |
379 | /// test bigcomp_f32_lexical ... bench: 518 ns/iter (+/- 31) |
380 | /// test bigcomp_f64_lexical ... bench: 583 ns/iter (+/- 47) |
381 | /// |
382 | /// Exponentiation by Iterative Large Powers (of 2): |
383 | /// running 2 tests |
384 | /// test bigcomp_f32_lexical ... bench: 671 ns/iter (+/- 31) |
385 | /// test bigcomp_f64_lexical ... bench: 1,394 ns/iter (+/- 47) |
386 | /// |
387 | /// Even using worst-case scenarios, exponentiation by squaring is |
388 | /// significantly slower for our workloads. Just multiply by small powers, |
389 | /// in simple cases, and use precalculated large powers in other cases. |
390 | pub fn imul_pow5(x: &mut Vec<Limb>, n: u32) { |
391 | use super::large::KARATSUBA_CUTOFF; |
392 | |
393 | let small_powers = POW5_LIMB; |
394 | let large_powers = large_powers::POW5; |
395 | |
396 | if n == 0 { |
397 | // No exponent, just return. |
398 | // The 0-index of the large powers is `2^0`, which is 1, so we want |
399 | // to make sure we don't take that path with a literal 0. |
400 | return; |
401 | } |
402 | |
// Use the asymptotically faster algorithm with precalculated large powers
// if the result is large enough that Karatsuba multiplication will kick
// in; otherwise, just multiply iteratively by small powers.
406 | let bit_length = 32 - n.leading_zeros() as usize; |
407 | debug_assert!(bit_length != 0 && bit_length <= large_powers.len()); |
408 | if x.len() + large_powers[bit_length - 1].len() < 2 * KARATSUBA_CUTOFF { |
409 | // We can use iterative small powers to make this faster for the |
410 | // easy cases. |
411 | |
412 | // Multiply by the largest small power until n < step. |
413 | let step = small_powers.len() - 1; |
414 | let power = small_powers[step]; |
415 | let mut n = n as usize; |
416 | while n >= step { |
417 | imul(x, power); |
418 | n -= step; |
419 | } |
420 | |
421 | // Multiply by the remainder. |
422 | imul(x, small_powers[n]); |
423 | } else { |
// In theory, this path should be asymptotically a lot faster; in
// practice, our small::imul seems to be the limiting step, and
// large::imul is slow as well.
427 | |
428 | // Multiply by higher order powers. |
429 | let mut idx: usize = 0; |
430 | let mut bit: usize = 1; |
431 | let mut n = n as usize; |
432 | while n != 0 { |
433 | if n & bit != 0 { |
434 | debug_assert!(idx < large_powers.len()); |
435 | large::imul(x, large_powers[idx]); |
436 | n ^= bit; |
437 | } |
438 | idx += 1; |
439 | bit <<= 1; |
440 | } |
441 | } |
442 | } |
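
// Editor's illustrative sketch (`imul_pow5_sketch` is not part of the
// upstream code): for a small exponent, `imul_pow5` must agree with naive
// repeated multiplication by 5.
#[cfg(test)]
#[test]
fn imul_pow5_sketch() {
    let mut x: Vec<Limb> = [1 as Limb].to_vec();
    imul_pow5(&mut x, 20);

    // Naive reference: multiply by 5, twenty times.
    let mut y: Vec<Limb> = [1 as Limb].to_vec();
    for _ in 0..20 {
        imul(&mut y, 5);
    }
    assert_eq!(x, y);
}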
443 | |
444 | // BIT LENGTH |
445 | |
/// Get the number of leading zero bits in the most-significant limb of the storage.
447 | #[inline ] |
448 | pub fn leading_zeros(x: &[Limb]) -> usize { |
449 | x.last().map_or(0, |x| x.leading_zeros() as usize) |
450 | } |
451 | |
452 | /// Calculate the bit-length of the big-integer. |
453 | #[inline ] |
454 | pub fn bit_length(x: &[Limb]) -> usize { |
455 | let bits = mem::size_of::<Limb>() * 8; |
456 | // Avoid overflowing, calculate via total number of bits |
457 | // minus leading zero bits. |
458 | let nlz = leading_zeros(x); |
459 | bits.checked_mul(x.len()) |
460 | .map_or_else(usize::max_value, |v| v - nlz) |
461 | } |
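
// Editor's illustrative sketch (`bit_length_sketch` is not part of the
// upstream code): with limbs `[0, 0b100]` the highest set bit is bit 2 of
// the second limb, so the bit length is `Limb::BITS + 3`.
#[cfg(test)]
#[test]
fn bit_length_sketch() {
    let bits = mem::size_of::<Limb>() * 8;
    let x: [Limb; 2] = [0, 0b100];
    assert_eq!(bit_length(&x), bits + 3);
}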
462 | |
463 | // SHL |
464 | |
465 | /// Shift-left bits inside a buffer. |
466 | /// |
/// Assumes `n < Limb::BITS`, i.e., the shift stays within each limb.
468 | #[inline ] |
469 | pub fn ishl_bits(x: &mut Vec<Limb>, n: usize) { |
// Need to shift by `n` bits, where `n < Limb::BITS`.
471 | let bits = mem::size_of::<Limb>() * 8; |
472 | debug_assert!(n < bits); |
473 | if n == 0 { |
474 | return; |
475 | } |
476 | |
477 | // Internally, for each item, we shift left by n, and add the previous |
478 | // right shifted limb-bits. |
479 | // For example, we transform (for u8) shifted left 2, to: |
480 | // b10100100 b01000010 |
481 | // b10 b10010001 b00001000 |
482 | let rshift = bits - n; |
483 | let lshift = n; |
484 | let mut prev: Limb = 0; |
485 | for xi in &mut *x { |
486 | let tmp = *xi; |
487 | *xi <<= lshift; |
488 | *xi |= prev >> rshift; |
489 | prev = tmp; |
490 | } |
491 | |
// Push the bits shifted out of the most-significant limb, if any.
493 | let carry = prev >> rshift; |
494 | if carry != 0 { |
495 | x.push(carry); |
496 | } |
497 | } |
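
// Editor's illustrative sketch (`ishl_bits_sketch` is not part of the
// upstream code): shifting an all-ones limb left by one doubles the value,
// pushing the bit shifted out of the top as a new limb.
#[cfg(test)]
#[test]
fn ishl_bits_sketch() {
    let mut x: Vec<Limb> = [Limb::max_value()].to_vec();
    ishl_bits(&mut x, 1);
    let expected: [Limb; 2] = [Limb::max_value() - 1, 1];
    assert_eq!(x.as_slice(), &expected[..]);
}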
498 | |
/// Shift-left `n` limbs inside a buffer.
500 | /// |
501 | /// Assumes `n` is not 0. |
502 | #[inline ] |
503 | pub fn ishl_limbs(x: &mut Vec<Limb>, n: usize) { |
504 | debug_assert!(n != 0); |
505 | if !x.is_empty() { |
506 | x.reserve(n); |
507 | x.splice(..0, iter::repeat(0).take(n)); |
508 | } |
509 | } |
510 | |
511 | /// Shift-left buffer by n bits. |
512 | #[inline ] |
513 | pub fn ishl(x: &mut Vec<Limb>, n: usize) { |
514 | let bits = mem::size_of::<Limb>() * 8; |
// Need to pad with zero limbs for `n / Limb::BITS`,
// and shift-left with carry by `n % Limb::BITS` bits.
517 | let rem = n % bits; |
518 | let div = n / bits; |
519 | ishl_bits(x, rem); |
520 | if div != 0 { |
521 | ishl_limbs(x, div); |
522 | } |
523 | } |
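
// Editor's illustrative sketch (`ishl_sketch` is not part of the upstream
// code): a shift by `Limb::BITS + 1` pads one zero limb and shifts one bit
// internally, so `1` becomes `2^(Limb::BITS + 1)`.
#[cfg(test)]
#[test]
fn ishl_sketch() {
    let bits = mem::size_of::<Limb>() * 8;
    let mut x: Vec<Limb> = [1 as Limb].to_vec();
    ishl(&mut x, bits + 1);
    let expected: [Limb; 2] = [0, 2];
    assert_eq!(x.as_slice(), &expected[..]);
}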
524 | |
525 | // NORMALIZE |
526 | |
527 | /// Normalize the container by popping any leading zeros. |
528 | #[inline ] |
529 | pub fn normalize(x: &mut Vec<Limb>) { |
// Pop any most-significant zero limbs so the representation
// stays normalized.
532 | while x.last() == Some(&0) { |
533 | x.pop(); |
534 | } |
535 | } |
536 | } // small |
537 | |
538 | // LARGE |
539 | // ----- |
540 | |
// Large-to-large operations, to modify a big integer from another big integer.
542 | |
543 | mod large { |
544 | use super::*; |
545 | |
546 | // RELATIVE OPERATORS |
547 | |
548 | /// Compare `x` to `y`, in little-endian order. |
549 | #[inline ] |
550 | pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering { |
551 | if x.len() > y.len() { |
552 | cmp::Ordering::Greater |
553 | } else if x.len() < y.len() { |
554 | cmp::Ordering::Less |
555 | } else { |
556 | let iter = x.iter().rev().zip(y.iter().rev()); |
557 | for (&xi, &yi) in iter { |
558 | if xi > yi { |
559 | return cmp::Ordering::Greater; |
560 | } else if xi < yi { |
561 | return cmp::Ordering::Less; |
562 | } |
563 | } |
564 | // Equal case. |
565 | cmp::Ordering::Equal |
566 | } |
567 | } |
568 | |
569 | /// Check if x is less than y. |
570 | #[inline ] |
571 | pub fn less(x: &[Limb], y: &[Limb]) -> bool { |
572 | compare(x, y) == cmp::Ordering::Less |
573 | } |
574 | |
575 | /// Check if x is greater than or equal to y. |
576 | #[inline ] |
577 | pub fn greater_equal(x: &[Limb], y: &[Limb]) -> bool { |
578 | !less(x, y) |
579 | } |
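
// Editor's illustrative sketch (`compare_sketch` is not part of the
// upstream code): comparison is by limb count first (more limbs means a
// larger normalized value), then limb-by-limb from the most-significant end.
#[cfg(test)]
#[test]
fn compare_sketch() {
    let max: Limb = Limb::max_value();
    assert_eq!(compare(&[0, 1], &[max]), cmp::Ordering::Greater);
    assert_eq!(compare(&[1, 2], &[2, 2]), cmp::Ordering::Less);
    assert!(greater_equal(&[1, 2], &[1, 2]));
}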
580 | |
581 | // ADDITION |
582 | |
583 | /// Implied AddAssign implementation for bigints. |
584 | /// |
585 | /// Allows us to choose a start-index in x to store, so we can avoid |
586 | /// padding the buffer with zeros when not needed, optimized for vectors. |
587 | pub fn iadd_impl(x: &mut Vec<Limb>, y: &[Limb], xstart: usize) { |
588 | // The effective x buffer is from `xstart..x.len()`, so we need to treat |
589 | // that as the current range. If the effective y buffer is longer, need |
590 | // to resize to that, + the start index. |
591 | if y.len() > x.len() - xstart { |
592 | x.resize(y.len() + xstart, 0); |
593 | } |
594 | |
595 | // Iteratively add elements from y to x. |
596 | let mut carry = false; |
597 | for (xi, yi) in x[xstart..].iter_mut().zip(y.iter()) { |
598 | // Only one op of the two can overflow, since we added at max |
599 | // Limb::max_value() + Limb::max_value(). Add the previous carry, |
600 | // and store the current carry for the next. |
601 | let mut tmp = scalar::iadd(xi, *yi); |
602 | if carry { |
603 | tmp |= scalar::iadd(xi, 1); |
604 | } |
605 | carry = tmp; |
606 | } |
607 | |
// Overflow from the last limb, carried past the end of y.
609 | if carry { |
610 | small::iadd_impl(x, 1, y.len() + xstart); |
611 | } |
612 | } |
613 | |
614 | /// AddAssign bigint to bigint. |
615 | #[inline ] |
616 | pub fn iadd(x: &mut Vec<Limb>, y: &[Limb]) { |
617 | iadd_impl(x, y, 0); |
618 | } |
619 | |
620 | /// Add bigint to bigint. |
621 | #[inline ] |
622 | pub fn add(x: &[Limb], y: &[Limb]) -> Vec<Limb> { |
623 | let mut z = Vec::<Limb>::default(); |
624 | z.extend_from_slice(x); |
625 | iadd(&mut z, y); |
626 | z |
627 | } |
628 | |
629 | // SUBTRACTION |
630 | |
631 | /// SubAssign bigint to bigint. |
632 | pub fn isub(x: &mut Vec<Limb>, y: &[Limb]) { |
633 | // Basic underflow checks. |
634 | debug_assert!(greater_equal(x, y)); |
635 | |
636 | // Iteratively add elements from y to x. |
637 | let mut carry = false; |
638 | for (xi, yi) in x.iter_mut().zip(y.iter()) { |
// Only one op of the two can underflow, since we subtract at most
// Limb::max_value() + 1. Subtract the previous borrow,
// and store the current borrow for the next.
642 | let mut tmp = scalar::isub(xi, *yi); |
643 | if carry { |
644 | tmp |= scalar::isub(xi, 1); |
645 | } |
646 | carry = tmp; |
647 | } |
648 | |
649 | if carry { |
650 | small::isub_impl(x, 1, y.len()); |
651 | } else { |
652 | small::normalize(x); |
653 | } |
654 | } |
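
// Editor's illustrative sketch (`large_add_sub_sketch` is not part of the
// upstream code): carries and borrows propagate across limb boundaries, and
// results are normalized.
#[cfg(test)]
#[test]
fn large_add_sub_sketch() {
    // (2^BITS - 1) + 1 == 2^BITS, which needs a second limb.
    let mut x: Vec<Limb> = [Limb::max_value()].to_vec();
    iadd(&mut x, &[1]);
    let sum: [Limb; 2] = [0, 1];
    assert_eq!(x.as_slice(), &sum[..]);

    // 2^BITS - 1 == Limb::max_value(), normalized back to one limb.
    isub(&mut x, &[1]);
    let diff: [Limb; 1] = [Limb::max_value()];
    assert_eq!(x.as_slice(), &diff[..]);
}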
655 | |
656 | // MULTIPLICATION |
657 | |
/// Number of limbs at which to bottom out to the asymptotically slower algorithms.
659 | /// |
660 | /// Karatsuba tends to out-perform long-multiplication at ~320-640 bits, |
661 | /// so we go halfway, while Newton division tends to out-perform |
662 | /// Algorithm D at ~1024 bits. We can toggle this for optimal performance. |
663 | pub const KARATSUBA_CUTOFF: usize = 32; |
664 | |
665 | /// Grade-school multiplication algorithm. |
666 | /// |
/// Slow, naive algorithm, using limb-sized digits and shifting the partial
/// product left on each iteration. It could be optimized with more
/// sophisticated algorithms, but it is extremely simple and runs in
/// O(n*m) time: each of the `m` iterations requires `n` multiplications
/// and `n` additions.
672 | fn long_mul(x: &[Limb], y: &[Limb]) -> Vec<Limb> { |
673 | // Using the immutable value, multiply by all the scalars in y, using |
674 | // the algorithm defined above. Use a single buffer to avoid |
675 | // frequent reallocations. Handle the first case to avoid a redundant |
676 | // addition, since we know y.len() >= 1. |
677 | let mut z: Vec<Limb> = small::mul(x, y[0]); |
678 | z.resize(x.len() + y.len(), 0); |
679 | |
680 | // Handle the iterative cases. |
681 | for (i, &yi) in y[1..].iter().enumerate() { |
682 | let zi: Vec<Limb> = small::mul(x, yi); |
683 | iadd_impl(&mut z, &zi, i + 1); |
684 | } |
685 | |
686 | small::normalize(&mut z); |
687 | |
688 | z |
689 | } |
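
// Editor's illustrative sketch (`long_mul_sketch` is not part of the
// upstream code): with B = 2^BITS, (B + 1) * (B + 1) == B^2 + 2*B + 1,
// i.e. limbs [1, 2, 1] in little-endian order.
#[cfg(test)]
#[test]
fn long_mul_sketch() {
    let x: [Limb; 2] = [1, 1];
    let z = long_mul(&x, &x);
    let expected: [Limb; 3] = [1, 2, 1];
    assert_eq!(z.as_slice(), &expected[..]);
}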
690 | |
/// Split a buffer at index `m`, into (lo, hi) parts.
692 | #[inline ] |
693 | pub fn karatsuba_split(z: &[Limb], m: usize) -> (&[Limb], &[Limb]) { |
694 | (&z[..m], &z[m..]) |
695 | } |
696 | |
697 | /// Karatsuba multiplication algorithm with roughly equal input sizes. |
698 | /// |
699 | /// Assumes `y.len() >= x.len()`. |
700 | fn karatsuba_mul(x: &[Limb], y: &[Limb]) -> Vec<Limb> { |
701 | if y.len() <= KARATSUBA_CUTOFF { |
// Bottom out to long multiplication for small cases.
703 | long_mul(x, y) |
704 | } else if x.len() < y.len() / 2 { |
705 | karatsuba_uneven_mul(x, y) |
706 | } else { |
707 | // Do our 3 multiplications. |
708 | let m = y.len() / 2; |
709 | let (xl, xh) = karatsuba_split(x, m); |
710 | let (yl, yh) = karatsuba_split(y, m); |
711 | let sumx = add(xl, xh); |
712 | let sumy = add(yl, yh); |
713 | let z0 = karatsuba_mul(xl, yl); |
714 | let mut z1 = karatsuba_mul(&sumx, &sumy); |
715 | let z2 = karatsuba_mul(xh, yh); |
// Properly scale z1, which is `z1 - z2 - z0`.
717 | isub(&mut z1, &z2); |
718 | isub(&mut z1, &z0); |
719 | |
720 | // Create our result, which is equal to, in little-endian order: |
721 | // [z0, z1 - z2 - z0, z2] |
// z1 must be shifted `m` limbs (multiplied by `2^(m * Limb::BITS)`) over.
// z2 must be shifted `2 * m` limbs over.
724 | let len = z0.len().max(m + z1.len()).max(2 * m + z2.len()); |
725 | let mut result = z0; |
726 | result.reserve_exact(len - result.len()); |
727 | iadd_impl(&mut result, &z1, m); |
728 | iadd_impl(&mut result, &z2, 2 * m); |
729 | |
730 | result |
731 | } |
732 | } |
733 | |
734 | /// Karatsuba multiplication algorithm where y is substantially larger than x. |
735 | /// |
736 | /// Assumes `y.len() >= x.len()`. |
737 | fn karatsuba_uneven_mul(x: &[Limb], mut y: &[Limb]) -> Vec<Limb> { |
738 | let mut result = Vec::<Limb>::default(); |
739 | result.resize(x.len() + y.len(), 0); |
740 | |
741 | // This effectively is like grade-school multiplication between |
742 | // two numbers, except we're using splits on `y`, and the intermediate |
743 | // step is a Karatsuba multiplication. |
744 | let mut start = 0; |
745 | while !y.is_empty() { |
746 | let m = x.len().min(y.len()); |
747 | let (yl, yh) = karatsuba_split(y, m); |
748 | let prod = karatsuba_mul(x, yl); |
749 | iadd_impl(&mut result, &prod, start); |
750 | y = yh; |
751 | start += m; |
752 | } |
753 | small::normalize(&mut result); |
754 | |
755 | result |
756 | } |
757 | |
758 | /// Forwarder to the proper Karatsuba algorithm. |
759 | #[inline ] |
760 | fn karatsuba_mul_fwd(x: &[Limb], y: &[Limb]) -> Vec<Limb> { |
761 | if x.len() < y.len() { |
762 | karatsuba_mul(x, y) |
763 | } else { |
764 | karatsuba_mul(y, x) |
765 | } |
766 | } |
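
// Editor's illustrative sketch (`karatsuba_sketch` is not part of the
// upstream code): above the cutoff, the Karatsuba path must agree with
// grade-school multiplication.
#[cfg(test)]
#[test]
fn karatsuba_sketch() {
    let x: Vec<Limb> = (1..=2 * KARATSUBA_CUTOFF as Limb).collect();
    let y: Vec<Limb> = (1..=3 * KARATSUBA_CUTOFF as Limb).collect();
    assert_eq!(karatsuba_mul_fwd(&x, &y), long_mul(&x, &y));
}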
767 | |
768 | /// MulAssign bigint to bigint. |
769 | #[inline ] |
770 | pub fn imul(x: &mut Vec<Limb>, y: &[Limb]) { |
771 | if y.len() == 1 { |
772 | small::imul(x, y[0]); |
773 | } else { |
// Forward to the Karatsuba multiplication, which bottoms out to
// long multiplication for inputs below the cutoff.
778 | *x = karatsuba_mul_fwd(x, y); |
779 | } |
780 | } |
781 | } // large |
782 | |
783 | // TRAITS |
784 | // ------ |
785 | |
/// Trait for shared operations on big integers.
///
/// None of these operations are implemented via the standard operator
/// traits, since they are very expensive operations, and we want to
/// use them deliberately and explicitly.
791 | pub(crate) trait Math: Clone + Sized + Default { |
792 | // DATA |
793 | |
794 | /// Get access to the underlying data |
795 | fn data(&self) -> &Vec<Limb>; |
796 | |
797 | /// Get access to the underlying data |
798 | fn data_mut(&mut self) -> &mut Vec<Limb>; |
799 | |
800 | // RELATIVE OPERATIONS |
801 | |
802 | /// Compare self to y. |
803 | #[inline ] |
804 | fn compare(&self, y: &Self) -> cmp::Ordering { |
805 | large::compare(self.data(), y.data()) |
806 | } |
807 | |
808 | // PROPERTIES |
809 | |
/// Get the high 64 bits from the bigint, and whether any non-zero lower bits were truncated.
811 | #[inline ] |
812 | fn hi64(&self) -> (u64, bool) { |
813 | self.data().as_slice().hi64() |
814 | } |
815 | |
816 | /// Calculate the bit-length of the big-integer. |
/// Returns `usize::max_value()` if the value overflows,
/// i.e., if `self.data().len() > usize::max_value() / Limb::BITS`.
819 | #[inline ] |
820 | fn bit_length(&self) -> usize { |
821 | small::bit_length(self.data()) |
822 | } |
823 | |
824 | // INTEGER CONVERSIONS |
825 | |
826 | /// Create new big integer from u64. |
827 | #[inline ] |
828 | fn from_u64(x: u64) -> Self { |
829 | let mut v = Self::default(); |
830 | let slc = split_u64(x); |
831 | v.data_mut().extend_from_slice(&slc); |
832 | v.normalize(); |
833 | v |
834 | } |
835 | |
836 | // NORMALIZE |
837 | |
838 | /// Normalize the integer, so any leading zero values are removed. |
839 | #[inline ] |
840 | fn normalize(&mut self) { |
841 | small::normalize(self.data_mut()); |
842 | } |
843 | |
844 | // ADDITION |
845 | |
846 | /// AddAssign small integer. |
847 | #[inline ] |
848 | fn iadd_small(&mut self, y: Limb) { |
849 | small::iadd(self.data_mut(), y); |
850 | } |
851 | |
852 | // MULTIPLICATION |
853 | |
854 | /// MulAssign small integer. |
855 | #[inline ] |
856 | fn imul_small(&mut self, y: Limb) { |
857 | small::imul(self.data_mut(), y); |
858 | } |
859 | |
860 | /// Multiply by a power of 2. |
861 | #[inline ] |
862 | fn imul_pow2(&mut self, n: u32) { |
863 | self.ishl(n as usize); |
864 | } |
865 | |
866 | /// Multiply by a power of 5. |
867 | #[inline ] |
868 | fn imul_pow5(&mut self, n: u32) { |
869 | small::imul_pow5(self.data_mut(), n); |
870 | } |
871 | |
872 | /// MulAssign by a power of 10. |
873 | #[inline ] |
874 | fn imul_pow10(&mut self, n: u32) { |
875 | self.imul_pow5(n); |
876 | self.imul_pow2(n); |
877 | } |
878 | |
879 | // SHIFTS |
880 | |
881 | /// Shift-left the entire buffer n bits. |
882 | #[inline ] |
883 | fn ishl(&mut self, n: usize) { |
884 | small::ishl(self.data_mut(), n); |
885 | } |
886 | } |
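
// Editor's illustrative sketch (`SketchBigint` and `math_trait_sketch` are
// not part of the upstream code): a minimal type implementing `Math`,
// exercising `from_u64`, `imul_pow10`, `bit_length`, and `hi64`.
#[cfg(test)]
#[derive(Clone, Default)]
struct SketchBigint {
    data: Vec<Limb>,
}

#[cfg(test)]
impl Math for SketchBigint {
    fn data(&self) -> &Vec<Limb> {
        &self.data
    }

    fn data_mut(&mut self) -> &mut Vec<Limb> {
        &mut self.data
    }
}

#[cfg(test)]
#[test]
fn math_trait_sketch() {
    let mut x = SketchBigint::from_u64(1);
    x.imul_pow10(3);
    // 10^3 = 1000 has 10 significant bits; its high 64 bits are `1000 << 54`.
    assert_eq!(x.bit_length(), 10);
    assert_eq!(x.hi64(), (1000u64 << 54, false));
}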
887 | |