1 | //! A simple big-integer type for slow path algorithms. |
2 | //! |
3 | //! This includes minimal stackvector for use in big-integer arithmetic. |
4 | |
#![doc(hidden)]
6 | |
#[cfg(feature = "alloc")]
8 | use crate::heapvec::HeapVec; |
9 | use crate::num::Float; |
#[cfg(not(feature = "alloc"))]
11 | use crate::stackvec::StackVec; |
#[cfg(not(feature = "compact"))]
13 | use crate::table::{LARGE_POW5, LARGE_POW5_STEP}; |
14 | use core::{cmp, ops, ptr}; |
15 | |
16 | /// Number of bits in a Bigint. |
17 | /// |
18 | /// This needs to be at least the number of bits required to store |
19 | /// a Bigint, which is `log2(radix**digits)`. |
20 | /// ≅ 3600 for base-10, rounded-up. |
21 | pub const BIGINT_BITS: usize = 4000; |
22 | |
23 | /// The number of limbs for the bigint. |
24 | pub const BIGINT_LIMBS: usize = BIGINT_BITS / LIMB_BITS; |
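
// As an illustrative back-of-the-envelope check (not part of the original
// comments): the worst-case decimal input described in the `Bigint::data`
// docs below needs about
//
// ```text
// log2(10^1091) = 1091 * log2(10) ≈ 1091 * 3.3219 ≈ 3624 bits
// ⌈3624 / 64⌉ = 57 limbs on a 64-bit target (BIGINT_LIMBS = 4000 / 64 = 62)
// ```
//
// so `BIGINT_BITS = 4000` leaves a comfortable margin over the ≅ 3600 bits
// actually required.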
25 | |
#[cfg(feature = "alloc")]
pub type VecType = HeapVec;

#[cfg(not(feature = "alloc"))]
pub type VecType = StackVec;
31 | |
32 | /// Storage for a big integer type. |
33 | /// |
34 | /// This is used for algorithms when we have a finite number of digits. |
35 | /// Specifically, it stores all the significant digits scaled to the |
36 | /// proper exponent, as an integral type, and then directly compares |
37 | /// these digits. |
38 | /// |
/// This requires us to store the number of significant bits, plus the
/// number of exponent bits (required since we scale everything
/// to the same exponent).
#[derive(Clone, PartialEq, Eq)]
43 | pub struct Bigint { |
44 | /// Significant digits for the float, stored in a big integer in LE order. |
45 | /// |
/// This is pretty much the same number of digits for any radix, since the
/// significant digits balance out the zeros from the exponent:
48 | /// 1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros. |
49 | /// 2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent zeros. |
50 | /// 3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent zeros. |
51 | /// |
52 | /// However, the number of bytes required is larger for large radixes: |
53 | /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36 |
54 | /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data, |
55 | /// we avoid a major performance hit from the large buffer size. |
56 | pub data: VecType, |
57 | } |
58 | |
#[allow(clippy::new_without_default)]
impl Bigint {
/// Construct a bigint representing 0.
#[inline(always)]
63 | pub fn new() -> Self { |
64 | Self { |
65 | data: VecType::new(), |
66 | } |
67 | } |
68 | |
69 | /// Construct a bigint from an integer. |
#[inline(always)]
71 | pub fn from_u64(value: u64) -> Self { |
72 | Self { |
73 | data: VecType::from_u64(value), |
74 | } |
75 | } |
76 | |
#[inline(always)]
78 | pub fn hi64(&self) -> (u64, bool) { |
79 | self.data.hi64() |
80 | } |
81 | |
/// Multiply in place by `base^exp`, where `base` must be 2, 5, or 10.
#[inline]
84 | pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> { |
85 | debug_assert!(base == 2 || base == 5 || base == 10); |
86 | if base % 5 == 0 { |
87 | pow(&mut self.data, exp)?; |
88 | } |
89 | if base % 2 == 0 { |
90 | shl(&mut self.data, exp as usize)?; |
91 | } |
92 | Some(()) |
93 | } |
94 | |
95 | /// Calculate the bit-length of the big-integer. |
#[inline]
97 | pub fn bit_length(&self) -> u32 { |
98 | bit_length(&self.data) |
99 | } |
100 | } |
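
// As a quick illustration of the `pow` decomposition above (a hypothetical
// sketch, not a doctest, since this module is `#[doc(hidden)]`): multiplying
// by `10^3` is performed as a multiplication by `5^3` followed by a shift
// left of 3 bits.
//
// ```rust
// let mut x = Bigint::from_u64(7);
// x.pow(10, 3).unwrap();
// // 7 * 5^3 = 875, then 875 << 3 = 7000 == 7 * 10^3.
// assert_eq!(x.hi64(), Bigint::from_u64(7000).hi64());
// ```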
101 | |
102 | impl ops::MulAssign<&Bigint> for Bigint { |
103 | fn mul_assign(&mut self, rhs: &Bigint) { |
104 | self.data *= &rhs.data; |
105 | } |
106 | } |
107 | |
// REVERSE VIEW
// ------------
109 | |
110 | /// Reverse, immutable view of a sequence. |
111 | pub struct ReverseView<'a, T: 'a> { |
112 | inner: &'a [T], |
113 | } |
114 | |
115 | impl<'a, T> ops::Index<usize> for ReverseView<'a, T> { |
116 | type Output = T; |
117 | |
#[inline]
119 | fn index(&self, index: usize) -> &T { |
120 | let len: usize = self.inner.len(); |
121 | &(*self.inner)[len - index - 1] |
122 | } |
123 | } |
124 | |
125 | /// Create a reverse view of the vector for indexing. |
#[inline]
127 | pub fn rview(x: &[Limb]) -> ReverseView<Limb> { |
128 | ReverseView { |
129 | inner: x, |
130 | } |
131 | } |
132 | |
133 | // COMPARE |
134 | // ------- |
135 | |
136 | /// Compare `x` to `y`, in little-endian order. |
#[inline]
pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering {
match x.len().cmp(&y.len()) {
cmp::Ordering::Equal => {
let iter = x.iter().rev().zip(y.iter().rev());
for (&xi, yi) in iter {
match xi.cmp(yi) {
cmp::Ordering::Equal => (),
ord => return ord,
}
}
// Equal case.
cmp::Ordering::Equal
},
ord => ord,
152 | } |
153 | } |
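
// For example (an illustrative trace, with `B = 2^LIMB_BITS`): comparing
// `x = [1, 2]` against `y = [3, 1]` first compares the lengths (equal), then
// walks from the most-significant limb down: `2 > 1`, so `x` (which encodes
// `1 + 2*B`) compares greater than `y` (which encodes `3 + 1*B`).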
154 | |
155 | // NORMALIZE |
156 | // --------- |
157 | |
158 | /// Normalize the integer, so any leading zero values are removed. |
#[inline]
pub fn normalize(x: &mut VecType) {
// We don't care if this wraps: the index is bounds-checked.
while let Some(&value) = x.get(x.len().wrapping_sub(1)) {
if value == 0 {
// SAFETY: safe, since `value` exists, so `x.len() > 0`.
unsafe { x.set_len(x.len() - 1) };
} else {
break;
}
}
}
170 | |
171 | /// Get if the big integer is normalized. |
#[inline]
#[allow(clippy::match_like_matches_macro)]
pub fn is_normalized(x: &[Limb]) -> bool {
// We don't care if this wraps: the index is bounds-checked.
match x.get(x.len().wrapping_sub(1)) {
Some(&0) => false,
_ => true,
}
}
181 | |
182 | // FROM |
183 | // ---- |
184 | |
/// Create a `VecType` from a `u64` value.
#[inline(always)]
#[allow(clippy::branches_sharing_code)]
pub fn from_u64(x: u64) -> VecType {
let mut vec = VecType::new();
debug_assert!(vec.capacity() >= 2);
if LIMB_BITS == 32 {
vec.try_push(x as Limb).unwrap();
vec.try_push((x >> 32) as Limb).unwrap();
} else {
vec.try_push(x as Limb).unwrap();
}
vec.normalize();
vec
}
200 | |
201 | // HI |
202 | // -- |
203 | |
204 | /// Check if any of the remaining bits are non-zero. |
205 | /// |
206 | /// # Safety |
207 | /// |
208 | /// Safe as long as `rindex <= x.len()`. |
#[inline]
pub fn nonzero(x: &[Limb], rindex: usize) -> bool {
debug_assert!(rindex <= x.len());

let len = x.len();
let slc = &x[..len - rindex];
slc.iter().rev().any(|&x| x != 0)
216 | } |
217 | |
218 | // These return the high X bits and if the bits were truncated. |
219 | |
220 | /// Shift 32-bit integer to high 64-bits. |
#[inline]
222 | pub fn u32_to_hi64_1(r0: u32) -> (u64, bool) { |
223 | u64_to_hi64_1(r0 as u64) |
224 | } |
225 | |
226 | /// Shift 2 32-bit integers to high 64-bits. |
#[inline]
pub fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) {
let r0 = (r0 as u64) << 32;
let r1 = r1 as u64;
u64_to_hi64_1(r0 | r1)
232 | } |
233 | |
234 | /// Shift 3 32-bit integers to high 64-bits. |
#[inline]
pub fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) {
let r0 = r0 as u64;
let r1 = (r1 as u64) << 32;
let r2 = r2 as u64;
u64_to_hi64_2(r0, r1 | r2)
241 | } |
242 | |
243 | /// Shift 64-bit integer to high 64-bits. |
#[inline]
245 | pub fn u64_to_hi64_1(r0: u64) -> (u64, bool) { |
246 | let ls: u32 = r0.leading_zeros(); |
247 | (r0 << ls, false) |
248 | } |
249 | |
250 | /// Shift 2 64-bit integers to high 64-bits. |
#[inline]
252 | pub fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) { |
253 | let ls: u32 = r0.leading_zeros(); |
254 | let rs: u32 = 64 - ls; |
255 | let v: u64 = match ls { |
256 | 0 => r0, |
257 | _ => (r0 << ls) | (r1 >> rs), |
258 | }; |
259 | let n: bool = r1 << ls != 0; |
260 | (v, n) |
261 | } |
262 | |
263 | /// Extract the hi bits from the buffer. |
264 | macro_rules! hi { |
265 | // # Safety |
266 | // |
267 | // Safe as long as the `stackvec.len() >= 1`. |
268 | (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
269 | $fn($rview[0] as $t) |
270 | }}; |
271 | |
272 | // # Safety |
273 | // |
274 | // Safe as long as the `stackvec.len() >= 2`. |
275 | (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
276 | let r0 = $rview[0] as $t; |
277 | let r1 = $rview[1] as $t; |
278 | $fn(r0, r1) |
279 | }}; |
280 | |
281 | // # Safety |
282 | // |
283 | // Safe as long as the `stackvec.len() >= 2`. |
284 | (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
285 | let (v, n) = hi!(@2 $self, $rview, $t, $fn); |
(v, n || nonzero($self, 2))
287 | }}; |
288 | |
289 | // # Safety |
290 | // |
291 | // Safe as long as the `stackvec.len() >= 3`. |
292 | (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
293 | let r0 = $rview[0] as $t; |
294 | let r1 = $rview[1] as $t; |
295 | let r2 = $rview[2] as $t; |
296 | $fn(r0, r1, r2) |
297 | }}; |
298 | |
299 | // # Safety |
300 | // |
301 | // Safe as long as the `stackvec.len() >= 3`. |
302 | (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
303 | let (v, n) = hi!(@3 $self, $rview, $t, $fn); |
304 | (v, n || nonzero($self, 3)) |
305 | }}; |
306 | } |
307 | |
308 | /// Get the high 64 bits from the vector. |
#[inline(always)]
pub fn hi64(x: &[Limb]) -> (u64, bool) {
let rslc = rview(x);
// SAFETY: the buffer must be at least `x.len()` limbs long.
match x.len() {
314 | 0 => (0, false), |
315 | 1 if LIMB_BITS == 32 => hi!(@1 x, rslc, u32, u32_to_hi64_1), |
316 | 1 => hi!(@1 x, rslc, u64, u64_to_hi64_1), |
317 | 2 if LIMB_BITS == 32 => hi!(@2 x, rslc, u32, u32_to_hi64_2), |
318 | 2 => hi!(@2 x, rslc, u64, u64_to_hi64_2), |
319 | _ if LIMB_BITS == 32 => hi!(@nonzero3 x, rslc, u32, u32_to_hi64_3), |
320 | _ => hi!(@nonzero2 x, rslc, u64, u64_to_hi64_2), |
321 | } |
322 | } |
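
// An illustrative example (not from the original comments), for 64-bit limbs:
//
// ```text
// x = [1, 1]            (LE limbs, so the value is 2^64 + 1)
// hi64(x):
//   r0 = 1 (high limb), r1 = 1 (low limb)
//   ls = 63, so v = (1 << 63) | (1 >> 1) = 0x8000_0000_0000_0000
//   truncated = (1 << 63) != 0 = true
// => (0x8000_0000_0000_0000, true)
// ```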
323 | |
324 | // POWERS |
325 | // ------ |
326 | |
327 | /// MulAssign by a power of 5. |
328 | /// |
329 | /// Theoretically... |
330 | /// |
331 | /// Use an exponentiation by squaring method, since it reduces the time |
332 | /// complexity of the multiplication to ~`O(log(n))` for the squaring, |
333 | /// and `O(n*m)` for the result. Since `m` is typically a lower-order |
334 | /// factor, this significantly reduces the number of multiplications |
335 | /// we need to do. Iteratively multiplying by small powers follows |
336 | /// the nth triangular number series, which scales as `O(p^2)`, but |
337 | /// where `p` is `n+m`. In short, it scales very poorly. |
338 | /// |
339 | /// Practically.... |
340 | /// |
341 | /// Exponentiation by Squaring: |
342 | /// running 2 tests |
343 | /// test bigcomp_f32_lexical ... bench: 1,018 ns/iter (+/- 78) |
344 | /// test bigcomp_f64_lexical ... bench: 3,639 ns/iter (+/- 1,007) |
345 | /// |
346 | /// Exponentiation by Iterative Small Powers: |
347 | /// running 2 tests |
348 | /// test bigcomp_f32_lexical ... bench: 518 ns/iter (+/- 31) |
349 | /// test bigcomp_f64_lexical ... bench: 583 ns/iter (+/- 47) |
350 | /// |
351 | /// Exponentiation by Iterative Large Powers (of 2): |
352 | /// running 2 tests |
353 | /// test bigcomp_f32_lexical ... bench: 671 ns/iter (+/- 31) |
354 | /// test bigcomp_f64_lexical ... bench: 1,394 ns/iter (+/- 47) |
355 | /// |
/// The following benchmarks were run on `1 * 5^300`, using native `pow`,
/// a version with only small powers, and one with pre-computed powers
/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`:
///
///     pow/default     time:   [426.20 ns 427.96 ns 429.89 ns]
///     pow/small       time:   [2.9270 us 2.9411 us 2.9565 us]
///     pow/large:3     time:   [838.51 ns 842.21 ns 846.27 ns]
///
/// However, using large powers is crucial for good performance at higher
/// exponents.
365 | /// |
366 | /// Even using worst-case scenarios, exponentiation by squaring is |
367 | /// significantly slower for our workloads. Just multiply by small powers, |
368 | /// in simple cases, and use precalculated large powers in other cases. |
369 | /// |
370 | /// Furthermore, using sufficiently big large powers is also crucial for |
371 | /// performance. This is a tradeoff of binary size and performance, and |
372 | /// using a single value at ~`5^(5 * max_exp)` seems optimal. |
pub fn pow(x: &mut VecType, mut exp: u32) -> Option<()> {
// Minimize the number of iterations for large exponents: just
// do a few steps with large powers.
#[cfg(not(feature = "compact"))]
377 | { |
378 | while exp >= LARGE_POW5_STEP { |
379 | large_mul(x, &LARGE_POW5)?; |
380 | exp -= LARGE_POW5_STEP; |
381 | } |
382 | } |
383 | |
384 | // Now use our pre-computed small powers iteratively. |
// This is calculated as `⌊log5(2^LIMB_BITS - 1)⌋`.
386 | let small_step = if LIMB_BITS == 32 { |
387 | 13 |
388 | } else { |
389 | 27 |
390 | }; |
391 | let max_native = (5 as Limb).pow(small_step); |
392 | while exp >= small_step { |
393 | small_mul(x, max_native)?; |
394 | exp -= small_step; |
395 | } |
396 | if exp != 0 { |
397 | // SAFETY: safe, since `exp < small_step`. |
398 | let small_power = unsafe { f64::int_pow_fast_path(exp as usize, 5) }; |
399 | small_mul(x, small_power as Limb)?; |
400 | } |
401 | Some(()) |
402 | } |
403 | |
404 | // SCALAR |
405 | // ------ |
406 | |
407 | /// Add two small integers and return the resulting value and if overflow happens. |
#[inline(always)]
409 | pub fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) { |
410 | x.overflowing_add(y) |
411 | } |
412 | |
413 | /// Multiply two small integers (with carry) (and return the overflow contribution). |
414 | /// |
415 | /// Returns the (low, high) components. |
#[inline(always)]
417 | pub fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) { |
418 | // Cannot overflow, as long as wide is 2x as wide. This is because |
419 | // the following is always true: |
420 | // `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX` |
421 | let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide); |
422 | (z as Limb, (z >> LIMB_BITS) as Limb) |
423 | } |
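
// A small worked example (illustrative only), with 64-bit limbs:
//
// ```text
// scalar_mul(x = 2^63, y = 4, carry = 1)
//   z = 2^63 * 4 + 1 = 2^65 + 1    (computed in the 128-bit `Wide` type)
//   low  = z as Limb         = 1
//   high = (z >> 64) as Limb = 2
// => (1, 2)
// ```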
424 | |
425 | // SMALL |
426 | // ----- |
427 | |
428 | /// Add small integer to bigint starting from offset. |
#[inline]
pub fn small_add_from(x: &mut VecType, y: Limb, start: usize) -> Option<()> {
let mut index = start;
let mut carry = y;
while carry != 0 && index < x.len() {
let result = scalar_add(x[index], carry);
x[index] = result.0;
carry = result.1 as Limb;
437 | index += 1; |
438 | } |
439 | // If we carried past all the elements, add to the end of the buffer. |
440 | if carry != 0 { |
441 | x.try_push(carry)?; |
442 | } |
443 | Some(()) |
444 | } |
445 | |
446 | /// Add small integer to bigint. |
#[inline(always)]
pub fn small_add(x: &mut VecType, y: Limb) -> Option<()> {
small_add_from(x, y, 0)
450 | } |
451 | |
452 | /// Multiply bigint by small integer. |
#[inline]
pub fn small_mul(x: &mut VecType, y: Limb) -> Option<()> {
let mut carry: Limb = 0;
for xi in x.iter_mut() {
let result = scalar_mul(*xi, y, carry);
*xi = result.0;
carry = result.1;
460 | } |
461 | // If we carried past all the elements, add to the end of the buffer. |
462 | if carry != 0 { |
463 | x.try_push(carry)?; |
464 | } |
465 | Some(()) |
466 | } |
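
// For example (illustrative, with `B = 2^LIMB_BITS`): multiplying
// `x = [Limb::MAX]` by `y = 2` produces `2*B - 2`, which is stored as the
// limb `B - 2` plus a carry of `1`, so the result becomes `[Limb::MAX - 1, 1]`.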
467 | |
468 | // LARGE |
469 | // ----- |
470 | |
471 | /// Add bigint to bigint starting from offset. |
472 | pub fn large_add_from(x: &mut VecType, y: &[Limb], start: usize) -> Option<()> { |
// The effective x buffer is from `start..x.len()`, so we need to treat
// that as the current range. If the effective y buffer is longer, we
// need to resize x to `y.len() + start`.
476 | if y.len() > x.len().saturating_sub(start) { |
// Bail out (returning `None`) if we can't extend the buffer.
// This avoids any out-of-bounds access afterwards.
479 | x.try_resize(y.len() + start, 0)?; |
480 | } |
481 | |
482 | // Iteratively add elements from y to x. |
483 | let mut carry = false; |
484 | for (index, &yi) in y.iter().enumerate() { |
// In bounds: `x` was resized above if `y` extends past it, so the `unwrap` cannot fail.
486 | let xi = x.get_mut(start + index).unwrap(); |
487 | |
// Only one of the two adds can overflow: the wrapped sum of two limbs
// is at most `Limb::MAX - 1` when it overflows, so adding the previous
// carry of 1 cannot overflow again. Add the previous carry, and store
// the current carry for the next limb.
491 | let result = scalar_add(*xi, yi); |
492 | *xi = result.0; |
493 | let mut tmp = result.1; |
494 | if carry { |
495 | let result = scalar_add(*xi, 1); |
496 | *xi = result.0; |
497 | tmp |= result.1; |
498 | } |
499 | carry = tmp; |
500 | } |
501 | |
502 | // Handle overflow. |
503 | if carry { |
504 | small_add_from(x, 1, y.len() + start)?; |
505 | } |
506 | Some(()) |
507 | } |
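
// A short worked example (illustrative only), with `B = 2^LIMB_BITS`:
// adding `y = [1]` into `x = [Limb::MAX, 0]` at `start = 0` overflows the
// first limb (giving `0` with a carry), and the trailing carry is folded in
// via `small_add_from`, so `x` becomes `[0, 1]`, i.e. the value `B`.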
508 | |
509 | /// Add bigint to bigint. |
#[inline(always)]
pub fn large_add(x: &mut VecType, y: &[Limb]) -> Option<()> {
large_add_from(x, y, 0)
513 | } |
514 | |
515 | /// Grade-school multiplication algorithm. |
516 | /// |
517 | /// Slow, naive algorithm, using limb-bit bases and just shifting left for |
518 | /// each iteration. This could be optimized with numerous other algorithms, |
519 | /// but it's extremely simple, and works in O(n*m) time, which is fine |
520 | /// by me. Each iteration, of which there are `m` iterations, requires |
521 | /// `n` multiplications, and `n` additions, or grade-school multiplication. |
522 | /// |
/// Don't use Karatsuba multiplication, since our implementation seems to
/// be slower in practice, which is likely just due to the small sizes
/// we deal with here. For example, running on the following data:
526 | /// |
527 | /// ```text |
528 | /// const SMALL_X: &[u32] = &[ |
529 | /// 766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286, |
530 | /// 1022770393, 468044626, 446028186 |
531 | /// ]; |
532 | /// const SMALL_Y: &[u32] = &[ |
533 | /// 3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095, |
534 | /// 1678845844, 2373924447, 3640713171 |
535 | /// ]; |
536 | /// const LARGE_X: &[u32] = &[ |
537 | /// 3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854, |
538 | /// 3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719, |
539 | /// 638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684, |
540 | /// 1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206, |
541 | /// 692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290, |
542 | /// 2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426, |
543 | /// 197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094, |
544 | /// 326310242 |
545 | /// ]; |
546 | /// const LARGE_Y: &[u32] = &[ |
547 | /// 1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505, |
548 | /// 2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760, |
549 | /// 3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106, |
550 | /// 705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800, |
551 | /// 1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691, |
552 | /// 621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297, |
553 | /// 1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171, |
554 | /// 2247372529 |
555 | /// ]; |
556 | /// ``` |
557 | /// |
/// We get the following results:
///
/// ```text
561 | /// mul/small:long time: [220.23 ns 221.47 ns 222.81 ns] |
562 | /// Found 4 outliers among 100 measurements (4.00%) |
563 | /// 2 (2.00%) high mild |
564 | /// 2 (2.00%) high severe |
565 | /// mul/small:karatsuba time: [233.88 ns 234.63 ns 235.44 ns] |
566 | /// Found 11 outliers among 100 measurements (11.00%) |
567 | /// 8 (8.00%) high mild |
568 | /// 3 (3.00%) high severe |
569 | /// mul/large:long time: [1.9365 us 1.9455 us 1.9558 us] |
570 | /// Found 12 outliers among 100 measurements (12.00%) |
571 | /// 7 (7.00%) high mild |
572 | /// 5 (5.00%) high severe |
573 | /// mul/large:karatsuba time: [4.4250 us 4.4515 us 4.4812 us] |
574 | /// ``` |
575 | /// |
/// In short, Karatsuba multiplication is never worthwhile for our use-case.
577 | pub fn long_mul(x: &[Limb], y: &[Limb]) -> Option<VecType> { |
578 | // Using the immutable value, multiply by all the scalars in y, using |
579 | // the algorithm defined above. Use a single buffer to avoid |
580 | // frequent reallocations. Handle the first case to avoid a redundant |
581 | // addition, since we know y.len() >= 1. |
let mut z = VecType::try_from(x)?;
if !y.is_empty() {
let y0 = y[0];
small_mul(&mut z, y0)?;

for (index, &yi) in y.iter().enumerate().skip(1) {
if yi != 0 {
let mut zi = VecType::try_from(x)?;
small_mul(&mut zi, yi)?;
large_add_from(&mut z, &zi, index)?;
592 | } |
593 | } |
594 | } |
595 | |
596 | z.normalize(); |
597 | Some(z) |
598 | } |
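
// A tiny illustrative trace (with `B = 2^LIMB_BITS`): `long_mul([2], [3, 4])`
// first computes `z = [2] * 3 = [6]`, then for `yi = 4` at index 1 computes
// `zi = [2] * 4 = [8]` and adds it shifted by one limb, giving `z = [6, 8]`,
// i.e. `6 + 8*B = 2 * (3 + 4*B)`.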
599 | |
600 | /// Multiply bigint by bigint using grade-school multiplication algorithm. |
#[inline(always)]
pub fn large_mul(x: &mut VecType, y: &[Limb]) -> Option<()> {
// Karatsuba multiplication never makes sense, so just use grade-school
// multiplication.
if y.len() == 1 {
// SAFETY: safe since `y.len() == 1`.
small_mul(x, y[0])?;
} else {
*x = long_mul(y, x)?;
610 | } |
611 | Some(()) |
612 | } |
613 | |
614 | // SHIFT |
615 | // ----- |
616 | |
617 | /// Shift-left `n` bits inside a buffer. |
#[inline]
619 | pub fn shl_bits(x: &mut VecType, n: usize) -> Option<()> { |
620 | debug_assert!(n != 0); |
621 | |
622 | // Internally, for each item, we shift left by n, and add the previous |
623 | // right shifted limb-bits. |
624 | // For example, we transform (for u8) shifted left 2, to: |
625 | // b10100100 b01000010 |
626 | // b10 b10010001 b00001000 |
627 | debug_assert!(n < LIMB_BITS); |
628 | let rshift = LIMB_BITS - n; |
629 | let lshift = n; |
630 | let mut prev: Limb = 0; |
631 | for xi in x.iter_mut() { |
632 | let tmp = *xi; |
633 | *xi <<= lshift; |
634 | *xi |= prev >> rshift; |
635 | prev = tmp; |
636 | } |
637 | |
// Push the carry limb if any bits were shifted out of the top.
639 | let carry = prev >> rshift; |
640 | if carry != 0 { |
641 | x.try_push(carry)?; |
642 | } |
643 | |
644 | Some(()) |
645 | } |
646 | |
647 | /// Shift-left `n` limbs inside a buffer. |
#[inline]
pub fn shl_limbs(x: &mut VecType, n: usize) -> Option<()> {
debug_assert!(n != 0);
if n + x.len() > x.capacity() {
None
} else if !x.is_empty() {
let len = n + x.len();
// SAFETY: safe since `x` is not empty and `x.len() + n <= x.capacity()`.
unsafe {
// Move the elements.
let src = x.as_ptr();
let dst = x.as_mut_ptr().add(n);
ptr::copy(src, dst, x.len());
// Write our 0s.
ptr::write_bytes(x.as_mut_ptr(), 0, n);
x.set_len(len);
664 | } |
665 | Some(()) |
666 | } else { |
667 | Some(()) |
668 | } |
669 | } |
670 | |
671 | /// Shift-left buffer by n bits. |
#[inline]
pub fn shl(x: &mut VecType, n: usize) -> Option<()> {
let rem = n % LIMB_BITS;
let div = n / LIMB_BITS;
if rem != 0 {
shl_bits(x, rem)?;
}
if div != 0 {
shl_limbs(x, div)?;
}
682 | Some(()) |
683 | } |
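
// For example (illustrative, for 64-bit limbs): shifting left by 70 bits is
// decomposed as `rem = 6` and `div = 1`, i.e. a 6-bit `shl_bits` followed by
// a one-limb `shl_limbs`, which together multiply by `2^6 * 2^64 = 2^70`.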
684 | |
685 | /// Get number of leading zero bits in the storage. |
#[inline]
pub fn leading_zeros(x: &[Limb]) -> u32 {
let length = x.len();
// wrapping_sub is fine, since it'll just return None.
if let Some(&value) = x.get(length.wrapping_sub(1)) {
691 | value.leading_zeros() |
692 | } else { |
693 | 0 |
694 | } |
695 | } |
696 | |
697 | /// Calculate the bit-length of the big-integer. |
#[inline]
699 | pub fn bit_length(x: &[Limb]) -> u32 { |
700 | let nlz: u32 = leading_zeros(x); |
701 | LIMB_BITS as u32 * x.len() as u32 - nlz |
702 | } |
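
// For example (illustrative, for 64-bit limbs): `x = [0b1011]` has 60 leading
// zeros in its top limb, so `bit_length(x) = 64 * 1 - 60 = 4`.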
703 | |
704 | // LIMB |
705 | // ---- |
706 | |
707 | // Type for a single limb of the big integer. |
708 | // |
709 | // A limb is analogous to a digit in base10, except, it stores 32-bit |
710 | // or 64-bit numbers instead. We want types where 64-bit multiplication |
711 | // is well-supported by the architecture, rather than emulated in 3 |
712 | // instructions. The quickest way to check this support is using a |
713 | // cross-compiler for numerous architectures, along with the following |
714 | // source file and command: |
715 | // |
716 | // Compile with `gcc main.c -c -S -O3 -masm=intel` |
717 | // |
718 | // And the source code is: |
719 | // ```text |
720 | // #include <stdint.h> |
721 | // |
722 | // struct i128 { |
723 | // uint64_t hi; |
724 | // uint64_t lo; |
725 | // }; |
726 | // |
727 | // // Type your code here, or load an example. |
728 | // struct i128 square(uint64_t x, uint64_t y) { |
729 | // __int128 prod = (__int128)x * (__int128)y; |
730 | // struct i128 z; |
731 | // z.hi = (uint64_t)(prod >> 64); |
732 | // z.lo = (uint64_t)prod; |
733 | // return z; |
734 | // } |
735 | // ``` |
736 | // |
737 | // If the result contains `call __multi3`, then the multiplication |
738 | // is emulated by the compiler. Otherwise, it's natively supported. |
739 | // |
// This should cover all known 64-bit platforms supported by Rust.
741 | // https://forge.rust-lang.org/platform-support.html |
742 | // |
743 | // # Supported |
744 | // |
745 | // Platforms where native 128-bit multiplication is explicitly supported: |
746 | // - x86_64 (Supported via `MUL`). |
747 | // - mips64 (Supported via `DMULTU`, which `HI` and `LO` can be read-from). |
748 | // - s390x (Supported via `MLGR`). |
749 | // |
750 | // # Efficient |
751 | // |
752 | // Platforms where native 64-bit multiplication is supported and |
753 | // you can extract hi-lo for 64-bit multiplications. |
754 | // - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits). |
755 | // - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits). |
756 | // - riscv64 (Requires `MUL` and `MULH` to capture high and low bits). |
757 | // |
758 | // # Unsupported |
759 | // |
760 | // Platforms where native 128-bit multiplication is not supported, |
761 | // requiring software emulation. |
// - sparc64 (`UMUL` only supports double-word arguments).
// - sparcv9 (Same as sparc64).
764 | // |
765 | // These tests are run via `xcross`, my own library for C cross-compiling, |
766 | // which supports numerous targets (far in excess of Rust's tier 1 support, |
767 | // or rust-embedded/cross's list). xcross may be found here: |
768 | // https://github.com/Alexhuszagh/xcross |
769 | // |
770 | // To compile for the given target, run: |
771 | // `xcross gcc main.c -c -S -O3 --target $target` |
772 | // |
773 | // All 32-bit architectures inherently do not have support. That means |
774 | // we can essentially look for 64-bit architectures that are not SPARC. |
775 | |
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Limb = u64;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Wide = u128;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub const LIMB_BITS: usize = 64;

#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Limb = u32;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Wide = u64;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub const LIMB_BITS: usize = 32;
789 | |