algo.rs source code [crates/adler2/src/algo.rs]

1	use crate::Adler32;
2	use std::ops::{AddAssign, MulAssign, RemAssign};
3
4	impl Adler32 {
5	pub(crate) fn compute(&mut self, bytes: &[u8]) {
6	// The basic algorithm is, for every byte:
7	// a = (a + byte) % MOD
8	// b = (b + a) % MOD
9	// where MOD = 65521.
10	//
11	// For efficiency, we can defer the `% MOD` operations as long as neither a nor b overflows:
12	// - Between calls to `write`, we ensure that a and b are always in range 0..MOD.
13	// - We use 32-bit arithmetic in this function.
14	// - Therefore, a and b must not increase by more than 2^32-MOD without performing a `% MOD`
15	// operation.
16	//
17	// According to Wikipedia, b is calculated as follows for non-incremental checksumming:
18	// b = n×D1 + (n−1)×D2 + (n−2)×D3 + ... + Dn + n1 (mod 65521)*
19	// Where n is the number of bytes and Di is the i-th Byte. We need to change this to account
20	// for the previous values of a and b, as well as treat every input Byte as being 255:
21	// b_inc = n×255 + (n-1)×255 + ... + 255 + n65520*
22	// Or in other words:
23	// b_inc = n65520 + n(n+1)/2255
24	// The max chunk size is thus the largest value of n so that b_inc <= 2^32-65521.
25	// 2^32-65521 = n65520 + n(n+1)/2255
26	// Plugging this into an equation solver since I can't math gives n = 5552.18..., so 5552.
27	//
28	// On top of the optimization outlined above, the algorithm can also be parallelized with a
29	// bit more work:
30	//
31	// Note that b is a linear combination of a vector of input bytes (D1, ..., Dn).
32	//
33	// If we fix some value k<N and rewrite indices 1, ..., N as
34	//
35	// 1_1, 1_2, ..., 1_k, 2_1, ..., 2_k, ..., (N/k)_k,
36	//
37	// then we can express a and b in terms of sums of smaller sequences kb and ka:
38	//
39	// ka(j) := D1_j + D2_j + ... + D(N/k)_j where j <= k
40	// kb(j) := (N/k)D1_j + (N/k-1)D2_j + ... + D(N/k)_j where j <= k
41	//
42	// a = ka(1) + ka(2) + ... + ka(k) + 1
43	// b = k(kb(1) + kb(2) + ... + kb(k)) - 1ka(2) - ... - (k-1)ka(k) + N*
44	//
45	// We use this insight to unroll the main loop and process k=4 bytes at a time.
46	// The resulting code is highly amenable to SIMD acceleration, although the immediate speedups
47	// stem from increased pipeline parallelism rather than auto-vectorization.
48	//
49	// This technique is described in-depth (here:)[https://software.intel.com/content/www/us/\
50	// en/develop/articles/fast-computation-of-fletcher-checksums.html]
51
52	const MOD: u32 = `65521`;
53	const CHUNK_SIZE: usize = `5552` * `4`;
54
55	let mut a = u32::from(self.a);
56	let mut b = u32::from(self.b);
57	let mut a_vec = U32X4([`0`; `4`]);
58	let mut b_vec = a_vec;
59
60	let (bytes, remainder) = bytes.split_at(bytes.len() - bytes.len() % `4`);
61
62	// iterate over 4 bytes at a time
63	let chunk_iter = bytes.chunks_exact(CHUNK_SIZE);
64	let remainder_chunk = chunk_iter.remainder();
65	for chunk in chunk_iter {
66	for byte_vec in chunk.chunks_exact(`4`) {
67	let val = U32X4::from(byte_vec);
68	a_vec += val;
69	b_vec += a_vec;
70	}
71
72	b += CHUNK_SIZE as u32 * a;
73	a_vec %= MOD;
74	b_vec %= MOD;
75	b %= MOD;
76	}
77	// special-case the final chunk because it may be shorter than the rest
78	for byte_vec in remainder_chunk.chunks_exact(`4`) {
79	let val = U32X4::from(byte_vec);
80	a_vec += val;
81	b_vec += a_vec;
82	}
83	b += remainder_chunk.len() as u32 * a;
84	a_vec %= MOD;
85	b_vec %= MOD;
86	b %= MOD;
87
88	// combine the sub-sum results into the main sum
89	b_vec *= `4`;
90	b_vec.0[`1`] += MOD - a_vec.0[`1`];
91	b_vec.0[`2`] += (MOD - a_vec.0[`2`]) * `2`;
92	b_vec.0[`3`] += (MOD - a_vec.0[`3`]) * `3`;
93	for &av in a_vec.0.iter() {
94	a += av;
95	}
96	for &bv in b_vec.0.iter() {
97	b += bv;
98	}
99
100	// iterate over the remaining few bytes in serial
101	for &byte in remainder.iter() {
102	a += u32::from(byte);
103	b += a;
104	}
105
106	self.a = (a % MOD) as u16;
107	self.b = (b % MOD) as u16;
108	}
109	}
110
/// Four `u32` lanes updated together by the lane-wise operator impls in this file.
#[derive(Copy, Clone)]
struct U32X4([u32; 4]);
113
114	impl U32X4 {
115	#[inline]
116	fn from(bytes: &[u8]) -> Self {
117	U32X4([
118	u32::from(bytes[`0`]),
119	u32::from(bytes[`1`]),
120	u32::from(bytes[`2`]),
121	u32::from(bytes[`3`]),
122	])
123	}
124	}
125
126	impl AddAssign<Self> for U32X4 {
127	#[inline]
128	fn add_assign(&mut self, other: Self) {
129	// Implement this in a primitive manner to help out the compiler a bit.
130	self.0[`0`] += other.0[`0`];
131	self.0[`1`] += other.0[`1`];
132	self.0[`2`] += other.0[`2`];
133	self.0[`3`] += other.0[`3`];
134	}
135	}
136
137	impl RemAssign<u32> for U32X4 {
138	#[inline]
139	fn rem_assign(&mut self, quotient: u32) {
140	self.0[`0`] %= quotient;
141	self.0[`1`] %= quotient;
142	self.0[`2`] %= quotient;
143	self.0[`3`] %= quotient;
144	}
145	}
146
147	impl MulAssign<u32> for U32X4 {
148	#[inline]
149	fn mul_assign(&mut self, rhs: u32) {
150	self.0[`0`] *= rhs;
151	self.0[`1`] *= rhs;
152	self.0[`2`] *= rhs;
153	self.0[`3`] *= rhs;
154	}
155	}
156