algo.rs source code [crates/adler/src/algo.rs]

1	use crate::Adler32;
2	use std::ops::{AddAssign, MulAssign, RemAssign};
3
4	impl Adler32 {
5	pub(crate) fn compute(&mut self, bytes: &[u8]) {
6	// The basic algorithm is, for every byte:
7	// a = (a + byte) % MOD
8	// b = (b + a) % MOD
9	// where MOD = 65521.
10	//
11	// For efficiency, we can defer the `% MOD` operations as long as neither a nor b overflows:
12	// - Between calls to `write`, we ensure that a and b are always in range 0..MOD.
13	// - We use 32-bit arithmetic in this function.
14	// - Therefore, a and b must not increase by more than 2^32-MOD without performing a `% MOD`
15	// operation.
16	//
17	// According to Wikipedia, b is calculated as follows for non-incremental checksumming:
18	// b = n×D1 + (n−1)×D2 + (n−2)×D3 + ... + Dn + n1 (mod 65521)*
19	// Where n is the number of bytes and Di is the i-th Byte. We need to change this to account
20	// for the previous values of a and b, as well as treat every input Byte as being 255:
21	// b_inc = n×255 + (n-1)×255 + ... + 255 + n65520*
22	// Or in other words:
23	// b_inc = n65520 + n(n+1)/2255
24	// The max chunk size is thus the largest value of n so that b_inc <= 2^32-65521.
25	// 2^32-65521 = n65520 + n(n+1)/2255
26	// Plugging this into an equation solver since I can't math gives n = 5552.18..., so 5552.
27	//
28	// On top of the optimization outlined above, the algorithm can also be parallelized with a
29	// bit more work:
30	//
31	// Note that b is a linear combination of a vector of input bytes (D1, ..., Dn).
32	//
33	// If we fix some value k<N and rewrite indices 1, ..., N as
34	//
35	// 1_1, 1_2, ..., 1_k, 2_1, ..., 2_k, ..., (N/k)_k,
36	//
37	// then we can express a and b in terms of sums of smaller sequences kb and ka:
38	//
39	// ka(j) := D1_j + D2_j + ... + D(N/k)_j where j <= k
40	// kb(j) := (N/k)D1_j + (N/k-1)D2_j + ... + D(N/k)_j where j <= k
41	//
42	// a = ka(1) + ka(2) + ... + ka(k) + 1
43	// b = k(kb(1) + kb(2) + ... + kb(k)) - 1ka(2) - ... - (k-1)ka(k) + N*
44	//
45	// We use this insight to unroll the main loop and process k=4 bytes at a time.
46	// The resulting code is highly amenable to SIMD acceleration, although the immediate speedups
47	// stem from increased pipeline parallelism rather than auto-vectorization.
48	//
49	// This technique is described in-depth (here:)[https://software.intel.com/content/www/us/\
50	// en/develop/articles/fast-computation-of-fletcher-checksums.html]
51
52	const MOD: u32 = `65521`;
53	const CHUNK_SIZE: usize = `5552` * `4`;
54
55	let mut a = u32::from(self.a);
56	let mut b = u32::from(self.b);
57	let mut a_vec = U32X4([`0`; `4`]);
58	let mut b_vec = a_vec;
59
60	let (bytes, remainder) = bytes.split_at(bytes.len() - bytes.len() % `4`);
61
62	// iterate over 4 bytes at a time
63	let chunk_iter = bytes.chunks_exact(CHUNK_SIZE);
64	let remainder_chunk = chunk_iter.remainder();
65	for chunk in chunk_iter {
66	for byte_vec in chunk.chunks_exact(`4`) {
67	let val = U32X4::from(byte_vec);
68	a_vec += val;
69	b_vec += a_vec;
70	}
71	b += CHUNK_SIZE as u32 * a;
72	a_vec %= MOD;
73	b_vec %= MOD;
74	b %= MOD;
75	}
76	// special-case the final chunk because it may be shorter than the rest
77	for byte_vec in remainder_chunk.chunks_exact(`4`) {
78	let val = U32X4::from(byte_vec);
79	a_vec += val;
80	b_vec += a_vec;
81	}
82	b += remainder_chunk.len() as u32 * a;
83	a_vec %= MOD;
84	b_vec %= MOD;
85	b %= MOD;
86
87	// combine the sub-sum results into the main sum
88	b_vec *= `4`;
89	b_vec.0[`1`] += MOD - a_vec.0[`1`];
90	b_vec.0[`2`] += (MOD - a_vec.0[`2`]) * `2`;
91	b_vec.0[`3`] += (MOD - a_vec.0[`3`]) * `3`;
92	for &av in a_vec.0.iter() {
93	a += av;
94	}
95	for &bv in b_vec.0.iter() {
96	b += bv;
97	}
98
99	// iterate over the remaining few bytes in serial
100	for &byte in remainder.iter() {
101	a += u32::from(byte);
102	b += a;
103	}
104
105	self.a = (a % MOD) as u16;
106	self.b = (b % MOD) as u16;
107	}
108	}
109
/// Four `u32` lanes that are updated together, element by element.
#[derive(Copy, Clone)]
struct U32X4([u32; 4]);
112
113	impl U32X4 {
114	fn from(bytes: &[u8]) -> Self {
115	U32X4([
116	u32::from(bytes[`0`]),
117	u32::from(bytes[`1`]),
118	u32::from(bytes[`2`]),
119	u32::from(bytes[`3`]),
120	])
121	}
122	}
123
124	impl AddAssign<Self> for U32X4 {
125	fn add_assign(&mut self, other: Self) {
126	for (s: &mut u32, o: &u32) in self.0.iter_mut().zip(other.0.iter()) {
127	*s += o;
128	}
129	}
130	}
131
132	impl RemAssign<u32> for U32X4 {
133	fn rem_assign(&mut self, quotient: u32) {
134	for s: &mut u32 in self.0.iter_mut() {
135	*s %= quotient;
136	}
137	}
138	}
139
140	impl MulAssign<u32> for U32X4 {
141	fn mul_assign(&mut self, rhs: u32) {
142	for s: &mut u32 in self.0.iter_mut() {
143	s = rhs;
144	}
145	}
146	}
147