| 1 | use core::convert::TryInto; |
| 2 | |
| 3 | use crate::common::BytesPerPixel; |
| 4 | |
| 5 | /// SIMD helpers for `fn unfilter` |
| 6 | /// |
| 7 | /// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the |
| 8 | /// "unstable" feature of the `png` crate. This should be possible once the "portable_simd" |
| 9 | /// feature of Rust gets stabilized. |
| 10 | /// |
| 11 | /// This is only known to help on x86: on ARM most benchmarks show no measurable change, |
| 12 | /// and some even regress severely. |
| 13 | /// So despite the code being portable, we only enable this for x86. |
| 14 | /// We can add more platforms once this code is proven to be beneficial for them. |
| 15 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 16 | mod simd { |
| 17 | use std::simd::num::{SimdInt, SimdUint}; |
| 18 | use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; |
| 19 | |
| 20 | /// Scalar Paeth function wrapped in SIMD scaffolding. |
| 21 | /// |
| 22 | /// This is needed because simply running the function on the inputs |
| 23 | /// makes the compiler think our inputs are too short |
| 24 | /// to benefit from vectorization. |
| 25 | /// Putting it in SIMD scaffolding fixes that. |
| 26 | /// https://github.com/image-rs/image-png/issues/511 |
| 27 | /// |
| 28 | /// Funnily, the autovectorizer does a better job here |
| 29 | /// than a handwritten algorithm using std::simd! |
| 30 | /// We used to have a handwritten one but this is just faster. |
| 31 | fn paeth_predictor<const N: usize>( |
| 32 | a: Simd<i16, N>, |
| 33 | b: Simd<i16, N>, |
| 34 | c: Simd<i16, N>, |
| 35 | ) -> Simd<i16, N> |
| 36 | where |
| 37 | LaneCount<N>: SupportedLaneCount, |
| 38 | { |
| 39 | let mut out = [0; N]; |
| 40 | for i in 0..N { |
| 41 | out[i] = super::filter_paeth_stbi_i16(a[i].into(), b[i].into(), c[i].into()); |
| 42 | } |
| 43 | out.into() |
| 44 | } |
| 45 | |
| 46 | /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert |
| 47 | /// the SIMD elements to `i16`. |
| 48 | fn paeth_predictor_u8<const N: usize>( |
| 49 | a: Simd<u8, N>, |
| 50 | b: Simd<u8, N>, |
| 51 | c: Simd<u8, N>, |
| 52 | ) -> Simd<u8, N> |
| 53 | where |
| 54 | LaneCount<N>: SupportedLaneCount, |
| 55 | { |
| 56 | let mut out = [0; N]; |
| 57 | for i in 0..N { |
| 58 | out[i] = super::filter_paeth_stbi(a[i].into(), b[i].into(), c[i].into()); |
| 59 | } |
| 60 | out.into() |
| 61 | } |
| 62 | |
| 63 | /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`). |
| 64 | /// See also https://www.w3.org/TR/png/#filter-byte-positions |
| 65 | #[derive(Default)] |
| 66 | struct PaethState<T, const N: usize> |
| 67 | where |
| 68 | T: SimdElement, |
| 69 | LaneCount<N>: SupportedLaneCount, |
| 70 | { |
| 71 | /// Previous pixel in the previous row. |
| 72 | c: Simd<T, N>, |
| 73 | |
| 74 | /// Previous pixel in the current row. |
| 75 | a: Simd<T, N>, |
| 76 | } |
| 77 | |
| 78 | /// Mutates `x` as needed to unfilter `FilterType::Paeth`. |
| 79 | /// |
| 80 | /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row. |
| 81 | /// See also https://www.w3.org/TR/png/#filter-byte-positions |
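///
/// Pixel layout, using the same labels as the diagram in `unfilter` further down
/// (the state advances one pixel to the right per call):
///
/// ```text
///   c  b
///   a  x
/// ```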
| 82 | fn paeth_step<const N: usize>( |
| 83 | state: &mut PaethState<i16, N>, |
| 84 | b: Simd<u8, N>, |
| 85 | x: &mut Simd<u8, N>, |
| 86 | ) where |
| 87 | LaneCount<N>: SupportedLaneCount, |
| 88 | { |
| 89 | // Storing the inputs. |
| 90 | let b = b.cast::<i16>(); |
| 91 | |
| 92 | // Calculating the new value of the current pixel. |
| 93 | let predictor = paeth_predictor(state.a, b, state.c); |
| 94 | *x += predictor.cast::<u8>(); |
| 95 | |
| 96 | // Preparing for the next step. |
| 97 | state.c = b; |
| 98 | state.a = x.cast::<i16>(); |
| 99 | } |
| 100 | |
| 101 | /// Computes the Paeth predictor without converting `u8` to `i16`. |
| 102 | /// |
| 103 | /// See `simd::paeth_step`. |
| 104 | fn paeth_step_u8<const N: usize>( |
| 105 | state: &mut PaethState<u8, N>, |
| 106 | b: Simd<u8, N>, |
| 107 | x: &mut Simd<u8, N>, |
| 108 | ) where |
| 109 | LaneCount<N>: SupportedLaneCount, |
| 110 | { |
| 111 | // Calculating the new value of the current pixel. |
| 112 | *x += paeth_predictor_u8(state.a, b, state.c); |
| 113 | |
| 114 | // Preparing for the next step. |
| 115 | state.c = b; |
| 116 | state.a = *x; |
| 117 | } |
| 118 | |
| 119 | fn load3(src: &[u8]) -> u8x4 { |
| 120 | u8x4::from_array([src[0], src[1], src[2], 0]) |
| 121 | } |
| 122 | |
| 123 | fn store3(src: u8x4, dest: &mut [u8]) { |
| 124 | dest[0..3].copy_from_slice(&src.to_array()[0..3]) |
| 125 | } |
| 126 | |
| 127 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. |
| 128 | pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { |
| 129 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
| 130 | debug_assert_eq!(prev_row.len() % 3, 0); |
| 131 | |
| 132 | let mut state = PaethState::<i16, 4>::default(); |
| 133 | while prev_row.len() >= 4 { |
| 134 | // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first |
| 135 | // byte from the next triple. This optimization technique mimics the algorithm found |
| 136 | // in |
| 137 | // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 |
| 138 | let b = u8x4::from_slice(prev_row); |
| 139 | let mut x = u8x4::from_slice(curr_row); |
| 140 | |
| 141 | paeth_step(&mut state, b, &mut x); |
| 142 | |
| 143 | // We can speculate that writing 4 bytes might be more efficient (just as with using |
| 144 | // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the |
| 145 | // first byte of the next pixel in the `curr_row`. |
| 146 | store3(x, curr_row); |
| 147 | |
| 148 | prev_row = &prev_row[3..]; |
| 149 | curr_row = &mut curr_row[3..]; |
| 150 | } |
| 151 | // Can't use `u8x4::from_slice` for the last `[u8;3]`. |
| 152 | let b = load3(prev_row); |
| 153 | let mut x = load3(curr_row); |
| 154 | paeth_step(&mut state, b, &mut x); |
| 155 | store3(x, curr_row); |
| 156 | } |
| 157 | |
| 158 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`. |
| 159 | /// |
| 160 | /// This function calculates the Paeth predictor entirely in `Simd<u8, N>` |
| 161 | /// without converting to an intermediate `Simd<i16, N>`. Doing so avoids |
| 162 | /// paying a small performance penalty converting between types. |
| 163 | pub fn unfilter_paeth_u8<const N: usize>(prev_row: &[u8], curr_row: &mut [u8]) |
| 164 | where |
| 165 | LaneCount<N>: SupportedLaneCount, |
| 166 | { |
| 167 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
| 168 | debug_assert_eq!(prev_row.len() % N, 0); |
| 169 | assert!(matches!(N, 4 | 8)); |
| 170 | |
| 171 | let mut state = PaethState::<u8, N>::default(); |
| 172 | for (prev_row, curr_row) in prev_row.chunks_exact(N).zip(curr_row.chunks_exact_mut(N)) { |
| 173 | let b = Simd::from_slice(prev_row); |
| 174 | let mut x = Simd::from_slice(curr_row); |
| 175 | |
| 176 | paeth_step_u8(&mut state, b, &mut x); |
| 177 | |
| 178 | curr_row[..N].copy_from_slice(&x.to_array()[..N]); |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | fn load6(src: &[u8]) -> u8x8 { |
| 183 | u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) |
| 184 | } |
| 185 | |
| 186 | fn store6(src: u8x8, dest: &mut [u8]) { |
| 187 | dest[0..6].copy_from_slice(&src.to_array()[0..6]) |
| 188 | } |
| 189 | |
| 190 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. |
| 191 | pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { |
| 192 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
| 193 | debug_assert_eq!(prev_row.len() % 6, 0); |
| 194 | |
| 195 | let mut state = PaethState::<i16, 8>::default(); |
| 196 | while prev_row.len() >= 8 { |
| 197 | // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two |
| 198 | // bytes from the next pixel. This optimization technique mimics the algorithm found |
| 199 | // in |
| 200 | // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 |
| 201 | let b = u8x8::from_slice(prev_row); |
| 202 | let mut x = u8x8::from_slice(curr_row); |
| 203 | |
| 204 | paeth_step(&mut state, b, &mut x); |
| 205 | |
| 206 | // We can speculate that writing 8 bytes might be more efficient (just as with using |
| 207 | // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the |
| 208 | // first bytes of the next pixel in the `curr_row`. |
| 209 | store6(x, curr_row); |
| 210 | |
| 211 | prev_row = &prev_row[6..]; |
| 212 | curr_row = &mut curr_row[6..]; |
| 213 | } |
| 214 | // Can't use `u8x8::from_slice` for the last `[u8;6]`. |
| 215 | let b = load6(prev_row); |
| 216 | let mut x = load6(curr_row); |
| 217 | paeth_step(&mut state, b, &mut x); |
| 218 | store6(x, curr_row); |
| 219 | } |
| 220 | } |
| 221 | |
| 222 | /// The byte level filter applied to scanlines to prepare them for compression. |
| 223 | /// |
| 224 | /// Compression in general benefits from repetitive data. The filter is a content-aware method of |
| 225 | /// compressing the range of occurring byte values to help the compression algorithm. Note that |
| 226 | /// this does not operate on pixels but on raw bytes of a scanline. |
| 227 | /// |
| 228 | /// Details on how each filter works can be found in the [PNG Book](http://www.libpng.org/pub/png/book/chapter09.html). |
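///
/// As a rough per-byte sketch (the formulas are the PNG specification's, the wording is ours):
/// with `a` the unfiltered byte `bpp` positions to the left, `b` the byte above, and `c` the
/// byte above-left (all treated as zero when out of range), the encoder emits, modulo 256:
///
/// ```text
/// Sub:   out = x - a
/// Up:    out = x - b
/// Avg:   out = x - floor((a + b) / 2)
/// Paeth: out = x - PaethPredictor(a, b, c)
/// ```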
| 229 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 230 | #[repr(u8)] |
| 231 | pub enum FilterType { |
| 232 | NoFilter = 0, |
| 233 | Sub = 1, |
| 234 | Up = 2, |
| 235 | Avg = 3, |
| 236 | Paeth = 4, |
| 237 | } |
| 238 | |
| 239 | impl Default for FilterType { |
| 240 | fn default() -> Self { |
| 241 | FilterType::Sub |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | impl FilterType { |
| 246 | /// u8 -> Self. Temporary solution until Rust provides a canonical one. |
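///
/// For example (codes as assigned by PNG filter method 0):
///
/// ```text
/// FilterType::from_u8(4) == Some(FilterType::Paeth)
/// FilterType::from_u8(9) == None
/// ```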
| 247 | pub fn from_u8(n: u8) -> Option<FilterType> { |
| 248 | match n { |
| 249 | 0 => Some(FilterType::NoFilter), |
| 250 | 1 => Some(FilterType::Sub), |
| 251 | 2 => Some(FilterType::Up), |
| 252 | 3 => Some(FilterType::Avg), |
| 253 | 4 => Some(FilterType::Paeth), |
| 254 | _ => None, |
| 255 | } |
| 256 | } |
| 257 | } |
| 258 | |
| 259 | /// Adaptive filtering tries every possible filter for each row and uses a heuristic to select the best one. |
| 260 | /// This improves compression ratio, but makes encoding slightly slower. |
| 261 | /// |
| 262 | /// It is recommended to use `Adaptive` whenever you care about compression ratio. |
| 263 | /// Filtering is quite cheap compared to other parts of encoding, but can contribute |
| 264 | /// to the compression ratio significantly. |
| 265 | /// |
| 266 | /// `NonAdaptive` filtering is the default. |
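///
/// A minimal usage sketch (hedged: it assumes the crate's public `Encoder` with its
/// `set_adaptive_filter` setter; `writer`, `width` and `height` are placeholders):
///
/// ```ignore
/// let mut encoder = png::Encoder::new(writer, width, height);
/// encoder.set_adaptive_filter(png::AdaptiveFilterType::Adaptive);
/// ```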
| 267 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 268 | #[repr(u8)] |
| 269 | pub enum AdaptiveFilterType { |
| 270 | Adaptive, |
| 271 | NonAdaptive, |
| 272 | } |
| 273 | |
| 274 | impl Default for AdaptiveFilterType { |
| 275 | fn default() -> Self { |
| 276 | AdaptiveFilterType::NonAdaptive |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { |
| 281 | // On ARM this algorithm performs much better than the stb-derived `filter_paeth_stbi` below, |
| 282 | // and it is the better-studied algorithm we've always used here, |
| 283 | // so we default to it on all non-x86 platforms. |
| 284 | let pa: i16 = (i16::from(b) - i16::from(c)).abs(); |
| 285 | let pb: i16 = (i16::from(a) - i16::from(c)).abs(); |
| 286 | let pc: i16 = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs(); |
| 287 | |
| 288 | let mut out: u8 = a; |
| 289 | let mut min: i16 = pa; |
| 290 | |
| 291 | if pb < min { |
| 292 | min = pb; |
| 293 | out = b; |
| 294 | } |
| 295 | if pc < min { |
| 296 | out = c; |
| 297 | } |
| 298 | |
| 299 | out |
| 300 | } |
| 301 | |
| 302 | fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 { |
| 303 | // Decoding optimizes better with this algorithm than with `filter_paeth` |
| 304 | // |
| 305 | // This formulation looks very different from the reference in the PNG spec, but is |
| 306 | // actually equivalent and has favorable data dependencies and admits straightforward |
| 307 | // generation of branch-free code, which helps performance significantly. |
| 308 | // |
| 309 | // Adapted from public domain PNG implementation: |
| 310 | // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668 |
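//
// A worked example with values chosen purely for illustration: for (a, b, c) = (10, 20, 25)
// the reference predictor yields `a` (p = 5, pa = 5, pb = 15, pc = 20), and so does this
// formulation: thresh = 75 - 30 = 45, lo = 10, hi = 20, hence t0 = lo = 10 and t1 = t0 = 10.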
| 311 | let thresh: i16 = i16::from(c) * 3 - (i16::from(a) + i16::from(b)); |
| 312 | let lo: u8 = a.min(b); |
| 313 | let hi: u8 = a.max(b); |
| 314 | let t0: u8 = if hi as i16 <= thresh { lo } else { c }; |
| 315 | let t1: u8 = if thresh <= lo as i16 { hi } else { t0 }; |
| 316 | return t1; |
| 317 | } |
| 318 | |
| 319 | #[cfg(any(test, all(feature = "unstable", target_arch = "x86_64")))] |
| 320 | fn filter_paeth_stbi_i16(a: i16, b: i16, c: i16) -> i16 { |
| 321 | // Like `filter_paeth_stbi` but vectorizes better when wrapped in SIMD types. |
| 322 | // Used for bpp=3 and bpp=6 |
| 323 | let thresh = c * 3 - (a + b); |
| 324 | let lo = a.min(b); |
| 325 | let hi = a.max(b); |
| 326 | let t0 = if hi <= thresh { lo } else { c }; |
| 327 | let t1 = if thresh <= lo { hi } else { t0 }; |
| 328 | return t1; |
| 329 | } |
| 330 | |
| 331 | fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 { |
| 332 | // This is an optimized version of the paeth filter from the PNG specification, proposed by |
| 333 | // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates |
| 334 | // entirely on unsigned 8-bit quantities, making it more conducive to vectorization. |
| 335 | // |
| 336 | // p = a + b - c |
| 337 | // pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c) |
| 338 | // pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c) |
| 339 | // pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ... |
| 340 | // |
| 341 | // Further optimizing the calculation of `pc` is a bit trickier. However, notice that: |
| 342 | // |
| 343 | // a > c && b > c |
| 344 | // ==> (a - c) > 0 && (b - c) > 0 |
| 345 | // ==> pc > (a - c) && pc > (b - c) |
| 346 | // ==> pc > |a - c| && pc > |b - c| |
| 347 | // ==> pc > pb && pc > pa |
| 348 | // |
| 349 | // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar |
| 350 | // reasoning applies if `c` is larger than the other two inputs. Assuming instead that `c` |
| 351 | // lies between `a` and `b` (that is, `a <= c <= b` or `b <= c <= a`): |
| 352 | // |
| 353 | // pc = ||b - c| - |a - c|| = |pa - pb| = max(pa, pb) - min(pa, pb) |
| 354 | // |
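// For illustration, with (a, b, c) = (10, 20, 25): pa = 25 - 20 = 5 and pb = 25 - 10 = 15;
// `c` is not between `a` and `b`, so `pc` is forced to 255 and cannot win. `pa` is smallest,
// so `a` is returned, matching the reference predictor.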
| 355 | let pa = b.max(c) - c.min(b); |
| 356 | let pb = a.max(c) - c.min(a); |
| 357 | let pc = if (a < c) == (c < b) { |
| 358 | pa.max(pb) - pa.min(pb) |
| 359 | } else { |
| 360 | 255 |
| 361 | }; |
| 362 | |
| 363 | if pa <= pb && pa <= pc { |
| 364 | a |
| 365 | } else if pb <= pc { |
| 366 | b |
| 367 | } else { |
| 368 | c |
| 369 | } |
| 370 | } |
| 371 | |
| 372 | pub(crate) fn unfilter( |
| 373 | mut filter: FilterType, |
| 374 | tbpp: BytesPerPixel, |
| 375 | previous: &[u8], |
| 376 | current: &mut [u8], |
| 377 | ) { |
| 378 | use self::FilterType::*; |
| 379 | |
| 380 | // If the previous row is empty, then treat it as if it were filled with zeros. |
| 381 | if previous.is_empty() { |
| 382 | if filter == Paeth { |
| 383 | filter = Sub; |
| 384 | } else if filter == Up { |
| 385 | filter = NoFilter; |
| 386 | } |
| 387 | } |
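// (With `b` and `c` both zero the Paeth predictor degenerates to `a`, so Paeth becomes exactly
// Sub; Up adds zero to every byte, so it becomes NoFilter. `Avg` is handled by the dedicated
// `Avg if previous.is_empty()` arm further down.)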
| 388 | |
| 389 | // [2023/01 @okaneco] - Notes on optimizing decoding filters |
| 390 | // |
| 391 | // Links: |
| 392 | // [PR]: https://github.com/image-rs/image-png/pull/382 |
| 393 | // [SWAR]: http://aggregate.org/SWAR/over.html |
| 394 | // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers |
| 395 | // |
| 396 | // #382 heavily refactored and optimized the following filters making the |
| 397 | // implementation nonobvious. These comments function as a summary of that |
| 398 | // PR with an explanation of the choices made below. |
| 399 | // |
| 400 | // #382 originally started with trying to optimize using a technique called |
| 401 | // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32` |
| 402 | // and `u64` as SIMD registers to perform vertical operations in parallel, |
| 403 | // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp) |
| 404 | // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp |
| 405 | // in a `u64`. The `Sub` filter looked like the following code block; `Avg` |
| 406 | // was similar but used a bitwise average method from [AVG]: |
| 407 | // ``` |
| 408 | // // See "Unpartitioned Operations With Correction Code" from [SWAR] |
| 409 | // fn swar_add_u32(x: u32, y: u32) -> u32 { |
| 410 | // // 7-bit addition so there's no carry over the most significant bit |
| 411 | // let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111 |
| 412 | // // 1-bit parity/XOR addition to fill in the missing MSB |
| 413 | // n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000 |
| 414 | // } |
| 415 | // |
| 416 | // let mut prev = |
| 417 | // u32::from_ne_bytes([current[0], current[1], current[2], current[3]]); |
| 418 | // for chunk in current[4..].chunks_exact_mut(4) { |
| 419 | // let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]); |
| 420 | // let new_chunk = swar_add_u32(cur, prev); |
| 421 | // chunk.copy_from_slice(&new_chunk.to_ne_bytes()); |
| 422 | // prev = new_chunk; |
| 423 | // } |
| 424 | // ``` |
| 425 | // While this provided a measurable increase, @fintelia found that this idea |
| 426 | // could be taken even further by unrolling the chunks component-wise and |
| 427 | // avoiding unnecessary byte-shuffling by using byte arrays instead of |
| 428 | // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary |
| 429 | // so they were reverted to their obvious arithmetic equivalent. Lastly, |
| 430 | // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now |
| 431 | // looked like this (with asserts to remove `0..bpp` bounds checks): |
| 432 | // ``` |
| 433 | // assert!(len > 3); |
| 434 | // let mut prev = [current[0], current[1], current[2], current[3]]; |
| 435 | // for chunk in current[4..].chunks_exact_mut(4) { |
| 436 | // let new_chunk = [ |
| 437 | // chunk[0].wrapping_add(prev[0]), |
| 438 | // chunk[1].wrapping_add(prev[1]), |
| 439 | // chunk[2].wrapping_add(prev[2]), |
| 440 | // chunk[3].wrapping_add(prev[3]), |
| 441 | // ]; |
| 442 | // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 443 | // prev = new_chunk; |
| 444 | // } |
| 445 | // ``` |
| 446 | // The compiler was able to optimize the code to be even faster and this |
| 447 | // method even sped up Paeth filtering! Assertions were experimentally |
| 448 | // added within loop bodies which produced better instructions but no |
| 449 | // difference in speed. Finally, the code was refactored to remove manual |
| 450 | // slicing and start the previous pixel chunks with arrays of `[0; N]`. |
| 451 | // ``` |
| 452 | // let mut prev = [0; 4]; |
| 453 | // for chunk in current.chunks_exact_mut(4) { |
| 454 | // let new_chunk = [ |
| 455 | // chunk[0].wrapping_add(prev[0]), |
| 456 | // chunk[1].wrapping_add(prev[1]), |
| 457 | // chunk[2].wrapping_add(prev[2]), |
| 458 | // chunk[3].wrapping_add(prev[3]), |
| 459 | // ]; |
| 460 | // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 461 | // prev = new_chunk; |
| 462 | // } |
| 463 | // ``` |
| 464 | // While we're not manually bit-twiddling anymore, a possible takeaway from |
| 465 | // this is to "think in SWAR" when dealing with small byte arrays. Unrolling |
| 466 | // array operations and performing them component-wise may unlock previously |
| 467 | // unavailable optimizations from the compiler, even when using the |
| 468 | // `chunks_exact` methods for their potential auto-vectorization benefits. |
| 469 | match filter { |
| 470 | NoFilter => {} |
| 471 | Sub => match tbpp { |
| 472 | BytesPerPixel::One => { |
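// `reduce` is used as an in-place running sum: each byte becomes itself plus the
// already-unfiltered byte to its left, which is exactly what undoing `Sub` at 1 bpp needs.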
| 473 | current.iter_mut().reduce(|&mut prev, curr| { |
| 474 | *curr = curr.wrapping_add(prev); |
| 475 | curr |
| 476 | }); |
| 477 | } |
| 478 | BytesPerPixel::Two => { |
| 479 | let mut prev = [0; 2]; |
| 480 | for chunk in current.chunks_exact_mut(2) { |
| 481 | let new_chunk = [ |
| 482 | chunk[0].wrapping_add(prev[0]), |
| 483 | chunk[1].wrapping_add(prev[1]), |
| 484 | ]; |
| 485 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 486 | prev = new_chunk; |
| 487 | } |
| 488 | } |
| 489 | BytesPerPixel::Three => { |
| 490 | let mut prev = [0; 3]; |
| 491 | for chunk in current.chunks_exact_mut(3) { |
| 492 | let new_chunk = [ |
| 493 | chunk[0].wrapping_add(prev[0]), |
| 494 | chunk[1].wrapping_add(prev[1]), |
| 495 | chunk[2].wrapping_add(prev[2]), |
| 496 | ]; |
| 497 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 498 | prev = new_chunk; |
| 499 | } |
| 500 | } |
| 501 | BytesPerPixel::Four => { |
| 502 | let mut prev = [0; 4]; |
| 503 | for chunk in current.chunks_exact_mut(4) { |
| 504 | let new_chunk = [ |
| 505 | chunk[0].wrapping_add(prev[0]), |
| 506 | chunk[1].wrapping_add(prev[1]), |
| 507 | chunk[2].wrapping_add(prev[2]), |
| 508 | chunk[3].wrapping_add(prev[3]), |
| 509 | ]; |
| 510 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 511 | prev = new_chunk; |
| 512 | } |
| 513 | } |
| 514 | BytesPerPixel::Six => { |
| 515 | let mut prev = [0; 6]; |
| 516 | for chunk in current.chunks_exact_mut(6) { |
| 517 | let new_chunk = [ |
| 518 | chunk[0].wrapping_add(prev[0]), |
| 519 | chunk[1].wrapping_add(prev[1]), |
| 520 | chunk[2].wrapping_add(prev[2]), |
| 521 | chunk[3].wrapping_add(prev[3]), |
| 522 | chunk[4].wrapping_add(prev[4]), |
| 523 | chunk[5].wrapping_add(prev[5]), |
| 524 | ]; |
| 525 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 526 | prev = new_chunk; |
| 527 | } |
| 528 | } |
| 529 | BytesPerPixel::Eight => { |
| 530 | let mut prev = [0; 8]; |
| 531 | for chunk in current.chunks_exact_mut(8) { |
| 532 | let new_chunk = [ |
| 533 | chunk[0].wrapping_add(prev[0]), |
| 534 | chunk[1].wrapping_add(prev[1]), |
| 535 | chunk[2].wrapping_add(prev[2]), |
| 536 | chunk[3].wrapping_add(prev[3]), |
| 537 | chunk[4].wrapping_add(prev[4]), |
| 538 | chunk[5].wrapping_add(prev[5]), |
| 539 | chunk[6].wrapping_add(prev[6]), |
| 540 | chunk[7].wrapping_add(prev[7]), |
| 541 | ]; |
| 542 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 543 | prev = new_chunk; |
| 544 | } |
| 545 | } |
| 546 | }, |
| 547 | Up => { |
| 548 | for (curr, &above) in current.iter_mut().zip(previous) { |
| 549 | *curr = curr.wrapping_add(above); |
| 550 | } |
| 551 | } |
| 552 | Avg if previous.is_empty() => match tbpp { |
| 553 | BytesPerPixel::One => { |
| 554 | current.iter_mut().reduce(|&mut prev, curr| { |
| 555 | *curr = curr.wrapping_add(prev / 2); |
| 556 | curr |
| 557 | }); |
| 558 | } |
| 559 | BytesPerPixel::Two => { |
| 560 | let mut prev = [0; 2]; |
| 561 | for chunk in current.chunks_exact_mut(2) { |
| 562 | let new_chunk = [ |
| 563 | chunk[0].wrapping_add(prev[0] / 2), |
| 564 | chunk[1].wrapping_add(prev[1] / 2), |
| 565 | ]; |
| 566 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 567 | prev = new_chunk; |
| 568 | } |
| 569 | } |
| 570 | BytesPerPixel::Three => { |
| 571 | let mut prev = [0; 3]; |
| 572 | for chunk in current.chunks_exact_mut(3) { |
| 573 | let new_chunk = [ |
| 574 | chunk[0].wrapping_add(prev[0] / 2), |
| 575 | chunk[1].wrapping_add(prev[1] / 2), |
| 576 | chunk[2].wrapping_add(prev[2] / 2), |
| 577 | ]; |
| 578 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 579 | prev = new_chunk; |
| 580 | } |
| 581 | } |
| 582 | BytesPerPixel::Four => { |
| 583 | let mut prev = [0; 4]; |
| 584 | for chunk in current.chunks_exact_mut(4) { |
| 585 | let new_chunk = [ |
| 586 | chunk[0].wrapping_add(prev[0] / 2), |
| 587 | chunk[1].wrapping_add(prev[1] / 2), |
| 588 | chunk[2].wrapping_add(prev[2] / 2), |
| 589 | chunk[3].wrapping_add(prev[3] / 2), |
| 590 | ]; |
| 591 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 592 | prev = new_chunk; |
| 593 | } |
| 594 | } |
| 595 | BytesPerPixel::Six => { |
| 596 | let mut prev = [0; 6]; |
| 597 | for chunk in current.chunks_exact_mut(6) { |
| 598 | let new_chunk = [ |
| 599 | chunk[0].wrapping_add(prev[0] / 2), |
| 600 | chunk[1].wrapping_add(prev[1] / 2), |
| 601 | chunk[2].wrapping_add(prev[2] / 2), |
| 602 | chunk[3].wrapping_add(prev[3] / 2), |
| 603 | chunk[4].wrapping_add(prev[4] / 2), |
| 604 | chunk[5].wrapping_add(prev[5] / 2), |
| 605 | ]; |
| 606 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 607 | prev = new_chunk; |
| 608 | } |
| 609 | } |
| 610 | BytesPerPixel::Eight => { |
| 611 | let mut prev = [0; 8]; |
| 612 | for chunk in current.chunks_exact_mut(8) { |
| 613 | let new_chunk = [ |
| 614 | chunk[0].wrapping_add(prev[0] / 2), |
| 615 | chunk[1].wrapping_add(prev[1] / 2), |
| 616 | chunk[2].wrapping_add(prev[2] / 2), |
| 617 | chunk[3].wrapping_add(prev[3] / 2), |
| 618 | chunk[4].wrapping_add(prev[4] / 2), |
| 619 | chunk[5].wrapping_add(prev[5] / 2), |
| 620 | chunk[6].wrapping_add(prev[6] / 2), |
| 621 | chunk[7].wrapping_add(prev[7] / 2), |
| 622 | ]; |
| 623 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 624 | prev = new_chunk; |
| 625 | } |
| 626 | } |
| 627 | }, |
| 628 | Avg => match tbpp { |
| 629 | BytesPerPixel::One => { |
| 630 | let mut lprev = [0; 1]; |
| 631 | for (chunk, above) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) { |
| 632 | let new_chunk = |
| 633 | [chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8)]; |
| 634 | *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk; |
| 635 | lprev = new_chunk; |
| 636 | } |
| 637 | } |
| 638 | BytesPerPixel::Two => { |
| 639 | let mut lprev = [0; 2]; |
| 640 | for (chunk, above) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) { |
| 641 | let new_chunk = [ |
| 642 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 643 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 644 | ]; |
| 645 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 646 | lprev = new_chunk; |
| 647 | } |
| 648 | } |
| 649 | BytesPerPixel::Three => { |
| 650 | let mut lprev = [0; 3]; |
| 651 | for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) { |
| 652 | let new_chunk = [ |
| 653 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 654 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 655 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 656 | ]; |
| 657 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 658 | lprev = new_chunk; |
| 659 | } |
| 660 | } |
| 661 | BytesPerPixel::Four => { |
| 662 | let mut lprev = [0; 4]; |
| 663 | for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) { |
| 664 | let new_chunk = [ |
| 665 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 666 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 667 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 668 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
| 669 | ]; |
| 670 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 671 | lprev = new_chunk; |
| 672 | } |
| 673 | } |
| 674 | BytesPerPixel::Six => { |
| 675 | let mut lprev = [0; 6]; |
| 676 | for (chunk, above) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) { |
| 677 | let new_chunk = [ |
| 678 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 679 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 680 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 681 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
| 682 | chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8), |
| 683 | chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8), |
| 684 | ]; |
| 685 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 686 | lprev = new_chunk; |
| 687 | } |
| 688 | } |
| 689 | BytesPerPixel::Eight => { |
| 690 | let mut lprev = [0; 8]; |
| 691 | for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) { |
| 692 | let new_chunk = [ |
| 693 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 694 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 695 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 696 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
| 697 | chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8), |
| 698 | chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8), |
| 699 | chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8), |
| 700 | chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8), |
| 701 | ]; |
| 702 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 703 | lprev = new_chunk; |
| 704 | } |
| 705 | } |
| 706 | }, |
| 707 | #[allow(unreachable_code)] |
| 708 | Paeth => { |
| 709 | // Select the fastest Paeth filter implementation based on the target architecture. |
| 710 | let filter_paeth_decode = if cfg!(target_arch = "x86_64") { |
| 711 | filter_paeth_stbi |
| 712 | } else { |
| 713 | filter_paeth |
| 714 | }; |
| 715 | |
| 716 | // Paeth filter pixels: |
| 717 | // C B D |
| 718 | // A X |
| 719 | match tbpp { |
| 720 | BytesPerPixel::One => { |
| 721 | let mut a_bpp = [0; 1]; |
| 722 | let mut c_bpp = [0; 1]; |
| 723 | for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) |
| 724 | { |
| 725 | let new_chunk = [chunk[0] |
| 726 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))]; |
| 727 | *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk; |
| 728 | a_bpp = new_chunk; |
| 729 | c_bpp = b_bpp.try_into().unwrap(); |
| 730 | } |
| 731 | } |
| 732 | BytesPerPixel::Two => { |
| 733 | let mut a_bpp = [0; 2]; |
| 734 | let mut c_bpp = [0; 2]; |
| 735 | for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) |
| 736 | { |
| 737 | let new_chunk = [ |
| 738 | chunk[0] |
| 739 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 740 | chunk[1] |
| 741 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 742 | ]; |
| 743 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 744 | a_bpp = new_chunk; |
| 745 | c_bpp = b_bpp.try_into().unwrap(); |
| 746 | } |
| 747 | } |
| 748 | BytesPerPixel::Three => { |
| 749 | // Do not enable this algorithm on ARM; that would be a big performance hit. |
| 750 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 751 | { |
| 752 | simd::unfilter_paeth3(previous, current); |
| 753 | return; |
| 754 | } |
| 755 | |
| 756 | let mut a_bpp = [0; 3]; |
| 757 | let mut c_bpp = [0; 3]; |
| 758 | for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) |
| 759 | { |
| 760 | let new_chunk = [ |
| 761 | chunk[0] |
| 762 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 763 | chunk[1] |
| 764 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 765 | chunk[2] |
| 766 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 767 | ]; |
| 768 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 769 | a_bpp = new_chunk; |
| 770 | c_bpp = b_bpp.try_into().unwrap(); |
| 771 | } |
| 772 | } |
| 773 | BytesPerPixel::Four => { |
| 774 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 775 | { |
| 776 | simd::unfilter_paeth_u8::<4>(previous, current); |
| 777 | return; |
| 778 | } |
| 779 | |
| 780 | let mut a_bpp = [0; 4]; |
| 781 | let mut c_bpp = [0; 4]; |
| 782 | for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) |
| 783 | { |
| 784 | let new_chunk = [ |
| 785 | chunk[0] |
| 786 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 787 | chunk[1] |
| 788 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 789 | chunk[2] |
| 790 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 791 | chunk[3] |
| 792 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
| 793 | ]; |
| 794 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 795 | a_bpp = new_chunk; |
| 796 | c_bpp = b_bpp.try_into().unwrap(); |
| 797 | } |
| 798 | } |
| 799 | BytesPerPixel::Six => { |
| 800 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 801 | { |
| 802 | simd::unfilter_paeth6(previous, current); |
| 803 | return; |
| 804 | } |
| 805 | |
| 806 | let mut a_bpp = [0; 6]; |
| 807 | let mut c_bpp = [0; 6]; |
| 808 | for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) |
| 809 | { |
| 810 | let new_chunk = [ |
| 811 | chunk[0] |
| 812 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 813 | chunk[1] |
| 814 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 815 | chunk[2] |
| 816 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 817 | chunk[3] |
| 818 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
| 819 | chunk[4] |
| 820 | .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), |
| 821 | chunk[5] |
| 822 | .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), |
| 823 | ]; |
| 824 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 825 | a_bpp = new_chunk; |
| 826 | c_bpp = b_bpp.try_into().unwrap(); |
| 827 | } |
| 828 | } |
| 829 | BytesPerPixel::Eight => { |
| 830 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 831 | { |
| 832 | simd::unfilter_paeth_u8::<8>(previous, current); |
| 833 | return; |
| 834 | } |
| 835 | |
| 836 | let mut a_bpp = [0; 8]; |
| 837 | let mut c_bpp = [0; 8]; |
| 838 | for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) |
| 839 | { |
| 840 | let new_chunk = [ |
| 841 | chunk[0] |
| 842 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 843 | chunk[1] |
| 844 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 845 | chunk[2] |
| 846 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 847 | chunk[3] |
| 848 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
| 849 | chunk[4] |
| 850 | .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), |
| 851 | chunk[5] |
| 852 | .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), |
| 853 | chunk[6] |
| 854 | .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])), |
| 855 | chunk[7] |
| 856 | .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])), |
| 857 | ]; |
| 858 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 859 | a_bpp = new_chunk; |
| 860 | c_bpp = b_bpp.try_into().unwrap(); |
| 861 | } |
| 862 | } |
| 863 | } |
| 864 | } |
| 865 | } |
| 866 | } |
| 867 | |
| 868 | fn filter_internal( |
| 869 | method: FilterType, |
| 870 | bpp: usize, |
| 871 | len: usize, |
| 872 | previous: &[u8], |
| 873 | current: &[u8], |
| 874 | output: &mut [u8], |
| 875 | ) -> FilterType { |
| 876 | use self::FilterType::*; |
| 877 | |
| 878 | // This value was chosen experimentally based on what achieved the best performance. The |
| 879 | // Rust compiler does auto-vectorization, and 32 bytes per loop iteration seems to enable |
| 880 | // the fastest code when doing so. |
| 881 | const CHUNK_SIZE: usize = 32; |
| 882 | |
| 883 | match method { |
| 884 | NoFilter => { |
| 885 | output.copy_from_slice(current); |
| 886 | NoFilter |
| 887 | } |
| 888 | Sub => { |
| 889 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
| 890 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
| 891 | let mut prev_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
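// `prev_chunks` walks the same row shifted back by `bpp`, so each output byte at index `i`
// (for `i >= bpp`) is `current[i] - current[i - bpp]`, i.e. the byte minus the corresponding
// byte of the pixel to its left.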
| 892 | |
| 893 | for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) { |
| 894 | for i in 0..CHUNK_SIZE { |
| 895 | out[i] = cur[i].wrapping_sub(prev[i]); |
| 896 | } |
| 897 | } |
| 898 | |
| 899 | for ((out, cur), &prev) in out_chunks |
| 900 | .into_remainder() |
| 901 | .iter_mut() |
| 902 | .zip(cur_chunks.remainder()) |
| 903 | .zip(prev_chunks.remainder()) |
| 904 | { |
| 905 | *out = cur.wrapping_sub(prev); |
| 906 | } |
| 907 | |
| 908 | output[..bpp].copy_from_slice(¤t[..bpp]); |
| 909 | Sub |
| 910 | } |
| 911 | Up => { |
| 912 | let mut out_chunks = output.chunks_exact_mut(CHUNK_SIZE); |
| 913 | let mut cur_chunks = current.chunks_exact(CHUNK_SIZE); |
| 914 | let mut prev_chunks = previous.chunks_exact(CHUNK_SIZE); |
| 915 | |
| 916 | for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) { |
| 917 | for i in 0..CHUNK_SIZE { |
| 918 | out[i] = cur[i].wrapping_sub(prev[i]); |
| 919 | } |
| 920 | } |
| 921 | |
| 922 | for ((out, cur), &prev) in out_chunks |
| 923 | .into_remainder() |
| 924 | .iter_mut() |
| 925 | .zip(cur_chunks.remainder()) |
| 926 | .zip(prev_chunks.remainder()) |
| 927 | { |
| 928 | *out = cur.wrapping_sub(prev); |
| 929 | } |
| 930 | Up |
| 931 | } |
| 932 | Avg => { |
| 933 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
| 934 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
| 935 | let mut cur_minus_bpp_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
| 936 | let mut prev_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE); |
| 937 | |
| 938 | for (((out, cur), cur_minus_bpp), prev) in (&mut out_chunks) |
| 939 | .zip(&mut cur_chunks) |
| 940 | .zip(&mut cur_minus_bpp_chunks) |
| 941 | .zip(&mut prev_chunks) |
| 942 | { |
| 943 | for i in 0..CHUNK_SIZE { |
| 944 | // Bitwise average of two integers without overflow and |
| 945 | // without converting to a wider bit-width. See: |
| 946 | // http://aggregate.org/MAGIC/#Average%20of%20Integers |
| 947 | // If this is unrolled by component, consider reverting to |
| 948 | // `((cur_minus_bpp[i] as u16 + prev[i] as u16) / 2) as u8` |
| 949 | out[i] = cur[i].wrapping_sub( |
| 950 | (cur_minus_bpp[i] & prev[i]) + ((cur_minus_bpp[i] ^ prev[i]) >> 1), |
| 951 | ); |
| 952 | } |
| 953 | } |
| 954 | |
| 955 | for (((out, cur), &cur_minus_bpp), &prev) in out_chunks |
| 956 | .into_remainder() |
| 957 | .iter_mut() |
| 958 | .zip(cur_chunks.remainder()) |
| 959 | .zip(cur_minus_bpp_chunks.remainder()) |
| 960 | .zip(prev_chunks.remainder()) |
| 961 | { |
| 962 | *out = cur.wrapping_sub((cur_minus_bpp & prev) + ((cur_minus_bpp ^ prev) >> 1)); |
| 963 | } |
| 964 | |
| 965 | for i in 0..bpp { |
| 966 | output[i] = current[i].wrapping_sub(previous[i] / 2); |
| 967 | } |
| 968 | Avg |
| 969 | } |
| 970 | Paeth => { |
| 971 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
| 972 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
| 973 | let mut a_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
| 974 | let mut b_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE); |
| 975 | let mut c_chunks = previous[..len - bpp].chunks_exact(CHUNK_SIZE); |
| 976 | |
| 977 | for ((((out, cur), a), b), c) in (&mut out_chunks) |
| 978 | .zip(&mut cur_chunks) |
| 979 | .zip(&mut a_chunks) |
| 980 | .zip(&mut b_chunks) |
| 981 | .zip(&mut c_chunks) |
| 982 | { |
| 983 | for i in 0..CHUNK_SIZE { |
| 984 | out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i])); |
| 985 | } |
| 986 | } |
| 987 | |
| 988 | for ((((out, cur), &a), &b), &c) in out_chunks |
| 989 | .into_remainder() |
| 990 | .iter_mut() |
| 991 | .zip(cur_chunks.remainder()) |
| 992 | .zip(a_chunks.remainder()) |
| 993 | .zip(b_chunks.remainder()) |
| 994 | .zip(c_chunks.remainder()) |
| 995 | { |
| 996 | *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c)); |
| 997 | } |
| 998 | |
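// For the first `bpp` bytes there is no pixel to the left, so `a == c == 0` and the
// Paeth predictor reduces to the byte directly above, `previous[i]`.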
| 999 | for i in 0..bpp { |
| 1000 | output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0)); |
| 1001 | } |
| 1002 | Paeth |
| 1003 | } |
| 1004 | } |
| 1005 | } |
| 1006 | |
| 1007 | pub(crate) fn filter( |
| 1008 | method: FilterType, |
| 1009 | adaptive: AdaptiveFilterType, |
| 1010 | bpp: BytesPerPixel, |
| 1011 | previous: &[u8], |
| 1012 | current: &[u8], |
| 1013 | output: &mut [u8], |
| 1014 | ) -> FilterType { |
| 1015 | use FilterType::*; |
| 1016 | let bpp = bpp.into_usize(); |
| 1017 | let len = current.len(); |
| 1018 | |
| 1019 | match adaptive { |
| 1020 | AdaptiveFilterType::NonAdaptive => { |
| 1021 | filter_internal(method, bpp, len, previous, current, output) |
| 1022 | } |
| 1023 | AdaptiveFilterType::Adaptive => { |
| 1024 | let mut min_sum: u64 = u64::MAX; |
| 1025 | let mut filter_choice = FilterType::NoFilter; |
| 1026 | for &filter in [Sub, Up, Avg, Paeth].iter() { |
| 1027 | filter_internal(filter, bpp, len, previous, current, output); |
| 1028 | let sum = sum_buffer(output); |
| 1029 | if sum <= min_sum { |
| 1030 | min_sum = sum; |
| 1031 | filter_choice = filter; |
| 1032 | } |
| 1033 | } |
| 1034 | |
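// `Paeth` is tried last, so if it won, `output` already holds its bytes; any other
// winner was overwritten by a later attempt and has to be recomputed.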
| 1035 | if filter_choice != Paeth { |
| 1036 | filter_internal(filter_choice, bpp, len, previous, current, output); |
| 1037 | } |
| 1038 | filter_choice |
| 1039 | } |
| 1040 | } |
| 1041 | } |
| 1042 | |
| 1043 | // Helper function for Adaptive filter buffer summation |
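//
// This is the usual "minimum sum of absolute differences" heuristic: each byte is
// reinterpreted as `i8` and its absolute value accumulated, so rows whose filtered
// residuals cluster around zero (e.g. 0x01 or 0xFF = -1) score low and are preferred.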
| 1044 | fn sum_buffer(buf: &[u8]) -> u64 { |
| 1045 | const CHUNK_SIZE: usize = 32; |
| 1046 | |
| 1047 | let mut buf_chunks = buf.chunks_exact(CHUNK_SIZE); |
| 1048 | let mut sum: u64 = 0_u64; |
| 1049 | |
| 1050 | for chunk in &mut buf_chunks { |
| 1051 | // At most, `acc` can be `32 * (i8::MIN as u8) = 32 * 128 = 4096`. |
| 1052 | let mut acc: u64 = 0; |
| 1053 | for &b in chunk { |
| 1054 | acc += u64::from((b as i8).unsigned_abs()); |
| 1055 | } |
| 1056 | sum = sum.saturating_add(acc); |
| 1057 | } |
| 1058 | |
| 1059 | let mut acc: u64 = 0; |
| 1060 | for &b in buf_chunks.remainder() { |
| 1061 | acc += u64::from((b as i8).unsigned_abs()); |
| 1062 | } |
| 1063 | |
| 1064 | sum.saturating_add(acc) |
| 1065 | } |
| 1066 | |
| 1067 | #[cfg(test)] |
| 1068 | mod test { |
| 1069 | use super::*; |
| 1070 | use core::iter; |
| 1071 | |
| 1072 | #[test] |
| 1073 | fn roundtrip() { |
| 1074 | // A multiple of 8, 6, 4, 3, 2, 1 |
| 1075 | const LEN: u8 = 240; |
| 1076 | let previous: Vec<_> = iter::repeat(1).take(LEN.into()).collect(); |
| 1077 | let current: Vec<_> = (0..LEN).collect(); |
| 1078 | let expected = current.clone(); |
| 1079 | let adaptive = AdaptiveFilterType::NonAdaptive; |
| 1080 | |
| 1081 | let roundtrip = |kind, bpp: BytesPerPixel| { |
| 1082 | let mut output = vec![0; LEN.into()]; |
| 1083 | filter(kind, adaptive, bpp, &previous, ¤t, &mut output); |
| 1084 | unfilter(kind, bpp, &previous, &mut output); |
| 1085 | assert_eq!( |
| 1086 | output, expected, |
| 1087 | "Filtering {:?} with {:?} does not roundtrip" , |
| 1088 | bpp, kind |
| 1089 | ); |
| 1090 | }; |
| 1091 | |
| 1092 | let filters = [ |
| 1093 | FilterType::NoFilter, |
| 1094 | FilterType::Sub, |
| 1095 | FilterType::Up, |
| 1096 | FilterType::Avg, |
| 1097 | FilterType::Paeth, |
| 1098 | ]; |
| 1099 | |
| 1100 | let bpps = [ |
| 1101 | BytesPerPixel::One, |
| 1102 | BytesPerPixel::Two, |
| 1103 | BytesPerPixel::Three, |
| 1104 | BytesPerPixel::Four, |
| 1105 | BytesPerPixel::Six, |
| 1106 | BytesPerPixel::Eight, |
| 1107 | ]; |
| 1108 | |
| 1109 | for &filter in filters.iter() { |
| 1110 | for &bpp in bpps.iter() { |
| 1111 | roundtrip(filter, bpp); |
| 1112 | } |
| 1113 | } |
| 1114 | } |
| 1115 | |
| 1116 | #[test] |
| 1117 | #[ignore] // takes ~20s without optimizations |
| 1118 | fn paeth_impls_are_equivalent() { |
| 1119 | for a in 0..=255 { |
| 1120 | for b in 0..=255 { |
| 1121 | for c in 0..=255 { |
| 1122 | let baseline = filter_paeth(a, b, c); |
| 1123 | let fpnge = filter_paeth_fpnge(a, b, c); |
| 1124 | let stbi = filter_paeth_stbi(a, b, c); |
| 1125 | let stbi_i16 = filter_paeth_stbi_i16(a as i16, b as i16, c as i16); |
| 1126 | |
| 1127 | assert_eq!(baseline, fpnge); |
| 1128 | assert_eq!(baseline, stbi); |
| 1129 | assert_eq!(baseline as i16, stbi_i16); |
| 1130 | } |
| 1131 | } |
| 1132 | } |
| 1133 | } |
| 1134 | |
| 1135 | #[test] |
| 1136 | fn roundtrip_ascending_previous_line() { |
| 1137 | // A multiple of 8, 6, 4, 3, 2, 1 |
| 1138 | const LEN: u8 = 240; |
| 1139 | let previous: Vec<_> = (0..LEN).collect(); |
| 1140 | let current: Vec<_> = (0..LEN).collect(); |
| 1141 | let expected = current.clone(); |
| 1142 | let adaptive = AdaptiveFilterType::NonAdaptive; |
| 1143 | |
| 1144 | let roundtrip = |kind, bpp: BytesPerPixel| { |
| 1145 | let mut output = vec![0; LEN.into()]; |
| 1146 | filter(kind, adaptive, bpp, &previous, ¤t, &mut output); |
| 1147 | unfilter(kind, bpp, &previous, &mut output); |
| 1148 | assert_eq!( |
| 1149 | output, expected, |
| 1150 | "Filtering {:?} with {:?} does not roundtrip" , |
| 1151 | bpp, kind |
| 1152 | ); |
| 1153 | }; |
| 1154 | |
| 1155 | let filters = [ |
| 1156 | FilterType::NoFilter, |
| 1157 | FilterType::Sub, |
| 1158 | FilterType::Up, |
| 1159 | FilterType::Avg, |
| 1160 | FilterType::Paeth, |
| 1161 | ]; |
| 1162 | |
| 1163 | let bpps = [ |
| 1164 | BytesPerPixel::One, |
| 1165 | BytesPerPixel::Two, |
| 1166 | BytesPerPixel::Three, |
| 1167 | BytesPerPixel::Four, |
| 1168 | BytesPerPixel::Six, |
| 1169 | BytesPerPixel::Eight, |
| 1170 | ]; |
| 1171 | |
| 1172 | for &filter in filters.iter() { |
| 1173 | for &bpp in bpps.iter() { |
| 1174 | roundtrip(filter, bpp); |
| 1175 | } |
| 1176 | } |
| 1177 | } |
| 1178 | |
| 1179 | #[test] |
| 1180 | // This tests that converting u8 to i8 doesn't overflow when taking the |
| 1181 | // absolute value for adaptive filtering: -128_i8.abs() will panic in debug |
| 1182 | // or produce garbage in release mode. The sum of 0..=255u8 should equal the |
| 1183 | // sum of the absolute values of -128_i8..=127, or abs(-128..=0) + 1..=127. |
| 1184 | fn sum_buffer_test() { |
| 1185 | let sum = (0..=128).sum::<u64>() + (1..=127).sum::<u64>(); |
| 1186 | let buf: Vec<u8> = (0_u8..=255).collect(); |
| 1187 | |
| 1188 | assert_eq!(sum, crate::filter::sum_buffer(&buf)); |
| 1189 | } |
| 1190 | } |
| 1191 | |