use core::convert::TryInto;

use crate::common::BytesPerPixel;

/// SIMD helpers for `fn unfilter`
///
/// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
/// "unstable" feature of the `png` crate. This should be possible once the "portable_simd"
/// feature of Rust gets stabilized.
///
/// This is only known to help on x86: on ARM most benchmarks show no change,
/// and some of them even regress severely.
/// So despite the code being portable, we only enable this for x86.
/// We can add more platforms once this code is proven to be beneficial for them.
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
mod simd {
    use std::simd::num::{SimdInt, SimdUint};
    use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};

    /// Scalar Paeth function wrapped in SIMD scaffolding.
    ///
    /// This is needed because simply running the function on the inputs
    /// makes the compiler think our inputs are too short
    /// to benefit from vectorization.
    /// Putting it in SIMD scaffolding fixes that.
    /// https://github.com/image-rs/image-png/issues/511
    ///
    /// Funnily, the autovectorizer does a better job here
    /// than a handwritten algorithm using std::simd!
    /// We used to have a handwritten one but this is just faster.
    fn paeth_predictor<const N: usize>(
        a: Simd<i16, N>,
        b: Simd<i16, N>,
        c: Simd<i16, N>,
    ) -> Simd<i16, N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        let mut out = [0; N];
        for i in 0..N {
            out[i] = super::filter_paeth_stbi_i16(a[i].into(), b[i].into(), c[i].into());
        }
        out.into()
    }

    /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
    /// the SIMD elements to `i16`.
    fn paeth_predictor_u8<const N: usize>(
        a: Simd<u8, N>,
        b: Simd<u8, N>,
        c: Simd<u8, N>,
    ) -> Simd<u8, N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        let mut out = [0; N];
        for i in 0..N {
            out[i] = super::filter_paeth_stbi(a[i].into(), b[i].into(), c[i].into());
        }
        out.into()
    }

    /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`).
    /// See also https://www.w3.org/TR/png/#filter-byte-positions
    #[derive(Default)]
    struct PaethState<T, const N: usize>
    where
        T: SimdElement,
        LaneCount<N>: SupportedLaneCount,
    {
        /// Previous pixel in the previous row.
        c: Simd<T, N>,

        /// Previous pixel in the current row.
        a: Simd<T, N>,
    }

    /// Mutates `x` as needed to unfilter `FilterType::Paeth`.
    ///
    /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row.
    /// See also https://www.w3.org/TR/png/#filter-byte-positions
    fn paeth_step<const N: usize>(
        state: &mut PaethState<i16, N>,
        b: Simd<u8, N>,
        x: &mut Simd<u8, N>,
    ) where
        LaneCount<N>: SupportedLaneCount,
    {
        // Storing the inputs.
        let b = b.cast::<i16>();

        // Calculating the new value of the current pixel.
        let predictor = paeth_predictor(state.a, b, state.c);
        *x += predictor.cast::<u8>();

        // Preparing for the next step.
        state.c = b;
        state.a = x.cast::<i16>();
    }

    /// Computes the Paeth predictor without converting `u8` to `i16`.
    ///
    /// See `simd::paeth_step`.
    fn paeth_step_u8<const N: usize>(
        state: &mut PaethState<u8, N>,
        b: Simd<u8, N>,
        x: &mut Simd<u8, N>,
    ) where
        LaneCount<N>: SupportedLaneCount,
    {
        // Calculating the new value of the current pixel.
        *x += paeth_predictor_u8(state.a, b, state.c);

        // Preparing for the next step.
        state.c = b;
        state.a = *x;
    }

    fn load3(src: &[u8]) -> u8x4 {
        u8x4::from_array([src[0], src[1], src[2], 0])
    }

    fn store3(src: u8x4, dest: &mut [u8]) {
        dest[0..3].copy_from_slice(&src.to_array()[0..3])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
    pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 3, 0);

        let mut state = PaethState::<i16, 4>::default();
        while prev_row.len() >= 4 {
            // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
            // byte from the next triple. This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x4::from_slice(prev_row);
            let mut x = u8x4::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 4 bytes might be more efficient (just as with using
            // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
            // first byte of the next pixel in the `curr_row`.
            store3(x, curr_row);

            prev_row = &prev_row[3..];
            curr_row = &mut curr_row[3..];
        }
        // Can't use `u8x4::from_slice` for the last `[u8;3]`.
        let b = load3(prev_row);
        let mut x = load3(curr_row);
        paeth_step(&mut state, b, &mut x);
        store3(x, curr_row);
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`.
    ///
    /// This function calculates the Paeth predictor entirely in `Simd<u8, N>`
    /// without converting to an intermediate `Simd<i16, N>`. Doing so avoids
    /// paying a small performance penalty converting between types.
    pub fn unfilter_paeth_u8<const N: usize>(prev_row: &[u8], curr_row: &mut [u8])
    where
        LaneCount<N>: SupportedLaneCount,
    {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % N, 0);
        assert!(matches!(N, 4 | 8));

        let mut state = PaethState::<u8, N>::default();
        for (prev_row, curr_row) in prev_row.chunks_exact(N).zip(curr_row.chunks_exact_mut(N)) {
            let b = Simd::from_slice(prev_row);
            let mut x = Simd::from_slice(curr_row);

            paeth_step_u8(&mut state, b, &mut x);

            curr_row[..N].copy_from_slice(&x.to_array()[..N]);
        }
    }

    fn load6(src: &[u8]) -> u8x8 {
        u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
    }

    fn store6(src: u8x8, dest: &mut [u8]) {
        dest[0..6].copy_from_slice(&src.to_array()[0..6])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
    pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 6, 0);

        let mut state = PaethState::<i16, 8>::default();
        while prev_row.len() >= 8 {
            // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
            // bytes from the next pixel. This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x8::from_slice(prev_row);
            let mut x = u8x8::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 8 bytes might be more efficient (just as with using
            // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
            // first bytes of the next pixel in the `curr_row`.
            store6(x, curr_row);

            prev_row = &prev_row[6..];
            curr_row = &mut curr_row[6..];
        }
        // Can't use `u8x8::from_slice` for the last `[u8;6]`.
        let b = load6(prev_row);
        let mut x = load6(curr_row);
        paeth_step(&mut state, b, &mut x);
        store6(x, curr_row);
    }
}

/// The byte level filter applied to scanlines to prepare them for compression.
///
/// Compression in general benefits from repetitive data. The filter is a content-aware method of
/// compressing the range of occurring byte values to help the compression algorithm. Note that
/// this does not operate on pixels but on raw bytes of a scanline.
///
/// Details on how each filter works can be found in the [PNG Book](http://www.libpng.org/pub/png/book/chapter09.html).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum FilterType {
    NoFilter = 0,
    Sub = 1,
    Up = 2,
    Avg = 3,
    Paeth = 4,
}

impl Default for FilterType {
    fn default() -> Self {
        FilterType::Sub
    }
}

impl FilterType {
    /// u8 -> Self. Temporary solution until Rust provides a canonical one.
    pub fn from_u8(n: u8) -> Option<FilterType> {
        match n {
            0 => Some(FilterType::NoFilter),
            1 => Some(FilterType::Sub),
            2 => Some(FilterType::Up),
            3 => Some(FilterType::Avg),
            4 => Some(FilterType::Paeth),
            _ => None,
        }
    }
}

/// Adaptive filtering tries every possible filter for each row and uses a heuristic to select the best one.
/// This improves compression ratio, but makes encoding slightly slower.
///
/// It is recommended to use `Adaptive` whenever you care about compression ratio.
/// Filtering is quite cheap compared to other parts of encoding, but can contribute
/// to the compression ratio significantly.
///
/// `NonAdaptive` filtering is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum AdaptiveFilterType {
    Adaptive,
    NonAdaptive,
}

impl Default for AdaptiveFilterType {
    fn default() -> Self {
        AdaptiveFilterType::NonAdaptive
    }
}

fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
    // On ARM this algorithm performs much better than the stb-derived one below
    // (`filter_paeth_stbi`), and this is the better-studied algorithm we've always used here,
    // so we default to it on all non-x86 platforms.
    let pa: i16 = (i16::from(b) - i16::from(c)).abs();
    let pb: i16 = (i16::from(a) - i16::from(c)).abs();
    let pc: i16 = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();

    let mut out: u8 = a;
    let mut min: i16 = pa;

    if pb < min {
        min = pb;
        out = b;
    }
    if pc < min {
        out = c;
    }

    out
}

fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 {
    // Decoding optimizes better with this algorithm than with `filter_paeth`
    //
    // This formulation looks very different from the reference in the PNG spec, but is
    // actually equivalent and has favorable data dependencies and admits straightforward
    // generation of branch-free code, which helps performance significantly.
    //
    // Adapted from public domain PNG implementation:
    // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
    let thresh: i16 = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
    let lo: u8 = a.min(b);
    let hi: u8 = a.max(b);
    let t0: u8 = if hi as i16 <= thresh { lo } else { c };
    let t1: u8 = if thresh <= lo as i16 { hi } else { t0 };
    return t1;
}

#[cfg(any(test, all(feature = "unstable", target_arch = "x86_64")))]
fn filter_paeth_stbi_i16(a: i16, b: i16, c: i16) -> i16 {
    // Like `filter_paeth_stbi` but vectorizes better when wrapped in SIMD types.
    // Used for bpp=3 and bpp=6
    let thresh = c * 3 - (a + b);
    let lo = a.min(b);
    let hi = a.max(b);
    let t0 = if hi <= thresh { lo } else { c };
    let t1 = if thresh <= lo { hi } else { t0 };
    return t1;
}

fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 {
    // This is an optimized version of the paeth filter from the PNG specification, proposed by
    // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
    // entirely on unsigned 8-bit quantities, making it more conducive to vectorization.
    //
    // p = a + b - c
    // pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c)
    // pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c)
    // pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ...
    //
    // Further optimizing the calculation of `pc` is a bit trickier. However, notice that:
    //
    //     a > c && b > c
    // ==> (a - c) > 0 && (b - c) > 0
    // ==> pc > (a - c) && pc > (b - c)
    // ==> pc > |a - c| && pc > |b - c|
    // ==> pc > pb && pc > pa
    //
    // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar
    // reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= b` and
    // `c <= a` (or vice versa):
    //
    // pc = ||b - c| - |a - c|| = |pa - pb| = max(pa, pb) - min(pa, pb)
    //
    let pa = b.max(c) - c.min(b);
    let pb = a.max(c) - c.min(a);
    let pc = if (a < c) == (c < b) {
        pa.max(pb) - pa.min(pb)
    } else {
        255
    };

    if pa <= pb && pa <= pc {
        a
    } else if pb <= pc {
        b
    } else {
        c
    }
}

pub(crate) fn unfilter(
    mut filter: FilterType,
    tbpp: BytesPerPixel,
    previous: &[u8],
    current: &mut [u8],
) {
    use self::FilterType::*;

    // If the previous row is empty, then treat it as if it were filled with zeros.
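    // With an all-zero previous row the Paeth predictor always returns `a`, so `Paeth`
    // degenerates to `Sub`, and `Up` adds zero everywhere, so it degenerates to `NoFilter`.
    // `Avg` keeps its dedicated `previous.is_empty()` arm further down because halving the
    // left neighbour still matters.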
    if previous.is_empty() {
        if filter == Paeth {
            filter = Sub;
        } else if filter == Up {
            filter = NoFilter;
        }
    }

    // [2023/01 @okaneco] - Notes on optimizing decoding filters
    //
    // Links:
    // [PR]: https://github.com/image-rs/image-png/pull/382
    // [SWAR]: http://aggregate.org/SWAR/over.html
    // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
    //
    // #382 heavily refactored and optimized the following filters, making the
    // implementation nonobvious. These comments function as a summary of that
    // PR with an explanation of the choices made below.
    //
    // #382 originally started with trying to optimize using a technique called
    // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
    // and `u64` as SIMD registers to perform vertical operations in parallel,
    // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
    // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
    // in a `u64`. The `Sub` filter looked like the following code block, `Avg`
    // was similar but used a bitwise average method from [AVG]:
    // ```
    // // See "Unpartitioned Operations With Correction Code" from [SWAR]
    // fn swar_add_u32(x: u32, y: u32) -> u32 {
    //     // 7-bit addition so there's no carry over the most significant bit
    //     let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
    //     // 1-bit parity/XOR addition to fill in the missing MSB
    //     n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000
    // }
    //
    // let mut prev =
    //     u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
    // for chunk in current[4..].chunks_exact_mut(4) {
    //     let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    //     let new_chunk = swar_add_u32(cur, prev);
    //     chunk.copy_from_slice(&new_chunk.to_ne_bytes());
    //     prev = new_chunk;
    // }
    // ```
    // While this provided a measurable increase, @fintelia found that this idea
    // could be taken even further by unrolling the chunks component-wise and
    // avoiding unnecessary byte-shuffling by using byte arrays instead of
    // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
    // so they were reverted to their obvious arithmetic equivalent. Lastly,
    // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
    // looked like this (with asserts to remove `0..bpp` bounds checks):
    // ```
    // assert!(len > 3);
    // let mut prev = [current[0], current[1], current[2], current[3]];
    // for chunk in current[4..].chunks_exact_mut(4) {
    //     let new_chunk = [
    //         chunk[0].wrapping_add(prev[0]),
    //         chunk[1].wrapping_add(prev[1]),
    //         chunk[2].wrapping_add(prev[2]),
    //         chunk[3].wrapping_add(prev[3]),
    //     ];
    //     *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
    //     prev = new_chunk;
    // }
    // ```
    // The compiler was able to optimize the code to be even faster and this
    // method even sped up Paeth filtering! Assertions were experimentally
    // added within loop bodies which produced better instructions but no
    // difference in speed. Finally, the code was refactored to remove manual
    // slicing and start the previous pixel chunks with arrays of `[0; N]`.
    // ```
    // let mut prev = [0; 4];
    // for chunk in current.chunks_exact_mut(4) {
    //     let new_chunk = [
    //         chunk[0].wrapping_add(prev[0]),
    //         chunk[1].wrapping_add(prev[1]),
    //         chunk[2].wrapping_add(prev[2]),
    //         chunk[3].wrapping_add(prev[3]),
    //     ];
    //     *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
    //     prev = new_chunk;
    // }
    // ```
    // While we're not manually bit-twiddling anymore, a possible takeaway from
    // this is to "think in SWAR" when dealing with small byte arrays. Unrolling
    // array operations and performing them component-wise may unlock previously
    // unavailable optimizations from the compiler, even when using the
    // `chunks_exact` methods for their potential auto-vectorization benefits.
    match filter {
        NoFilter => {}
        Sub => match tbpp {
            BytesPerPixel::One => {
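                // `reduce` walks the row left to right: each byte is bumped by the
                // already-unfiltered byte to its left, and the updated reference is
                // passed along as the accumulator for the next byte.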
                current.iter_mut().reduce(|&mut prev, curr| {
                    *curr = curr.wrapping_add(prev);
                    curr
                });
            }
            BytesPerPixel::Two => {
                let mut prev = [0; 2];
                for chunk in current.chunks_exact_mut(2) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                    ];
                    *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Three => {
                let mut prev = [0; 3];
                for chunk in current.chunks_exact_mut(3) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                    ];
                    *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Four => {
                let mut prev = [0; 4];
                for chunk in current.chunks_exact_mut(4) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                        chunk[3].wrapping_add(prev[3]),
                    ];
                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Six => {
                let mut prev = [0; 6];
                for chunk in current.chunks_exact_mut(6) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                        chunk[3].wrapping_add(prev[3]),
                        chunk[4].wrapping_add(prev[4]),
                        chunk[5].wrapping_add(prev[5]),
                    ];
                    *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Eight => {
                let mut prev = [0; 8];
                for chunk in current.chunks_exact_mut(8) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0]),
                        chunk[1].wrapping_add(prev[1]),
                        chunk[2].wrapping_add(prev[2]),
                        chunk[3].wrapping_add(prev[3]),
                        chunk[4].wrapping_add(prev[4]),
                        chunk[5].wrapping_add(prev[5]),
                        chunk[6].wrapping_add(prev[6]),
                        chunk[7].wrapping_add(prev[7]),
                    ];
                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
        },
        Up => {
            for (curr, &above) in current.iter_mut().zip(previous) {
                *curr = curr.wrapping_add(above);
            }
        }
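        // When there is no previous row, the PNG average predictor reduces to
        // `floor(left / 2)`, so only the already-unfiltered left neighbour is consulted below.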
        Avg if previous.is_empty() => match tbpp {
            BytesPerPixel::One => {
                current.iter_mut().reduce(|&mut prev, curr| {
                    *curr = curr.wrapping_add(prev / 2);
                    curr
                });
            }
            BytesPerPixel::Two => {
                let mut prev = [0; 2];
                for chunk in current.chunks_exact_mut(2) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                    ];
                    *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Three => {
                let mut prev = [0; 3];
                for chunk in current.chunks_exact_mut(3) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                    ];
                    *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Four => {
                let mut prev = [0; 4];
                for chunk in current.chunks_exact_mut(4) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                        chunk[3].wrapping_add(prev[3] / 2),
                    ];
                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Six => {
                let mut prev = [0; 6];
                for chunk in current.chunks_exact_mut(6) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                        chunk[3].wrapping_add(prev[3] / 2),
                        chunk[4].wrapping_add(prev[4] / 2),
                        chunk[5].wrapping_add(prev[5] / 2),
                    ];
                    *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
            BytesPerPixel::Eight => {
                let mut prev = [0; 8];
                for chunk in current.chunks_exact_mut(8) {
                    let new_chunk = [
                        chunk[0].wrapping_add(prev[0] / 2),
                        chunk[1].wrapping_add(prev[1] / 2),
                        chunk[2].wrapping_add(prev[2] / 2),
                        chunk[3].wrapping_add(prev[3] / 2),
                        chunk[4].wrapping_add(prev[4] / 2),
                        chunk[5].wrapping_add(prev[5] / 2),
                        chunk[6].wrapping_add(prev[6] / 2),
                        chunk[7].wrapping_add(prev[7] / 2),
                    ];
                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                    prev = new_chunk;
                }
            }
        },
        Avg => match tbpp {
            BytesPerPixel::One => {
                let mut lprev = [0; 1];
                for (chunk, above) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) {
                    let new_chunk =
                        [chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8)];
                    *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Two => {
                let mut lprev = [0; 2];
                for (chunk, above) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Three => {
                let mut lprev = [0; 3];
                for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Four => {
                let mut lprev = [0; 4];
                for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Six => {
                let mut lprev = [0; 6];
                for (chunk, above) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
                        chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
                        chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
            BytesPerPixel::Eight => {
                let mut lprev = [0; 8];
                for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) {
                    let new_chunk = [
                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
                        chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
                        chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
                        chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
                        chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
                    ];
                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                    lprev = new_chunk;
                }
            }
        },
        #[allow(unreachable_code)]
        Paeth => {
            // Select the fastest Paeth filter implementation based on the target architecture.
            let filter_paeth_decode = if cfg!(target_arch = "x86_64") {
                filter_paeth_stbi
            } else {
                filter_paeth
            };

            // Paeth filter pixels:
            // C B D
            // A X
            match tbpp {
                BytesPerPixel::One => {
                    let mut a_bpp = [0; 1];
                    let mut c_bpp = [0; 1];
                    for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1))
                    {
                        let new_chunk = [chunk[0]
                            .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))];
                        *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Two => {
                    let mut a_bpp = [0; 2];
                    let mut c_bpp = [0; 2];
                    for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                        ];
                        *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Three => {
                    // Do not enable this algorithm on ARM, that would be a big performance hit
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth3(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 3];
                    let mut c_bpp = [0; 3];
                    for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                        ];
                        *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Four => {
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth_u8::<4>(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 4];
                    let mut c_bpp = [0; 4];
                    for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                            chunk[3]
                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
                        ];
                        *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Six => {
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth6(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 6];
                    let mut c_bpp = [0; 6];
                    for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                            chunk[3]
                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
                            chunk[4]
                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
                            chunk[5]
                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
                        ];
                        *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
                BytesPerPixel::Eight => {
                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                    {
                        simd::unfilter_paeth_u8::<8>(previous, current);
                        return;
                    }

                    let mut a_bpp = [0; 8];
                    let mut c_bpp = [0; 8];
                    for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
                    {
                        let new_chunk = [
                            chunk[0]
                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
                            chunk[1]
                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
                            chunk[2]
                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
                            chunk[3]
                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
                            chunk[4]
                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
                            chunk[5]
                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
                            chunk[6]
                                .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])),
                            chunk[7]
                                .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])),
                        ];
                        *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
                        a_bpp = new_chunk;
                        c_bpp = b_bpp.try_into().unwrap();
                    }
                }
            }
        }
    }
}

fn filter_internal(
    method: FilterType,
    bpp: usize,
    len: usize,
    previous: &[u8],
    current: &[u8],
    output: &mut [u8],
) -> FilterType {
    use self::FilterType::*;

    // This value was chosen experimentally based on what achieved the best performance. The
    // Rust compiler does auto-vectorization, and 32-bytes per loop iteration seems to enable
    // the fastest code when doing so.
    const CHUNK_SIZE: usize = 32;

    match method {
        NoFilter => {
            output.copy_from_slice(current);
            NoFilter
        }
        Sub => {
            let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE);
            let mut prev_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE);

            for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) {
                for i in 0..CHUNK_SIZE {
                    out[i] = cur[i].wrapping_sub(prev[i]);
                }
            }

            for ((out, cur), &prev) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(prev_chunks.remainder())
            {
                *out = cur.wrapping_sub(prev);
            }

            output[..bpp].copy_from_slice(&current[..bpp]);
            Sub
        }
        Up => {
            let mut out_chunks = output.chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current.chunks_exact(CHUNK_SIZE);
            let mut prev_chunks = previous.chunks_exact(CHUNK_SIZE);

            for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) {
                for i in 0..CHUNK_SIZE {
                    out[i] = cur[i].wrapping_sub(prev[i]);
                }
            }

            for ((out, cur), &prev) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(prev_chunks.remainder())
            {
                *out = cur.wrapping_sub(prev);
            }
            Up
        }
        Avg => {
            let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE);
            let mut cur_minus_bpp_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE);
            let mut prev_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE);

            for (((out, cur), cur_minus_bpp), prev) in (&mut out_chunks)
                .zip(&mut cur_chunks)
                .zip(&mut cur_minus_bpp_chunks)
                .zip(&mut prev_chunks)
            {
                for i in 0..CHUNK_SIZE {
                    // Bitwise average of two integers without overflow and
                    // without converting to a wider bit-width. See:
                    // http://aggregate.org/MAGIC/#Average%20of%20Integers
                    // If this is unrolled by component, consider reverting to
                    // `((cur_minus_bpp[i] as u16 + prev[i] as u16) / 2) as u8`
                    out[i] = cur[i].wrapping_sub(
                        (cur_minus_bpp[i] & prev[i]) + ((cur_minus_bpp[i] ^ prev[i]) >> 1),
                    );
                }
            }

            for (((out, cur), &cur_minus_bpp), &prev) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(cur_minus_bpp_chunks.remainder())
                .zip(prev_chunks.remainder())
            {
                *out = cur.wrapping_sub((cur_minus_bpp & prev) + ((cur_minus_bpp ^ prev) >> 1));
            }

            for i in 0..bpp {
                output[i] = current[i].wrapping_sub(previous[i] / 2);
            }
            Avg
        }
        Paeth => {
            let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE);
            let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE);
            let mut a_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE);
            let mut b_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE);
            let mut c_chunks = previous[..len - bpp].chunks_exact(CHUNK_SIZE);

            for ((((out, cur), a), b), c) in (&mut out_chunks)
                .zip(&mut cur_chunks)
                .zip(&mut a_chunks)
                .zip(&mut b_chunks)
                .zip(&mut c_chunks)
            {
                for i in 0..CHUNK_SIZE {
                    out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i]));
                }
            }

            for ((((out, cur), &a), &b), &c) in out_chunks
                .into_remainder()
                .iter_mut()
                .zip(cur_chunks.remainder())
                .zip(a_chunks.remainder())
                .zip(b_chunks.remainder())
                .zip(c_chunks.remainder())
            {
                *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c));
            }

            for i in 0..bpp {
                output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0));
            }
            Paeth
        }
    }
}

pub(crate) fn filter(
    method: FilterType,
    adaptive: AdaptiveFilterType,
    bpp: BytesPerPixel,
    previous: &[u8],
    current: &[u8],
    output: &mut [u8],
) -> FilterType {
    use FilterType::*;
    let bpp = bpp.into_usize();
    let len = current.len();

    match adaptive {
        AdaptiveFilterType::NonAdaptive => {
            filter_internal(method, bpp, len, previous, current, output)
        }
        AdaptiveFilterType::Adaptive => {
            let mut min_sum: u64 = u64::MAX;
            let mut filter_choice = FilterType::NoFilter;
            for &filter in [Sub, Up, Avg, Paeth].iter() {
                filter_internal(filter, bpp, len, previous, current, output);
                let sum = sum_buffer(output);
                if sum <= min_sum {
                    min_sum = sum;
                    filter_choice = filter;
                }
            }

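            // `Paeth` is tried last and ties favor later filters (note the `<=` above), so if
            // it won, `output` already holds its bytes. Otherwise re-run the winning filter so
            // that `output` matches the returned `filter_choice`.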
            if filter_choice != Paeth {
                filter_internal(filter_choice, bpp, len, previous, current, output);
            }
            filter_choice
        }
    }
}

// Helper function for Adaptive filter buffer summation
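//
// This implements the heuristic suggested by the PNG specification: reinterpret each filtered
// byte as a signed 8-bit value and prefer the filter that minimizes the sum of absolute values
// ("minimum sum of absolute differences"), since output closer to zero tends to compress better.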
fn sum_buffer(buf: &[u8]) -> u64 {
    const CHUNK_SIZE: usize = 32;

    let mut buf_chunks = buf.chunks_exact(CHUNK_SIZE);
    let mut sum = 0_u64;

    for chunk in &mut buf_chunks {
        // At most, `acc` can be `32 * (i8::MIN as u8) = 32 * 128 = 4096`.
        let mut acc = 0;
        for &b in chunk {
            acc += u64::from((b as i8).unsigned_abs());
        }
        sum = sum.saturating_add(acc);
    }

    let mut acc = 0;
    for &b in buf_chunks.remainder() {
        acc += u64::from((b as i8).unsigned_abs());
    }

    sum.saturating_add(acc)
}

#[cfg(test)]
mod test {
    use super::*;
    use core::iter;

    #[test]
    fn roundtrip() {
        // A multiple of 8, 6, 4, 3, 2, 1
        const LEN: u8 = 240;
        let previous: Vec<_> = iter::repeat(1).take(LEN.into()).collect();
        let current: Vec<_> = (0..LEN).collect();
        let expected = current.clone();
        let adaptive = AdaptiveFilterType::NonAdaptive;

        let roundtrip = |kind, bpp: BytesPerPixel| {
            let mut output = vec![0; LEN.into()];
            filter(kind, adaptive, bpp, &previous, &current, &mut output);
            unfilter(kind, bpp, &previous, &mut output);
            assert_eq!(
                output, expected,
                "Filtering {:?} with {:?} does not roundtrip",
                bpp, kind
            );
        };

        let filters = [
            FilterType::NoFilter,
            FilterType::Sub,
            FilterType::Up,
            FilterType::Avg,
            FilterType::Paeth,
        ];

        let bpps = [
            BytesPerPixel::One,
            BytesPerPixel::Two,
            BytesPerPixel::Three,
            BytesPerPixel::Four,
            BytesPerPixel::Six,
            BytesPerPixel::Eight,
        ];

        for &filter in filters.iter() {
            for &bpp in bpps.iter() {
                roundtrip(filter, bpp);
            }
        }
    }

    #[test]
    #[ignore] // takes ~20s without optimizations
    fn paeth_impls_are_equivalent() {
        for a in 0..=255 {
            for b in 0..=255 {
                for c in 0..=255 {
                    let baseline = filter_paeth(a, b, c);
                    let fpnge = filter_paeth_fpnge(a, b, c);
                    let stbi = filter_paeth_stbi(a, b, c);
                    let stbi_i16 = filter_paeth_stbi_i16(a as i16, b as i16, c as i16);

                    assert_eq!(baseline, fpnge);
                    assert_eq!(baseline, stbi);
                    assert_eq!(baseline as i16, stbi_i16);
                }
            }
        }
    }
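
    #[test]
    fn from_u8_roundtrip() {
        // Small added sanity check: `FilterType::from_u8` must agree with the
        // `#[repr(u8)]` discriminants and reject out-of-range filter bytes.
        let all = [
            FilterType::NoFilter,
            FilterType::Sub,
            FilterType::Up,
            FilterType::Avg,
            FilterType::Paeth,
        ];
        for &filter in all.iter() {
            assert_eq!(FilterType::from_u8(filter as u8), Some(filter));
        }
        for n in 5..=255u8 {
            assert_eq!(FilterType::from_u8(n), None);
        }
    }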

    #[test]
    fn roundtrip_ascending_previous_line() {
        // A multiple of 8, 6, 4, 3, 2, 1
        const LEN: u8 = 240;
        let previous: Vec<_> = (0..LEN).collect();
        let current: Vec<_> = (0..LEN).collect();
        let expected = current.clone();
        let adaptive = AdaptiveFilterType::NonAdaptive;

        let roundtrip = |kind, bpp: BytesPerPixel| {
            let mut output = vec![0; LEN.into()];
            filter(kind, adaptive, bpp, &previous, &current, &mut output);
            unfilter(kind, bpp, &previous, &mut output);
            assert_eq!(
                output, expected,
                "Filtering {:?} with {:?} does not roundtrip",
                bpp, kind
            );
        };

        let filters = [
            FilterType::NoFilter,
            FilterType::Sub,
            FilterType::Up,
            FilterType::Avg,
            FilterType::Paeth,
        ];

        let bpps = [
            BytesPerPixel::One,
            BytesPerPixel::Two,
            BytesPerPixel::Three,
            BytesPerPixel::Four,
            BytesPerPixel::Six,
            BytesPerPixel::Eight,
        ];

        for &filter in filters.iter() {
            for &bpp in bpps.iter() {
                roundtrip(filter, bpp);
            }
        }
    }

    #[test]
    // This tests that converting u8 to i8 doesn't overflow when taking the
    // absolute value for adaptive filtering: -128_i8.abs() will panic in debug
    // or produce garbage in release mode. Summing the buffer 0..=255u8 with
    // `sum_buffer` should equal the sum of the absolute values of -128_i8..=127,
    // i.e. abs(-128..=0) + (1..=127).
    fn sum_buffer_test() {
        let sum = (0..=128).sum::<u64>() + (1..=127).sum::<u64>();
        let buf: Vec<u8> = (0_u8..=255).collect();

        assert_eq!(sum, crate::filter::sum_buffer(&buf));
    }
}