1 | use core::convert::TryInto; |
2 | |
3 | use crate::common::BytesPerPixel; |
4 | |
5 | /// SIMD helpers for `fn unfilter` |
6 | /// |
7 | /// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the |
8 | /// "unstable" feature of the `png` crate. This should be possible once the "portable_simd" |
9 | /// feature of Rust gets stabilized. |
10 | /// |
/// This is only known to help on x86: on ARM most benchmarks show no measurable change
/// and some regress severely.
/// So despite the code being portable, we only enable this for x86.
14 | /// We can add more platforms once this code is proven to be beneficial for them. |
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
16 | mod simd { |
17 | use std::simd::num::{SimdInt, SimdUint}; |
18 | use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; |
19 | |
20 | /// Scalar Paeth function wrapped in SIMD scaffolding. |
21 | /// |
22 | /// This is needed because simply running the function on the inputs |
23 | /// makes the compiler think our inputs are too short |
24 | /// to benefit from vectorization. |
25 | /// Putting it in SIMD scaffolding fixes that. |
26 | /// https://github.com/image-rs/image-png/issues/511 |
27 | /// |
28 | /// Funnily, the autovectorizer does a better job here |
29 | /// than a handwritten algorithm using std::simd! |
30 | /// We used to have a handwritten one but this is just faster. |
31 | fn paeth_predictor<const N: usize>( |
32 | a: Simd<i16, N>, |
33 | b: Simd<i16, N>, |
34 | c: Simd<i16, N>, |
35 | ) -> Simd<i16, N> |
36 | where |
37 | LaneCount<N>: SupportedLaneCount, |
38 | { |
39 | let mut out = [0; N]; |
40 | for i in 0..N { |
41 | out[i] = super::filter_paeth_stbi_i16(a[i].into(), b[i].into(), c[i].into()); |
42 | } |
43 | out.into() |
44 | } |
45 | |
46 | /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert |
47 | /// the SIMD elements to `i16`. |
48 | fn paeth_predictor_u8<const N: usize>( |
49 | a: Simd<u8, N>, |
50 | b: Simd<u8, N>, |
51 | c: Simd<u8, N>, |
52 | ) -> Simd<u8, N> |
53 | where |
54 | LaneCount<N>: SupportedLaneCount, |
55 | { |
56 | let mut out = [0; N]; |
57 | for i in 0..N { |
58 | out[i] = super::filter_paeth_stbi(a[i].into(), b[i].into(), c[i].into()); |
59 | } |
60 | out.into() |
61 | } |
62 | |
63 | /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`). |
64 | /// See also https://www.w3.org/TR/png/#filter-byte-positions |
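///
/// In the spec's byte-position diagram, this struct keeps the `c` and `a` positions around
/// between steps:
///
/// ```text
/// c b
/// a x
/// ```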
#[derive(Default)]
66 | struct PaethState<T, const N: usize> |
67 | where |
68 | T: SimdElement, |
69 | LaneCount<N>: SupportedLaneCount, |
70 | { |
71 | /// Previous pixel in the previous row. |
72 | c: Simd<T, N>, |
73 | |
74 | /// Previous pixel in the current row. |
75 | a: Simd<T, N>, |
76 | } |
77 | |
78 | /// Mutates `x` as needed to unfilter `FilterType::Paeth`. |
79 | /// |
80 | /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row. |
81 | /// See also https://www.w3.org/TR/png/#filter-byte-positions |
82 | fn paeth_step<const N: usize>( |
83 | state: &mut PaethState<i16, N>, |
84 | b: Simd<u8, N>, |
85 | x: &mut Simd<u8, N>, |
86 | ) where |
87 | LaneCount<N>: SupportedLaneCount, |
88 | { |
89 | // Storing the inputs. |
90 | let b = b.cast::<i16>(); |
91 | |
92 | // Calculating the new value of the current pixel. |
93 | let predictor = paeth_predictor(state.a, b, state.c); |
94 | *x += predictor.cast::<u8>(); |
95 | |
96 | // Preparing for the next step. |
97 | state.c = b; |
98 | state.a = x.cast::<i16>(); |
99 | } |
100 | |
101 | /// Computes the Paeth predictor without converting `u8` to `i16`. |
102 | /// |
103 | /// See `simd::paeth_step`. |
104 | fn paeth_step_u8<const N: usize>( |
105 | state: &mut PaethState<u8, N>, |
106 | b: Simd<u8, N>, |
107 | x: &mut Simd<u8, N>, |
108 | ) where |
109 | LaneCount<N>: SupportedLaneCount, |
110 | { |
111 | // Calculating the new value of the current pixel. |
112 | *x += paeth_predictor_u8(state.a, b, state.c); |
113 | |
114 | // Preparing for the next step. |
115 | state.c = b; |
116 | state.a = *x; |
117 | } |
118 | |
119 | fn load3(src: &[u8]) -> u8x4 { |
120 | u8x4::from_array([src[0], src[1], src[2], 0]) |
121 | } |
122 | |
123 | fn store3(src: u8x4, dest: &mut [u8]) { |
124 | dest[0..3].copy_from_slice(&src.to_array()[0..3]) |
125 | } |
126 | |
127 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. |
128 | pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { |
129 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
130 | debug_assert_eq!(prev_row.len() % 3, 0); |
131 | |
132 | let mut state = PaethState::<i16, 4>::default(); |
133 | while prev_row.len() >= 4 { |
134 | // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first |
135 | // byte from the next triple. This optimization technique mimics the algorithm found |
136 | // in |
137 | // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 |
138 | let b = u8x4::from_slice(prev_row); |
139 | let mut x = u8x4::from_slice(curr_row); |
140 | |
141 | paeth_step(&mut state, b, &mut x); |
142 | |
143 | // We can speculate that writing 4 bytes might be more efficient (just as with using |
144 | // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the |
145 | // first byte of the next pixel in the `curr_row`. |
146 | store3(x, curr_row); |
147 | |
148 | prev_row = &prev_row[3..]; |
149 | curr_row = &mut curr_row[3..]; |
150 | } |
151 | // Can't use `u8x4::from_slice` for the last `[u8;3]`. |
152 | let b = load3(prev_row); |
153 | let mut x = load3(curr_row); |
154 | paeth_step(&mut state, b, &mut x); |
155 | store3(x, curr_row); |
156 | } |
157 | |
158 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`. |
159 | /// |
160 | /// This function calculates the Paeth predictor entirely in `Simd<u8, N>` |
161 | /// without converting to an intermediate `Simd<i16, N>`. Doing so avoids |
162 | /// paying a small performance penalty converting between types. |
163 | pub fn unfilter_paeth_u8<const N: usize>(prev_row: &[u8], curr_row: &mut [u8]) |
164 | where |
165 | LaneCount<N>: SupportedLaneCount, |
166 | { |
167 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
168 | debug_assert_eq!(prev_row.len() % N, 0); |
169 | assert!(matches!(N, 4 | 8)); |
170 | |
171 | let mut state = PaethState::<u8, N>::default(); |
172 | for (prev_row, curr_row) in prev_row.chunks_exact(N).zip(curr_row.chunks_exact_mut(N)) { |
173 | let b = Simd::from_slice(prev_row); |
174 | let mut x = Simd::from_slice(curr_row); |
175 | |
176 | paeth_step_u8(&mut state, b, &mut x); |
177 | |
178 | curr_row[..N].copy_from_slice(&x.to_array()[..N]); |
179 | } |
180 | } |
181 | |
182 | fn load6(src: &[u8]) -> u8x8 { |
183 | u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) |
184 | } |
185 | |
186 | fn store6(src: u8x8, dest: &mut [u8]) { |
187 | dest[0..6].copy_from_slice(&src.to_array()[0..6]) |
188 | } |
189 | |
190 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. |
191 | pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { |
192 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
193 | debug_assert_eq!(prev_row.len() % 6, 0); |
194 | |
195 | let mut state = PaethState::<i16, 8>::default(); |
196 | while prev_row.len() >= 8 { |
197 | // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two |
198 | // bytes from the next pixel. This optimization technique mimics the algorithm found |
199 | // in |
200 | // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 |
201 | let b = u8x8::from_slice(prev_row); |
202 | let mut x = u8x8::from_slice(curr_row); |
203 | |
204 | paeth_step(&mut state, b, &mut x); |
205 | |
206 | // We can speculate that writing 8 bytes might be more efficient (just as with using |
207 | // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the |
208 | // first bytes of the next pixel in the `curr_row`. |
209 | store6(x, curr_row); |
210 | |
211 | prev_row = &prev_row[6..]; |
212 | curr_row = &mut curr_row[6..]; |
213 | } |
214 | // Can't use `u8x8::from_slice` for the last `[u8;6]`. |
215 | let b = load6(prev_row); |
216 | let mut x = load6(curr_row); |
217 | paeth_step(&mut state, b, &mut x); |
218 | store6(x, curr_row); |
219 | } |
220 | } |
221 | |
222 | /// The byte level filter applied to scanlines to prepare them for compression. |
223 | /// |
224 | /// Compression in general benefits from repetitive data. The filter is a content-aware method of |
225 | /// compressing the range of occurring byte values to help the compression algorithm. Note that |
226 | /// this does not operate on pixels but on raw bytes of a scanline. |
227 | /// |
228 | /// Details on how each filter works can be found in the [PNG Book](http://www.libpng.org/pub/png/book/chapter09.html). |
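///
/// A small illustration of the idea (with the simplest filter, `Sub`, at one byte per pixel):
/// each raw byte is replaced by its wrapping difference from the byte `bpp` positions earlier,
/// so smooth gradients become runs of small, repetitive values:
///
/// ```text
/// raw bytes:    10  20  30  40
/// Sub-filtered: 10  10  10  10
/// ```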
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
231 | pub enum FilterType { |
232 | NoFilter = 0, |
233 | Sub = 1, |
234 | Up = 2, |
235 | Avg = 3, |
236 | Paeth = 4, |
237 | } |
238 | |
239 | impl Default for FilterType { |
240 | fn default() -> Self { |
241 | FilterType::Sub |
242 | } |
243 | } |
244 | |
245 | impl FilterType { |
246 | /// u8 -> Self. Temporary solution until Rust provides a canonical one. |
247 | pub fn from_u8(n: u8) -> Option<FilterType> { |
248 | match n { |
249 | 0 => Some(FilterType::NoFilter), |
250 | 1 => Some(FilterType::Sub), |
251 | 2 => Some(FilterType::Up), |
252 | 3 => Some(FilterType::Avg), |
253 | 4 => Some(FilterType::Paeth), |
254 | _ => None, |
255 | } |
256 | } |
257 | } |
258 | |
259 | /// Adaptive filtering tries every possible filter for each row and uses a heuristic to select the best one. |
260 | /// This improves compression ratio, but makes encoding slightly slower. |
261 | /// |
262 | /// It is recommended to use `Adaptive` whenever you care about compression ratio. |
263 | /// Filtering is quite cheap compared to other parts of encoding, but can contribute |
264 | /// to the compression ratio significantly. |
265 | /// |
/// `NonAdaptive` filtering is the default; a usage sketch follows this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
269 | pub enum AdaptiveFilterType { |
270 | Adaptive, |
271 | NonAdaptive, |
272 | } |
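//
// A hedged usage sketch (not part of this module): it assumes the crate-level `Encoder`
// type and its `set_adaptive_filter` setter, plus caller-provided `writer`, `width` and
// `height`. With `Adaptive`, `fn filter` below scores each row's Sub/Up/Avg/Paeth
// candidates with `sum_buffer` and keeps the cheapest one.
// ```
// let mut encoder = png::Encoder::new(writer, width, height);
// encoder.set_adaptive_filter(png::AdaptiveFilterType::Adaptive);
// ```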
273 | |
274 | impl Default for AdaptiveFilterType { |
275 | fn default() -> Self { |
276 | AdaptiveFilterType::NonAdaptive |
277 | } |
278 | } |
279 | |
280 | fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { |
// On ARM this algorithm performs much better than the stb-derived `filter_paeth_stbi`
// below, and this is the better-studied algorithm we've always used here,
// so we default to it on all non-x86 platforms.
284 | let pa: i16 = (i16::from(b) - i16::from(c)).abs(); |
285 | let pb: i16 = (i16::from(a) - i16::from(c)).abs(); |
286 | let pc: i16 = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs(); |
287 | |
288 | let mut out: u8 = a; |
289 | let mut min: i16 = pa; |
290 | |
291 | if pb < min { |
292 | min = pb; |
293 | out = b; |
294 | } |
295 | if pc < min { |
296 | out = c; |
297 | } |
298 | |
299 | out |
300 | } |
301 | |
302 | fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 { |
303 | // Decoding optimizes better with this algorithm than with `filter_paeth` |
304 | // |
305 | // This formulation looks very different from the reference in the PNG spec, but is |
306 | // actually equivalent and has favorable data dependencies and admits straightforward |
307 | // generation of branch-free code, which helps performance significantly. |
308 | // |
309 | // Adapted from public domain PNG implementation: |
310 | // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668 |
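//
// One concrete check (values picked arbitrarily): for a=50, b=100, c=60 the spec's
// predictor is p = a + b - c = 90, with |p-a| = 40, |p-b| = 10, |p-c| = 30, so it picks b.
// Here: thresh = 3*60 - (50 + 100) = 30, lo = 50, hi = 100; `hi <= thresh` is false so
// t0 = c = 60, then `thresh <= lo` is true so t1 = hi = 100 = b, matching the spec.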
311 | let thresh: i16 = i16::from(c) * 3 - (i16::from(a) + i16::from(b)); |
312 | let lo: u8 = a.min(b); |
313 | let hi: u8 = a.max(b); |
314 | let t0: u8 = if hi as i16 <= thresh { lo } else { c }; |
315 | let t1: u8 = if thresh <= lo as i16 { hi } else { t0 }; |
316 | return t1; |
317 | } |
318 | |
#[cfg(any(test, all(feature = "unstable", target_arch = "x86_64")))]
320 | fn filter_paeth_stbi_i16(a: i16, b: i16, c: i16) -> i16 { |
321 | // Like `filter_paeth_stbi` but vectorizes better when wrapped in SIMD types. |
322 | // Used for bpp=3 and bpp=6 |
323 | let thresh = c * 3 - (a + b); |
324 | let lo = a.min(b); |
325 | let hi = a.max(b); |
326 | let t0 = if hi <= thresh { lo } else { c }; |
327 | let t1 = if thresh <= lo { hi } else { t0 }; |
328 | return t1; |
329 | } |
330 | |
331 | fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 { |
332 | // This is an optimized version of the paeth filter from the PNG specification, proposed by |
333 | // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates |
334 | // entirely on unsigned 8-bit quantities, making it more conducive to vectorization. |
335 | // |
336 | // p = a + b - c |
337 | // pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c) |
338 | // pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c) |
339 | // pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ... |
340 | // |
// Further optimizing the calculation of `pc` is a bit trickier. However, notice that:
342 | // |
343 | // a > c && b > c |
344 | // ==> (a - c) > 0 && (b - c) > 0 |
345 | // ==> pc > (a - c) && pc > (b - c) |
346 | // ==> pc > |a - c| && pc > |b - c| |
347 | // ==> pc > pb && pc > pa |
348 | // |
349 | // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar |
// reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= a` and
351 | // `c <= b` or vice versa: |
352 | // |
353 | // pc = ||b - c| - |a - c|| = |pa - pb| = max(pa, pb) - min(pa, pb) |
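//
// A quick sanity check (a=50, b=100, c=60, picked arbitrarily): pa = 100 - 60 = 40,
// pb = 60 - 50 = 10, and since c lies between a and b, pc = 40 - 10 = 30 = |a + b - 2c|.
// The smallest is pb, so this returns b, exactly what the spec's signed formulation picks.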
354 | // |
355 | let pa = b.max(c) - c.min(b); |
356 | let pb = a.max(c) - c.min(a); |
357 | let pc = if (a < c) == (c < b) { |
358 | pa.max(pb) - pa.min(pb) |
359 | } else { |
360 | 255 |
361 | }; |
362 | |
363 | if pa <= pb && pa <= pc { |
364 | a |
365 | } else if pb <= pc { |
366 | b |
367 | } else { |
368 | c |
369 | } |
370 | } |
371 | |
372 | pub(crate) fn unfilter( |
373 | mut filter: FilterType, |
374 | tbpp: BytesPerPixel, |
375 | previous: &[u8], |
376 | current: &mut [u8], |
377 | ) { |
378 | use self::FilterType::*; |
379 | |
380 | // If the previous row is empty, then treat it as if it were filled with zeros. |
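// With an all-zero previous row, `b` and `c` are zero, so the Paeth predictor always
// selects `a` and Paeth degenerates to `Sub`; likewise `Up` adds zero everywhere and
// degenerates to `NoFilter`. (`Avg` gets its own empty-previous arm further below.)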
381 | if previous.is_empty() { |
382 | if filter == Paeth { |
383 | filter = Sub; |
384 | } else if filter == Up { |
385 | filter = NoFilter; |
386 | } |
387 | } |
388 | |
389 | // [2023/01 @okaneco] - Notes on optimizing decoding filters |
390 | // |
391 | // Links: |
392 | // [PR]: https://github.com/image-rs/image-png/pull/382 |
393 | // [SWAR]: http://aggregate.org/SWAR/over.html |
394 | // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers |
395 | // |
396 | // #382 heavily refactored and optimized the following filters making the |
397 | // implementation nonobvious. These comments function as a summary of that |
398 | // PR with an explanation of the choices made below. |
399 | // |
400 | // #382 originally started with trying to optimize using a technique called |
401 | // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32` |
402 | // and `u64` as SIMD registers to perform vertical operations in parallel, |
403 | // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp) |
// pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
405 | // in a `u64`. The `Sub` filter looked like the following code block, `Avg` |
406 | // was similar but used a bitwise average method from [AVG]: |
407 | // ``` |
408 | // // See "Unpartitioned Operations With Correction Code" from [SWAR] |
409 | // fn swar_add_u32(x: u32, y: u32) -> u32 { |
410 | // // 7-bit addition so there's no carry over the most significant bit |
411 | // let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111 |
412 | // // 1-bit parity/XOR addition to fill in the missing MSB |
413 | // n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000 |
414 | // } |
415 | // |
416 | // let mut prev = |
417 | // u32::from_ne_bytes([current[0], current[1], current[2], current[3]]); |
418 | // for chunk in current[4..].chunks_exact_mut(4) { |
419 | // let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]); |
420 | // let new_chunk = swar_add_u32(cur, prev); |
421 | // chunk.copy_from_slice(&new_chunk.to_ne_bytes()); |
422 | // prev = new_chunk; |
423 | // } |
424 | // ``` |
425 | // While this provided a measurable increase, @fintelia found that this idea |
426 | // could be taken even further by unrolling the chunks component-wise and |
427 | // avoiding unnecessary byte-shuffling by using byte arrays instead of |
428 | // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary |
429 | // so they were reverted to their obvious arithmetic equivalent. Lastly, |
430 | // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now |
431 | // looked like this (with asserts to remove `0..bpp` bounds checks): |
432 | // ``` |
433 | // assert!(len > 3); |
434 | // let mut prev = [current[0], current[1], current[2], current[3]]; |
435 | // for chunk in current[4..].chunks_exact_mut(4) { |
436 | // let new_chunk = [ |
437 | // chunk[0].wrapping_add(prev[0]), |
438 | // chunk[1].wrapping_add(prev[1]), |
439 | // chunk[2].wrapping_add(prev[2]), |
440 | // chunk[3].wrapping_add(prev[3]), |
441 | // ]; |
442 | // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
443 | // prev = new_chunk; |
444 | // } |
445 | // ``` |
446 | // The compiler was able to optimize the code to be even faster and this |
447 | // method even sped up Paeth filtering! Assertions were experimentally |
448 | // added within loop bodies which produced better instructions but no |
449 | // difference in speed. Finally, the code was refactored to remove manual |
450 | // slicing and start the previous pixel chunks with arrays of `[0; N]`. |
451 | // ``` |
452 | // let mut prev = [0; 4]; |
453 | // for chunk in current.chunks_exact_mut(4) { |
454 | // let new_chunk = [ |
455 | // chunk[0].wrapping_add(prev[0]), |
456 | // chunk[1].wrapping_add(prev[1]), |
457 | // chunk[2].wrapping_add(prev[2]), |
458 | // chunk[3].wrapping_add(prev[3]), |
459 | // ]; |
460 | // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
461 | // prev = new_chunk; |
462 | // } |
463 | // ``` |
464 | // While we're not manually bit-twiddling anymore, a possible takeaway from |
465 | // this is to "think in SWAR" when dealing with small byte arrays. Unrolling |
466 | // array operations and performing them component-wise may unlock previously |
467 | // unavailable optimizations from the compiler, even when using the |
468 | // `chunks_exact` methods for their potential auto-vectorization benefits. |
469 | match filter { |
470 | NoFilter => {} |
471 | Sub => match tbpp { |
472 | BytesPerPixel::One => { |
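// `reduce` feeds each freshly unfiltered byte back in as `prev` for the next
// iteration, so this is a wrapping prefix sum over the row.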
473 | current.iter_mut().reduce(|&mut prev, curr| { |
474 | *curr = curr.wrapping_add(prev); |
475 | curr |
476 | }); |
477 | } |
478 | BytesPerPixel::Two => { |
479 | let mut prev = [0; 2]; |
480 | for chunk in current.chunks_exact_mut(2) { |
481 | let new_chunk = [ |
482 | chunk[0].wrapping_add(prev[0]), |
483 | chunk[1].wrapping_add(prev[1]), |
484 | ]; |
485 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
486 | prev = new_chunk; |
487 | } |
488 | } |
489 | BytesPerPixel::Three => { |
490 | let mut prev = [0; 3]; |
491 | for chunk in current.chunks_exact_mut(3) { |
492 | let new_chunk = [ |
493 | chunk[0].wrapping_add(prev[0]), |
494 | chunk[1].wrapping_add(prev[1]), |
495 | chunk[2].wrapping_add(prev[2]), |
496 | ]; |
497 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
498 | prev = new_chunk; |
499 | } |
500 | } |
501 | BytesPerPixel::Four => { |
502 | let mut prev = [0; 4]; |
503 | for chunk in current.chunks_exact_mut(4) { |
504 | let new_chunk = [ |
505 | chunk[0].wrapping_add(prev[0]), |
506 | chunk[1].wrapping_add(prev[1]), |
507 | chunk[2].wrapping_add(prev[2]), |
508 | chunk[3].wrapping_add(prev[3]), |
509 | ]; |
510 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
511 | prev = new_chunk; |
512 | } |
513 | } |
514 | BytesPerPixel::Six => { |
515 | let mut prev = [0; 6]; |
516 | for chunk in current.chunks_exact_mut(6) { |
517 | let new_chunk = [ |
518 | chunk[0].wrapping_add(prev[0]), |
519 | chunk[1].wrapping_add(prev[1]), |
520 | chunk[2].wrapping_add(prev[2]), |
521 | chunk[3].wrapping_add(prev[3]), |
522 | chunk[4].wrapping_add(prev[4]), |
523 | chunk[5].wrapping_add(prev[5]), |
524 | ]; |
525 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
526 | prev = new_chunk; |
527 | } |
528 | } |
529 | BytesPerPixel::Eight => { |
530 | let mut prev = [0; 8]; |
531 | for chunk in current.chunks_exact_mut(8) { |
532 | let new_chunk = [ |
533 | chunk[0].wrapping_add(prev[0]), |
534 | chunk[1].wrapping_add(prev[1]), |
535 | chunk[2].wrapping_add(prev[2]), |
536 | chunk[3].wrapping_add(prev[3]), |
537 | chunk[4].wrapping_add(prev[4]), |
538 | chunk[5].wrapping_add(prev[5]), |
539 | chunk[6].wrapping_add(prev[6]), |
540 | chunk[7].wrapping_add(prev[7]), |
541 | ]; |
542 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
543 | prev = new_chunk; |
544 | } |
545 | } |
546 | }, |
547 | Up => { |
548 | for (curr, &above) in current.iter_mut().zip(previous) { |
549 | *curr = curr.wrapping_add(above); |
550 | } |
551 | } |
552 | Avg if previous.is_empty() => match tbpp { |
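// First-row special case: with the previous row treated as zeros, the spec's
// `floor((a + b) / 2)` predictor reduces to `a / 2`, so only the left neighbor is used.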
553 | BytesPerPixel::One => { |
554 | current.iter_mut().reduce(|&mut prev, curr| { |
555 | *curr = curr.wrapping_add(prev / 2); |
556 | curr |
557 | }); |
558 | } |
559 | BytesPerPixel::Two => { |
560 | let mut prev = [0; 2]; |
561 | for chunk in current.chunks_exact_mut(2) { |
562 | let new_chunk = [ |
563 | chunk[0].wrapping_add(prev[0] / 2), |
564 | chunk[1].wrapping_add(prev[1] / 2), |
565 | ]; |
566 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
567 | prev = new_chunk; |
568 | } |
569 | } |
570 | BytesPerPixel::Three => { |
571 | let mut prev = [0; 3]; |
572 | for chunk in current.chunks_exact_mut(3) { |
573 | let new_chunk = [ |
574 | chunk[0].wrapping_add(prev[0] / 2), |
575 | chunk[1].wrapping_add(prev[1] / 2), |
576 | chunk[2].wrapping_add(prev[2] / 2), |
577 | ]; |
578 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
579 | prev = new_chunk; |
580 | } |
581 | } |
582 | BytesPerPixel::Four => { |
583 | let mut prev = [0; 4]; |
584 | for chunk in current.chunks_exact_mut(4) { |
585 | let new_chunk = [ |
586 | chunk[0].wrapping_add(prev[0] / 2), |
587 | chunk[1].wrapping_add(prev[1] / 2), |
588 | chunk[2].wrapping_add(prev[2] / 2), |
589 | chunk[3].wrapping_add(prev[3] / 2), |
590 | ]; |
591 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
592 | prev = new_chunk; |
593 | } |
594 | } |
595 | BytesPerPixel::Six => { |
596 | let mut prev = [0; 6]; |
597 | for chunk in current.chunks_exact_mut(6) { |
598 | let new_chunk = [ |
599 | chunk[0].wrapping_add(prev[0] / 2), |
600 | chunk[1].wrapping_add(prev[1] / 2), |
601 | chunk[2].wrapping_add(prev[2] / 2), |
602 | chunk[3].wrapping_add(prev[3] / 2), |
603 | chunk[4].wrapping_add(prev[4] / 2), |
604 | chunk[5].wrapping_add(prev[5] / 2), |
605 | ]; |
606 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
607 | prev = new_chunk; |
608 | } |
609 | } |
610 | BytesPerPixel::Eight => { |
611 | let mut prev = [0; 8]; |
612 | for chunk in current.chunks_exact_mut(8) { |
613 | let new_chunk = [ |
614 | chunk[0].wrapping_add(prev[0] / 2), |
615 | chunk[1].wrapping_add(prev[1] / 2), |
616 | chunk[2].wrapping_add(prev[2] / 2), |
617 | chunk[3].wrapping_add(prev[3] / 2), |
618 | chunk[4].wrapping_add(prev[4] / 2), |
619 | chunk[5].wrapping_add(prev[5] / 2), |
620 | chunk[6].wrapping_add(prev[6] / 2), |
621 | chunk[7].wrapping_add(prev[7] / 2), |
622 | ]; |
623 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
624 | prev = new_chunk; |
625 | } |
626 | } |
627 | }, |
628 | Avg => match tbpp { |
629 | BytesPerPixel::One => { |
630 | let mut lprev = [0; 1]; |
631 | for (chunk, above) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) { |
632 | let new_chunk = |
633 | [chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8)]; |
634 | *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk; |
635 | lprev = new_chunk; |
636 | } |
637 | } |
638 | BytesPerPixel::Two => { |
639 | let mut lprev = [0; 2]; |
640 | for (chunk, above) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) { |
641 | let new_chunk = [ |
642 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
643 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
644 | ]; |
645 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
646 | lprev = new_chunk; |
647 | } |
648 | } |
649 | BytesPerPixel::Three => { |
650 | let mut lprev = [0; 3]; |
651 | for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) { |
652 | let new_chunk = [ |
653 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
654 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
655 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
656 | ]; |
657 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
658 | lprev = new_chunk; |
659 | } |
660 | } |
661 | BytesPerPixel::Four => { |
662 | let mut lprev = [0; 4]; |
663 | for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) { |
664 | let new_chunk = [ |
665 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
666 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
667 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
668 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
669 | ]; |
670 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
671 | lprev = new_chunk; |
672 | } |
673 | } |
674 | BytesPerPixel::Six => { |
675 | let mut lprev = [0; 6]; |
676 | for (chunk, above) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) { |
677 | let new_chunk = [ |
678 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
679 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
680 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
681 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
682 | chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8), |
683 | chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8), |
684 | ]; |
685 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
686 | lprev = new_chunk; |
687 | } |
688 | } |
689 | BytesPerPixel::Eight => { |
690 | let mut lprev = [0; 8]; |
691 | for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) { |
692 | let new_chunk = [ |
693 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
694 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
695 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
696 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
697 | chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8), |
698 | chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8), |
699 | chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8), |
700 | chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8), |
701 | ]; |
702 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
703 | lprev = new_chunk; |
704 | } |
705 | } |
706 | }, |
#[allow(unreachable_code)]
708 | Paeth => { |
709 | // Select the fastest Paeth filter implementation based on the target architecture. |
710 | let filter_paeth_decode = if cfg!(target_arch = "x86_64" ) { |
711 | filter_paeth_stbi |
712 | } else { |
713 | filter_paeth |
714 | }; |
715 | |
716 | // Paeth filter pixels: |
717 | // C B D |
718 | // A X |
719 | match tbpp { |
720 | BytesPerPixel::One => { |
721 | let mut a_bpp = [0; 1]; |
722 | let mut c_bpp = [0; 1]; |
723 | for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) |
724 | { |
725 | let new_chunk = [chunk[0] |
726 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))]; |
727 | *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk; |
728 | a_bpp = new_chunk; |
729 | c_bpp = b_bpp.try_into().unwrap(); |
730 | } |
731 | } |
732 | BytesPerPixel::Two => { |
733 | let mut a_bpp = [0; 2]; |
734 | let mut c_bpp = [0; 2]; |
735 | for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) |
736 | { |
737 | let new_chunk = [ |
738 | chunk[0] |
739 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
740 | chunk[1] |
741 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
742 | ]; |
743 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
744 | a_bpp = new_chunk; |
745 | c_bpp = b_bpp.try_into().unwrap(); |
746 | } |
747 | } |
748 | BytesPerPixel::Three => { |
749 | // Do not enable this algorithm on ARM, that would be a big performance hit |
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
751 | { |
752 | simd::unfilter_paeth3(previous, current); |
753 | return; |
754 | } |
755 | |
756 | let mut a_bpp = [0; 3]; |
757 | let mut c_bpp = [0; 3]; |
758 | for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) |
759 | { |
760 | let new_chunk = [ |
761 | chunk[0] |
762 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
763 | chunk[1] |
764 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
765 | chunk[2] |
766 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
767 | ]; |
768 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
769 | a_bpp = new_chunk; |
770 | c_bpp = b_bpp.try_into().unwrap(); |
771 | } |
772 | } |
773 | BytesPerPixel::Four => { |
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
775 | { |
776 | simd::unfilter_paeth_u8::<4>(previous, current); |
777 | return; |
778 | } |
779 | |
780 | let mut a_bpp = [0; 4]; |
781 | let mut c_bpp = [0; 4]; |
782 | for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) |
783 | { |
784 | let new_chunk = [ |
785 | chunk[0] |
786 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
787 | chunk[1] |
788 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
789 | chunk[2] |
790 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
791 | chunk[3] |
792 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
793 | ]; |
794 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
795 | a_bpp = new_chunk; |
796 | c_bpp = b_bpp.try_into().unwrap(); |
797 | } |
798 | } |
799 | BytesPerPixel::Six => { |
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
801 | { |
802 | simd::unfilter_paeth6(previous, current); |
803 | return; |
804 | } |
805 | |
806 | let mut a_bpp = [0; 6]; |
807 | let mut c_bpp = [0; 6]; |
808 | for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) |
809 | { |
810 | let new_chunk = [ |
811 | chunk[0] |
812 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
813 | chunk[1] |
814 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
815 | chunk[2] |
816 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
817 | chunk[3] |
818 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
819 | chunk[4] |
820 | .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), |
821 | chunk[5] |
822 | .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), |
823 | ]; |
824 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
825 | a_bpp = new_chunk; |
826 | c_bpp = b_bpp.try_into().unwrap(); |
827 | } |
828 | } |
829 | BytesPerPixel::Eight => { |
#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
831 | { |
832 | simd::unfilter_paeth_u8::<8>(previous, current); |
833 | return; |
834 | } |
835 | |
836 | let mut a_bpp = [0; 8]; |
837 | let mut c_bpp = [0; 8]; |
838 | for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) |
839 | { |
840 | let new_chunk = [ |
841 | chunk[0] |
842 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
843 | chunk[1] |
844 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
845 | chunk[2] |
846 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
847 | chunk[3] |
848 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
849 | chunk[4] |
850 | .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), |
851 | chunk[5] |
852 | .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), |
853 | chunk[6] |
854 | .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])), |
855 | chunk[7] |
856 | .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])), |
857 | ]; |
858 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
859 | a_bpp = new_chunk; |
860 | c_bpp = b_bpp.try_into().unwrap(); |
861 | } |
862 | } |
863 | } |
864 | } |
865 | } |
866 | } |
867 | |
868 | fn filter_internal( |
869 | method: FilterType, |
870 | bpp: usize, |
871 | len: usize, |
872 | previous: &[u8], |
873 | current: &[u8], |
874 | output: &mut [u8], |
875 | ) -> FilterType { |
876 | use self::FilterType::*; |
877 | |
878 | // This value was chosen experimentally based on what achieved the best performance. The |
// Rust compiler does auto-vectorization, and 32 bytes per loop iteration seems to enable
880 | // the fastest code when doing so. |
881 | const CHUNK_SIZE: usize = 32; |
882 | |
883 | match method { |
884 | NoFilter => { |
885 | output.copy_from_slice(current); |
886 | NoFilter |
887 | } |
888 | Sub => { |
889 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
890 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
891 | let mut prev_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
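// `prev_chunks` is the same row shifted back by `bpp`, i.e. the `a` byte for each output
// position; the first `bpp` bytes have no left neighbor and are copied verbatim below.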
892 | |
893 | for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) { |
894 | for i in 0..CHUNK_SIZE { |
895 | out[i] = cur[i].wrapping_sub(prev[i]); |
896 | } |
897 | } |
898 | |
899 | for ((out, cur), &prev) in out_chunks |
900 | .into_remainder() |
901 | .iter_mut() |
902 | .zip(cur_chunks.remainder()) |
903 | .zip(prev_chunks.remainder()) |
904 | { |
905 | *out = cur.wrapping_sub(prev); |
906 | } |
907 | |
908 | output[..bpp].copy_from_slice(¤t[..bpp]); |
909 | Sub |
910 | } |
911 | Up => { |
912 | let mut out_chunks = output.chunks_exact_mut(CHUNK_SIZE); |
913 | let mut cur_chunks = current.chunks_exact(CHUNK_SIZE); |
914 | let mut prev_chunks = previous.chunks_exact(CHUNK_SIZE); |
915 | |
916 | for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) { |
917 | for i in 0..CHUNK_SIZE { |
918 | out[i] = cur[i].wrapping_sub(prev[i]); |
919 | } |
920 | } |
921 | |
922 | for ((out, cur), &prev) in out_chunks |
923 | .into_remainder() |
924 | .iter_mut() |
925 | .zip(cur_chunks.remainder()) |
926 | .zip(prev_chunks.remainder()) |
927 | { |
928 | *out = cur.wrapping_sub(prev); |
929 | } |
930 | Up |
931 | } |
932 | Avg => { |
933 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
934 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
935 | let mut cur_minus_bpp_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
936 | let mut prev_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE); |
937 | |
938 | for (((out, cur), cur_minus_bpp), prev) in (&mut out_chunks) |
939 | .zip(&mut cur_chunks) |
940 | .zip(&mut cur_minus_bpp_chunks) |
941 | .zip(&mut prev_chunks) |
942 | { |
943 | for i in 0..CHUNK_SIZE { |
944 | // Bitwise average of two integers without overflow and |
945 | // without converting to a wider bit-width. See: |
946 | // http://aggregate.org/MAGIC/#Average%20of%20Integers |
947 | // If this is unrolled by component, consider reverting to |
948 | // `((cur_minus_bpp[i] as u16 + prev[i] as u16) / 2) as u8` |
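// Worked check: for 200 and 100, (200 & 100) + ((200 ^ 100) >> 1) = 64 + 86 = 150,
// which equals (200 + 100) / 2 but never overflows `u8`.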
949 | out[i] = cur[i].wrapping_sub( |
950 | (cur_minus_bpp[i] & prev[i]) + ((cur_minus_bpp[i] ^ prev[i]) >> 1), |
951 | ); |
952 | } |
953 | } |
954 | |
955 | for (((out, cur), &cur_minus_bpp), &prev) in out_chunks |
956 | .into_remainder() |
957 | .iter_mut() |
958 | .zip(cur_chunks.remainder()) |
959 | .zip(cur_minus_bpp_chunks.remainder()) |
960 | .zip(prev_chunks.remainder()) |
961 | { |
962 | *out = cur.wrapping_sub((cur_minus_bpp & prev) + ((cur_minus_bpp ^ prev) >> 1)); |
963 | } |
964 | |
965 | for i in 0..bpp { |
966 | output[i] = current[i].wrapping_sub(previous[i] / 2); |
967 | } |
968 | Avg |
969 | } |
970 | Paeth => { |
971 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
972 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
973 | let mut a_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
974 | let mut b_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE); |
975 | let mut c_chunks = previous[..len - bpp].chunks_exact(CHUNK_SIZE); |
976 | |
977 | for ((((out, cur), a), b), c) in (&mut out_chunks) |
978 | .zip(&mut cur_chunks) |
979 | .zip(&mut a_chunks) |
980 | .zip(&mut b_chunks) |
981 | .zip(&mut c_chunks) |
982 | { |
983 | for i in 0..CHUNK_SIZE { |
984 | out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i])); |
985 | } |
986 | } |
987 | |
988 | for ((((out, cur), &a), &b), &c) in out_chunks |
989 | .into_remainder() |
990 | .iter_mut() |
991 | .zip(cur_chunks.remainder()) |
992 | .zip(a_chunks.remainder()) |
993 | .zip(b_chunks.remainder()) |
994 | .zip(c_chunks.remainder()) |
995 | { |
996 | *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c)); |
997 | } |
998 | |
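// The first `bpp` bytes have no `a` or `c` neighbor; with a = c = 0 the Paeth predictor
// is simply `b`, the byte directly above.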
999 | for i in 0..bpp { |
1000 | output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0)); |
1001 | } |
1002 | Paeth |
1003 | } |
1004 | } |
1005 | } |
1006 | |
1007 | pub(crate) fn filter( |
1008 | method: FilterType, |
1009 | adaptive: AdaptiveFilterType, |
1010 | bpp: BytesPerPixel, |
1011 | previous: &[u8], |
1012 | current: &[u8], |
1013 | output: &mut [u8], |
1014 | ) -> FilterType { |
1015 | use FilterType::*; |
1016 | let bpp = bpp.into_usize(); |
1017 | let len = current.len(); |
1018 | |
1019 | match adaptive { |
1020 | AdaptiveFilterType::NonAdaptive => { |
1021 | filter_internal(method, bpp, len, previous, current, output) |
1022 | } |
1023 | AdaptiveFilterType::Adaptive => { |
1024 | let mut min_sum: u64 = u64::MAX; |
1025 | let mut filter_choice = FilterType::NoFilter; |
1026 | for &filter in [Sub, Up, Avg, Paeth].iter() { |
1027 | filter_internal(filter, bpp, len, previous, current, output); |
1028 | let sum = sum_buffer(output); |
1029 | if sum <= min_sum { |
1030 | min_sum = sum; |
1031 | filter_choice = filter; |
1032 | } |
1033 | } |
1034 | |
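// `output` currently holds the Paeth-filtered row (the last candidate tried above), so
// the row only needs to be re-filtered when a different filter won.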
1035 | if filter_choice != Paeth { |
1036 | filter_internal(filter_choice, bpp, len, previous, current, output); |
1037 | } |
1038 | filter_choice |
1039 | } |
1040 | } |
1041 | } |
1042 | |
// Helper function for adaptive filter selection: scores a filtered row by summing the
// absolute values of its bytes reinterpreted as signed `i8` (the "minimum sum of absolute
// differences" heuristic).
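// For example, `sum_buffer(&[1, 255, 128])` is 1 + 1 + 128 = 130: 255 and 128 are
// reinterpreted as the signed bytes -1 and -128 before taking their absolute values.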
1044 | fn sum_buffer(buf: &[u8]) -> u64 { |
1045 | const CHUNK_SIZE: usize = 32; |
1046 | |
let mut buf_chunks = buf.chunks_exact(CHUNK_SIZE);
let mut sum = 0_u64;
1049 | |
for chunk in &mut buf_chunks {
1051 | // At most, `acc` can be `32 * (i8::MIN as u8) = 32 * 128 = 4096`. |
1052 | let mut acc: u64 = 0; |
for &b in chunk {
1054 | acc += u64::from((b as i8).unsigned_abs()); |
1055 | } |
1056 | sum = sum.saturating_add(acc); |
1057 | } |
1058 | |
1059 | let mut acc: u64 = 0; |
for &b in buf_chunks.remainder() {
1061 | acc += u64::from((b as i8).unsigned_abs()); |
1062 | } |
1063 | |
1064 | sum.saturating_add(acc) |
1065 | } |
1066 | |
#[cfg(test)]
1068 | mod test { |
1069 | use super::*; |
1070 | use core::iter; |
1071 | |
#[test]
1073 | fn roundtrip() { |
1074 | // A multiple of 8, 6, 4, 3, 2, 1 |
1075 | const LEN: u8 = 240; |
1076 | let previous: Vec<_> = iter::repeat(1).take(LEN.into()).collect(); |
1077 | let current: Vec<_> = (0..LEN).collect(); |
1078 | let expected = current.clone(); |
1079 | let adaptive = AdaptiveFilterType::NonAdaptive; |
1080 | |
1081 | let roundtrip = |kind, bpp: BytesPerPixel| { |
1082 | let mut output = vec![0; LEN.into()]; |
1083 | filter(kind, adaptive, bpp, &previous, ¤t, &mut output); |
1084 | unfilter(kind, bpp, &previous, &mut output); |
1085 | assert_eq!( |
1086 | output, expected, |
1087 | "Filtering {:?} with {:?} does not roundtrip" , |
1088 | bpp, kind |
1089 | ); |
1090 | }; |
1091 | |
1092 | let filters = [ |
1093 | FilterType::NoFilter, |
1094 | FilterType::Sub, |
1095 | FilterType::Up, |
1096 | FilterType::Avg, |
1097 | FilterType::Paeth, |
1098 | ]; |
1099 | |
1100 | let bpps = [ |
1101 | BytesPerPixel::One, |
1102 | BytesPerPixel::Two, |
1103 | BytesPerPixel::Three, |
1104 | BytesPerPixel::Four, |
1105 | BytesPerPixel::Six, |
1106 | BytesPerPixel::Eight, |
1107 | ]; |
1108 | |
1109 | for &filter in filters.iter() { |
1110 | for &bpp in bpps.iter() { |
1111 | roundtrip(filter, bpp); |
1112 | } |
1113 | } |
1114 | } |
1115 | |
#[test]
#[ignore] // takes ~20s without optimizations
1118 | fn paeth_impls_are_equivalent() { |
1119 | for a in 0..=255 { |
1120 | for b in 0..=255 { |
1121 | for c in 0..=255 { |
1122 | let baseline = filter_paeth(a, b, c); |
1123 | let fpnge = filter_paeth_fpnge(a, b, c); |
1124 | let stbi = filter_paeth_stbi(a, b, c); |
1125 | let stbi_i16 = filter_paeth_stbi_i16(a as i16, b as i16, c as i16); |
1126 | |
1127 | assert_eq!(baseline, fpnge); |
1128 | assert_eq!(baseline, stbi); |
1129 | assert_eq!(baseline as i16, stbi_i16); |
1130 | } |
1131 | } |
1132 | } |
1133 | } |
1134 | |
#[test]
1136 | fn roundtrip_ascending_previous_line() { |
1137 | // A multiple of 8, 6, 4, 3, 2, 1 |
1138 | const LEN: u8 = 240; |
1139 | let previous: Vec<_> = (0..LEN).collect(); |
1140 | let current: Vec<_> = (0..LEN).collect(); |
1141 | let expected = current.clone(); |
1142 | let adaptive = AdaptiveFilterType::NonAdaptive; |
1143 | |
1144 | let roundtrip = |kind, bpp: BytesPerPixel| { |
1145 | let mut output = vec![0; LEN.into()]; |
1146 | filter(kind, adaptive, bpp, &previous, ¤t, &mut output); |
1147 | unfilter(kind, bpp, &previous, &mut output); |
1148 | assert_eq!( |
1149 | output, expected, |
1150 | "Filtering {:?} with {:?} does not roundtrip" , |
1151 | bpp, kind |
1152 | ); |
1153 | }; |
1154 | |
1155 | let filters = [ |
1156 | FilterType::NoFilter, |
1157 | FilterType::Sub, |
1158 | FilterType::Up, |
1159 | FilterType::Avg, |
1160 | FilterType::Paeth, |
1161 | ]; |
1162 | |
1163 | let bpps = [ |
1164 | BytesPerPixel::One, |
1165 | BytesPerPixel::Two, |
1166 | BytesPerPixel::Three, |
1167 | BytesPerPixel::Four, |
1168 | BytesPerPixel::Six, |
1169 | BytesPerPixel::Eight, |
1170 | ]; |
1171 | |
1172 | for &filter in filters.iter() { |
1173 | for &bpp in bpps.iter() { |
1174 | roundtrip(filter, bpp); |
1175 | } |
1176 | } |
1177 | } |
1178 | |
#[test]
1180 | // This tests that converting u8 to i8 doesn't overflow when taking the |
1181 | // absolute value for adaptive filtering: -128_i8.abs() will panic in debug |
1182 | // or produce garbage in release mode. The sum of 0..=255u8 should equal the |
1183 | // sum of the absolute values of -128_i8..=127, or abs(-128..=0) + 1..=127. |
1184 | fn sum_buffer_test() { |
1185 | let sum = (0..=128).sum::<u64>() + (1..=127).sum::<u64>(); |
1186 | let buf: Vec<u8> = (0_u8..=255).collect(); |
1187 | |
1188 | assert_eq!(sum, crate::filter::sum_buffer(&buf)); |
1189 | } |
1190 | } |
1191 | |