| 1 | use core::convert::TryInto; |
| 2 | |
| 3 | use crate::common::BytesPerPixel; |
| 4 | |
| 5 | /// SIMD helpers for `fn unfilter` |
| 6 | /// |
| 7 | /// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the |
| 8 | /// "unstable" feature of the `png` crate. This should be possible once the "portable_simd" |
| 9 | /// feature of Rust gets stabilized. |
| 10 | /// |
| 11 | /// This is only known to help on x86: on ARM most benchmarks show no measurable change, |
| 12 | /// and some even regress severely. |
| 13 | /// So despite the code being portable, we only enable this for x86. |
| 14 | /// We can add more platforms once this code is proven to be beneficial for them. |
| 15 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 16 | mod simd { |
| 17 | use std::simd::num::{SimdInt, SimdUint}; |
| 18 | use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; |
| 19 | |
| 20 | /// Scalar Paeth function wrapped in SIMD scaffolding. |
| 21 | /// |
| 22 | /// This is needed because simply running the function on the inputs |
| 23 | /// makes the compiler think our inputs are too short |
| 24 | /// to benefit from vectorization. |
| 25 | /// Putting it in SIMD scaffolding fixes that. |
| 26 | /// https://github.com/image-rs/image-png/issues/511 |
| 27 | /// |
| 28 | /// Funnily, the autovectorizer does a better job here |
| 29 | /// than a handwritten algorithm using std::simd! |
| 30 | /// We used to have a handwritten one but this is just faster. |
| 31 | fn paeth_predictor<const N: usize>( |
| 32 | a: Simd<i16, N>, |
| 33 | b: Simd<i16, N>, |
| 34 | c: Simd<i16, N>, |
| 35 | ) -> Simd<i16, N> |
| 36 | where |
| 37 | LaneCount<N>: SupportedLaneCount, |
| 38 | { |
| 39 | let mut out = [0; N]; |
| 40 | for i in 0..N { |
| 41 | out[i] = super::filter_paeth_stbi_i16(a[i].into(), b[i].into(), c[i].into()); |
| 42 | } |
| 43 | out.into() |
| 44 | } |
| 45 | |
| 46 | /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert |
| 47 | /// the SIMD elements to `i16`. |
| 48 | fn paeth_predictor_u8<const N: usize>( |
| 49 | a: Simd<u8, N>, |
| 50 | b: Simd<u8, N>, |
| 51 | c: Simd<u8, N>, |
| 52 | ) -> Simd<u8, N> |
| 53 | where |
| 54 | LaneCount<N>: SupportedLaneCount, |
| 55 | { |
| 56 | let mut out = [0; N]; |
| 57 | for i in 0..N { |
| 58 | out[i] = super::filter_paeth_stbi(a[i].into(), b[i].into(), c[i].into()); |
| 59 | } |
| 60 | out.into() |
| 61 | } |
| 62 | |
| 63 | /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`). |
| 64 | /// See also https://www.w3.org/TR/png/#filter-byte-positions |
| 65 | #[derive(Default)] |
| 66 | struct PaethState<T, const N: usize> |
| 67 | where |
| 68 | T: SimdElement, |
| 69 | LaneCount<N>: SupportedLaneCount, |
| 70 | { |
| 71 | /// Previous pixel in the previous row. |
| 72 | c: Simd<T, N>, |
| 73 | |
| 74 | /// Previous pixel in the current row. |
| 75 | a: Simd<T, N>, |
| 76 | } |
| 77 | |
| 78 | /// Mutates `x` as needed to unfilter `FilterType::Paeth`. |
| 79 | /// |
| 80 | /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row. |
| 81 | /// See also https://www.w3.org/TR/png/#filter-byte-positions |
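///
/// Pixel layout, using the same labels as the diagram in `unfilter` further down
/// (the state advances one pixel to the right per call):
///
/// ```text
///   c  b
///   a  x
/// ```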
| 82 | fn paeth_step<const N: usize>( |
| 83 | state: &mut PaethState<i16, N>, |
| 84 | b: Simd<u8, N>, |
| 85 | x: &mut Simd<u8, N>, |
| 86 | ) where |
| 87 | LaneCount<N>: SupportedLaneCount, |
| 88 | { |
| 89 | // Storing the inputs. |
| 90 | let b = b.cast::<i16>(); |
| 91 | |
| 92 | // Calculating the new value of the current pixel. |
| 93 | let predictor = paeth_predictor(state.a, b, state.c); |
| 94 | *x += predictor.cast::<u8>(); |
| 95 | |
| 96 | // Preparing for the next step. |
| 97 | state.c = b; |
| 98 | state.a = x.cast::<i16>(); |
| 99 | } |
| 100 | |
| 101 | /// Computes the Paeth predictor without converting `u8` to `i16`. |
| 102 | /// |
| 103 | /// See `simd::paeth_step`. |
| 104 | fn paeth_step_u8<const N: usize>( |
| 105 | state: &mut PaethState<u8, N>, |
| 106 | b: Simd<u8, N>, |
| 107 | x: &mut Simd<u8, N>, |
| 108 | ) where |
| 109 | LaneCount<N>: SupportedLaneCount, |
| 110 | { |
| 111 | // Calculating the new value of the current pixel. |
| 112 | *x += paeth_predictor_u8(state.a, b, state.c); |
| 113 | |
| 114 | // Preparing for the next step. |
| 115 | state.c = b; |
| 116 | state.a = *x; |
| 117 | } |
| 118 | |
| 119 | fn load3(src: &[u8]) -> u8x4 { |
| 120 | u8x4::from_array([src[0], src[1], src[2], 0]) |
| 121 | } |
| 122 | |
| 123 | fn store3(src: u8x4, dest: &mut [u8]) { |
| 124 | dest[0..3].copy_from_slice(&src.to_array()[0..3]) |
| 125 | } |
| 126 | |
| 127 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. |
| 128 | pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { |
| 129 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
| 130 | debug_assert_eq!(prev_row.len() % 3, 0); |
| 131 | |
| 132 | let mut state = PaethState::<i16, 4>::default(); |
| 133 | while prev_row.len() >= 4 { |
| 134 | // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first |
| 135 | // byte from the next triple. This optimization technique mimics the algorithm found |
| 136 | // in |
| 137 | // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 |
| 138 | let b = u8x4::from_slice(prev_row); |
| 139 | let mut x = u8x4::from_slice(curr_row); |
| 140 | |
| 141 | paeth_step(&mut state, b, &mut x); |
| 142 | |
| 143 | // We can speculate that writing 4 bytes might be more efficient (just as with using |
| 144 | // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the |
| 145 | // first byte of the next pixel in the `curr_row`. |
| 146 | store3(x, curr_row); |
| 147 | |
| 148 | prev_row = &prev_row[3..]; |
| 149 | curr_row = &mut curr_row[3..]; |
| 150 | } |
| 151 | // Can't use `u8x4::from_slice` for the last `[u8;3]`. |
| 152 | let b = load3(prev_row); |
| 153 | let mut x = load3(curr_row); |
| 154 | paeth_step(&mut state, b, &mut x); |
| 155 | store3(x, curr_row); |
| 156 | } |
| 157 | |
| 158 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`. |
| 159 | /// |
| 160 | /// This function calculates the Paeth predictor entirely in `Simd<u8, N>` |
| 161 | /// without converting to an intermediate `Simd<i16, N>`. Doing so avoids |
| 162 | /// paying a small performance penalty converting between types. |
| 163 | pub fn unfilter_paeth_u8<const N: usize>(prev_row: &[u8], curr_row: &mut [u8]) |
| 164 | where |
| 165 | LaneCount<N>: SupportedLaneCount, |
| 166 | { |
| 167 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
| 168 | debug_assert_eq!(prev_row.len() % N, 0); |
| 169 | assert!(matches!(N, 4 | 8)); |
| 170 | |
| 171 | let mut state = PaethState::<u8, N>::default(); |
| 172 | for (prev_row, curr_row) in prev_row.chunks_exact(N).zip(curr_row.chunks_exact_mut(N)) { |
| 173 | let b = Simd::from_slice(prev_row); |
| 174 | let mut x = Simd::from_slice(curr_row); |
| 175 | |
| 176 | paeth_step_u8(&mut state, b, &mut x); |
| 177 | |
| 178 | curr_row[..N].copy_from_slice(&x.to_array()[..N]); |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | fn load6(src: &[u8]) -> u8x8 { |
| 183 | u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) |
| 184 | } |
| 185 | |
| 186 | fn store6(src: u8x8, dest: &mut [u8]) { |
| 187 | dest[0..6].copy_from_slice(&src.to_array()[0..6]) |
| 188 | } |
| 189 | |
| 190 | /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. |
| 191 | pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { |
| 192 | debug_assert_eq!(prev_row.len(), curr_row.len()); |
| 193 | debug_assert_eq!(prev_row.len() % 6, 0); |
| 194 | |
| 195 | let mut state = PaethState::<i16, 8>::default(); |
| 196 | while prev_row.len() >= 8 { |
| 197 | // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two |
| 198 | // bytes from the next pixel. This optimization technique mimics the algorithm found |
| 199 | // in |
| 200 | // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 |
| 201 | let b = u8x8::from_slice(prev_row); |
| 202 | let mut x = u8x8::from_slice(curr_row); |
| 203 | |
| 204 | paeth_step(&mut state, b, &mut x); |
| 205 | |
| 206 | // We can speculate that writing 8 bytes might be more efficient (just as with using |
| 207 | // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the |
| 208 | // first bytes of the next pixel in the `curr_row`. |
| 209 | store6(x, curr_row); |
| 210 | |
| 211 | prev_row = &prev_row[6..]; |
| 212 | curr_row = &mut curr_row[6..]; |
| 213 | } |
| 214 | // Can't use `u8x8::from_slice` for the last `[u8;6]`. |
| 215 | let b = load6(prev_row); |
| 216 | let mut x = load6(curr_row); |
| 217 | paeth_step(&mut state, b, &mut x); |
| 218 | store6(x, curr_row); |
| 219 | } |
| 220 | } |
| 221 | |
| 222 | /// The byte level filter applied to scanlines to prepare them for compression. |
| 223 | /// |
| 224 | /// Compression in general benefits from repetitive data. The filter is a content-aware method of |
| 225 | /// compressing the range of occurring byte values to help the compression algorithm. Note that |
| 226 | /// this does not operate on pixels but on raw bytes of a scanline. |
| 227 | /// |
| 228 | /// Details on how each filter works can be found in the [PNG Book](http://www.libpng.org/pub/png/book/chapter09.html). |
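///
/// As a rough per-byte sketch (the formulas are the PNG specification's, the wording is ours):
/// with `a` the unfiltered byte `bpp` positions to the left, `b` the byte above, and `c` the
/// byte above-left (all treated as zero when out of range), the encoder emits, modulo 256:
///
/// ```text
/// Sub:   out = x - a
/// Up:    out = x - b
/// Avg:   out = x - floor((a + b) / 2)
/// Paeth: out = x - PaethPredictor(a, b, c)
/// ```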
| 229 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 230 | #[repr(u8)] |
| 231 | pub enum FilterType { |
| 232 | NoFilter = 0, |
| 233 | Sub = 1, |
| 234 | Up = 2, |
| 235 | Avg = 3, |
| 236 | Paeth = 4, |
| 237 | } |
| 238 | |
| 239 | impl Default for FilterType { |
| 240 | fn default() -> Self { |
| 241 | FilterType::Sub |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | impl FilterType { |
| 246 | /// u8 -> Self. Temporary solution until Rust provides a canonical one. |
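///
/// For example (codes as assigned by PNG filter method 0):
///
/// ```text
/// FilterType::from_u8(4) == Some(FilterType::Paeth)
/// FilterType::from_u8(9) == None
/// ```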
| 247 | pub fn from_u8(n: u8) -> Option<FilterType> { |
| 248 | match n { |
| 249 | 0 => Some(FilterType::NoFilter), |
| 250 | 1 => Some(FilterType::Sub), |
| 251 | 2 => Some(FilterType::Up), |
| 252 | 3 => Some(FilterType::Avg), |
| 253 | 4 => Some(FilterType::Paeth), |
| 254 | _ => None, |
| 255 | } |
| 256 | } |
| 257 | } |
| 258 | |
| 259 | /// Adaptive filtering tries every possible filter for each row and uses a heuristic to select the best one. |
| 260 | /// This improves compression ratio, but makes encoding slightly slower. |
| 261 | /// |
| 262 | /// It is recommended to use `Adaptive` whenever you care about compression ratio. |
| 263 | /// Filtering is quite cheap compared to other parts of encoding, but can contribute |
| 264 | /// to the compression ratio significantly. |
| 265 | /// |
| 266 | /// `NonAdaptive` filtering is the default. |
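///
/// A minimal usage sketch (hedged: it assumes the crate's public `Encoder` with its
/// `set_adaptive_filter` setter; `writer`, `width` and `height` are placeholders):
///
/// ```ignore
/// let mut encoder = png::Encoder::new(writer, width, height);
/// encoder.set_adaptive_filter(png::AdaptiveFilterType::Adaptive);
/// ```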
| 267 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 268 | #[repr(u8)] |
| 269 | pub enum AdaptiveFilterType { |
| 270 | Adaptive, |
| 271 | NonAdaptive, |
| 272 | } |
| 273 | |
| 274 | impl Default for AdaptiveFilterType { |
| 275 | fn default() -> Self { |
| 276 | AdaptiveFilterType::NonAdaptive |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { |
| 281 | // On ARM this algorithm performs much better than the stb-derived `filter_paeth_stbi` below, |
| 282 | // and it is the better-studied algorithm we've always used here, |
| 283 | // so we default to it on all non-x86 platforms. |
| 284 | let pa: i16 = (i16::from(b) - i16::from(c)).abs(); |
| 285 | let pb: i16 = (i16::from(a) - i16::from(c)).abs(); |
| 286 | let pc: i16 = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs(); |
| 287 | |
| 288 | let mut out: u8 = a; |
| 289 | let mut min: i16 = pa; |
| 290 | |
| 291 | if pb < min { |
| 292 | min = pb; |
| 293 | out = b; |
| 294 | } |
| 295 | if pc < min { |
| 296 | out = c; |
| 297 | } |
| 298 | |
| 299 | out |
| 300 | } |
| 301 | |
| 302 | fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 { |
| 303 | // Decoding optimizes better with this algorithm than with `filter_paeth` |
| 304 | // |
| 305 | // This formulation looks very different from the reference in the PNG spec, but is |
| 306 | // actually equivalent and has favorable data dependencies and admits straightforward |
| 307 | // generation of branch-free code, which helps performance significantly. |
| 308 | // |
| 309 | // Adapted from public domain PNG implementation: |
| 310 | // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668 |
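//
// A worked example with values chosen purely for illustration: for (a, b, c) = (10, 20, 25)
// the reference predictor yields `a` (p = 5, pa = 5, pb = 15, pc = 20), and so does this
// formulation: thresh = 75 - 30 = 45, lo = 10, hi = 20, hence t0 = lo = 10 and t1 = t0 = 10.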
| 311 | let thresh: i16 = i16::from(c) * 3 - (i16::from(a) + i16::from(b)); |
| 312 | let lo: u8 = a.min(b); |
| 313 | let hi: u8 = a.max(b); |
| 314 | let t0: u8 = if hi as i16 <= thresh { lo } else { c }; |
| 315 | let t1: u8 = if thresh <= lo as i16 { hi } else { t0 }; |
| 316 | return t1; |
| 317 | } |
| 318 | |
| 319 | #[cfg(any(test, all(feature = "unstable", target_arch = "x86_64")))] |
| 320 | fn filter_paeth_stbi_i16(a: i16, b: i16, c: i16) -> i16 { |
| 321 | // Like `filter_paeth_stbi` but vectorizes better when wrapped in SIMD types. |
| 322 | // Used for bpp=3 and bpp=6 |
| 323 | let thresh = c * 3 - (a + b); |
| 324 | let lo = a.min(b); |
| 325 | let hi = a.max(b); |
| 326 | let t0 = if hi <= thresh { lo } else { c }; |
| 327 | let t1 = if thresh <= lo { hi } else { t0 }; |
| 328 | return t1; |
| 329 | } |
| 330 | |
| 331 | fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 { |
| 332 | // This is an optimized version of the paeth filter from the PNG specification, proposed by |
| 333 | // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates |
| 334 | // entirely on unsigned 8-bit quantities, making it more conducive to vectorization. |
| 335 | // |
| 336 | // p = a + b - c |
| 337 | // pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c) |
| 338 | // pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c) |
| 339 | // pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ... |
| 340 | // |
| 341 | // Further optimizing the calculation of `pc` is a bit trickier. However, notice that: |
| 342 | // |
| 343 | // a > c && b > c |
| 344 | // ==> (a - c) > 0 && (b - c) > 0 |
| 345 | // ==> pc > (a - c) && pc > (b - c) |
| 346 | // ==> pc > |a - c| && pc > |b - c| |
| 347 | // ==> pc > pb && pc > pa |
| 348 | // |
| 349 | // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar |
| 350 | // reasoning applies if `c` is larger than the other two inputs. Assuming instead that `c` |
| 351 | // lies between `a` and `b` (that is, `a <= c <= b` or `b <= c <= a`): |
| 352 | // |
| 353 | // pc = ||b - c| - |a - c|| = |pa - pb| = max(pa, pb) - min(pa, pb) |
| 354 | // |
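// For illustration, with (a, b, c) = (10, 20, 25): pa = 25 - 20 = 5 and pb = 25 - 10 = 15;
// `c` is not between `a` and `b`, so `pc` is forced to 255 and cannot win. `pa` is smallest,
// so `a` is returned, matching the reference predictor.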
| 355 | let pa = b.max(c) - c.min(b); |
| 356 | let pb = a.max(c) - c.min(a); |
| 357 | let pc = if (a < c) == (c < b) { |
| 358 | pa.max(pb) - pa.min(pb) |
| 359 | } else { |
| 360 | 255 |
| 361 | }; |
| 362 | |
| 363 | if pa <= pb && pa <= pc { |
| 364 | a |
| 365 | } else if pb <= pc { |
| 366 | b |
| 367 | } else { |
| 368 | c |
| 369 | } |
| 370 | } |
| 371 | |
| 372 | pub(crate) fn unfilter( |
| 373 | mut filter: FilterType, |
| 374 | tbpp: BytesPerPixel, |
| 375 | previous: &[u8], |
| 376 | current: &mut [u8], |
| 377 | ) { |
| 378 | use self::FilterType::*; |
| 379 | |
| 380 | // If the previous row is empty, then treat it as if it were filled with zeros. |
| 381 | if previous.is_empty() { |
| 382 | if filter == Paeth { |
| 383 | filter = Sub; |
| 384 | } else if filter == Up { |
| 385 | filter = NoFilter; |
| 386 | } |
| 387 | } |
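// (With `b` and `c` both zero the Paeth predictor degenerates to `a`, so Paeth becomes exactly
// Sub; Up adds zero to every byte, so it becomes NoFilter. `Avg` is handled by the dedicated
// `Avg if previous.is_empty()` arm further down.)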
| 388 | |
| 389 | // [2023/01 @okaneco] - Notes on optimizing decoding filters |
| 390 | // |
| 391 | // Links: |
| 392 | // [PR]: https://github.com/image-rs/image-png/pull/382 |
| 393 | // [SWAR]: http://aggregate.org/SWAR/over.html |
| 394 | // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers |
| 395 | // |
| 396 | // #382 heavily refactored and optimized the following filters making the |
| 397 | // implementation nonobvious. These comments function as a summary of that |
| 398 | // PR with an explanation of the choices made below. |
| 399 | // |
| 400 | // #382 originally started with trying to optimize using a technique called |
| 401 | // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32` |
| 402 | // and `u64` as SIMD registers to perform vertical operations in parallel, |
| 403 | // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp) |
| 404 | // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp |
| 405 | // in a `u64`. The `Sub` filter looked like the following code block; `Avg` |
| 406 | // was similar but used a bitwise average method from [AVG]: |
| 407 | // ``` |
| 408 | // // See "Unpartitioned Operations With Correction Code" from [SWAR] |
| 409 | // fn swar_add_u32(x: u32, y: u32) -> u32 { |
| 410 | // // 7-bit addition so there's no carry over the most significant bit |
| 411 | // let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111 |
| 412 | // // 1-bit parity/XOR addition to fill in the missing MSB |
| 413 | // n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000 |
| 414 | // } |
| 415 | // |
| 416 | // let mut prev = |
| 417 | // u32::from_ne_bytes([current[0], current[1], current[2], current[3]]); |
| 418 | // for chunk in current[4..].chunks_exact_mut(4) { |
| 419 | // let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]); |
| 420 | // let new_chunk = swar_add_u32(cur, prev); |
| 421 | // chunk.copy_from_slice(&new_chunk.to_ne_bytes()); |
| 422 | // prev = new_chunk; |
| 423 | // } |
| 424 | // ``` |
| 425 | // While this provided a measurable increase, @fintelia found that this idea |
| 426 | // could be taken even further by unrolling the chunks component-wise and |
| 427 | // avoiding unnecessary byte-shuffling by using byte arrays instead of |
| 428 | // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary |
| 429 | // so they were reverted to their obvious arithmetic equivalent. Lastly, |
| 430 | // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now |
| 431 | // looked like this (with asserts to remove `0..bpp` bounds checks): |
| 432 | // ``` |
| 433 | // assert!(len > 3); |
| 434 | // let mut prev = [current[0], current[1], current[2], current[3]]; |
| 435 | // for chunk in current[4..].chunks_exact_mut(4) { |
| 436 | // let new_chunk = [ |
| 437 | // chunk[0].wrapping_add(prev[0]), |
| 438 | // chunk[1].wrapping_add(prev[1]), |
| 439 | // chunk[2].wrapping_add(prev[2]), |
| 440 | // chunk[3].wrapping_add(prev[3]), |
| 441 | // ]; |
| 442 | // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 443 | // prev = new_chunk; |
| 444 | // } |
| 445 | // ``` |
| 446 | // The compiler was able to optimize the code to be even faster and this |
| 447 | // method even sped up Paeth filtering! Assertions were experimentally |
| 448 | // added within loop bodies which produced better instructions but no |
| 449 | // difference in speed. Finally, the code was refactored to remove manual |
| 450 | // slicing and start the previous pixel chunks with arrays of `[0; N]`. |
| 451 | // ``` |
| 452 | // let mut prev = [0; 4]; |
| 453 | // for chunk in current.chunks_exact_mut(4) { |
| 454 | // let new_chunk = [ |
| 455 | // chunk[0].wrapping_add(prev[0]), |
| 456 | // chunk[1].wrapping_add(prev[1]), |
| 457 | // chunk[2].wrapping_add(prev[2]), |
| 458 | // chunk[3].wrapping_add(prev[3]), |
| 459 | // ]; |
| 460 | // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 461 | // prev = new_chunk; |
| 462 | // } |
| 463 | // ``` |
| 464 | // While we're not manually bit-twiddling anymore, a possible takeaway from |
| 465 | // this is to "think in SWAR" when dealing with small byte arrays. Unrolling |
| 466 | // array operations and performing them component-wise may unlock previously |
| 467 | // unavailable optimizations from the compiler, even when using the |
| 468 | // `chunks_exact` methods for their potential auto-vectorization benefits. |
| 469 | match filter { |
| 470 | NoFilter => {} |
| 471 | Sub => match tbpp { |
| 472 | BytesPerPixel::One => { |
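// `reduce` is used as an in-place running sum: each byte becomes itself plus the
// already-unfiltered byte to its left, which is exactly what undoing `Sub` at 1 bpp needs.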
| 473 | current.iter_mut().reduce(|&mut prev, curr| { |
| 474 | *curr = curr.wrapping_add(prev); |
| 475 | curr |
| 476 | }); |
| 477 | } |
| 478 | BytesPerPixel::Two => { |
| 479 | let mut prev = [0; 2]; |
| 480 | for chunk in current.chunks_exact_mut(2) { |
| 481 | let new_chunk = [ |
| 482 | chunk[0].wrapping_add(prev[0]), |
| 483 | chunk[1].wrapping_add(prev[1]), |
| 484 | ]; |
| 485 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 486 | prev = new_chunk; |
| 487 | } |
| 488 | } |
| 489 | BytesPerPixel::Three => { |
| 490 | let mut prev = [0; 3]; |
| 491 | for chunk in current.chunks_exact_mut(3) { |
| 492 | let new_chunk = [ |
| 493 | chunk[0].wrapping_add(prev[0]), |
| 494 | chunk[1].wrapping_add(prev[1]), |
| 495 | chunk[2].wrapping_add(prev[2]), |
| 496 | ]; |
| 497 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 498 | prev = new_chunk; |
| 499 | } |
| 500 | } |
| 501 | BytesPerPixel::Four => { |
| 502 | let mut prev = [0; 4]; |
| 503 | for chunk in current.chunks_exact_mut(4) { |
| 504 | let new_chunk = [ |
| 505 | chunk[0].wrapping_add(prev[0]), |
| 506 | chunk[1].wrapping_add(prev[1]), |
| 507 | chunk[2].wrapping_add(prev[2]), |
| 508 | chunk[3].wrapping_add(prev[3]), |
| 509 | ]; |
| 510 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 511 | prev = new_chunk; |
| 512 | } |
| 513 | } |
| 514 | BytesPerPixel::Six => { |
| 515 | let mut prev = [0; 6]; |
| 516 | for chunk in current.chunks_exact_mut(6) { |
| 517 | let new_chunk = [ |
| 518 | chunk[0].wrapping_add(prev[0]), |
| 519 | chunk[1].wrapping_add(prev[1]), |
| 520 | chunk[2].wrapping_add(prev[2]), |
| 521 | chunk[3].wrapping_add(prev[3]), |
| 522 | chunk[4].wrapping_add(prev[4]), |
| 523 | chunk[5].wrapping_add(prev[5]), |
| 524 | ]; |
| 525 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 526 | prev = new_chunk; |
| 527 | } |
| 528 | } |
| 529 | BytesPerPixel::Eight => { |
| 530 | let mut prev = [0; 8]; |
| 531 | for chunk in current.chunks_exact_mut(8) { |
| 532 | let new_chunk = [ |
| 533 | chunk[0].wrapping_add(prev[0]), |
| 534 | chunk[1].wrapping_add(prev[1]), |
| 535 | chunk[2].wrapping_add(prev[2]), |
| 536 | chunk[3].wrapping_add(prev[3]), |
| 537 | chunk[4].wrapping_add(prev[4]), |
| 538 | chunk[5].wrapping_add(prev[5]), |
| 539 | chunk[6].wrapping_add(prev[6]), |
| 540 | chunk[7].wrapping_add(prev[7]), |
| 541 | ]; |
| 542 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 543 | prev = new_chunk; |
| 544 | } |
| 545 | } |
| 546 | }, |
| 547 | Up => { |
| 548 | for (curr, &above) in current.iter_mut().zip(previous) { |
| 549 | *curr = curr.wrapping_add(above); |
| 550 | } |
| 551 | } |
| 552 | Avg if previous.is_empty() => match tbpp { |
| 553 | BytesPerPixel::One => { |
| 554 | current.iter_mut().reduce(|&mut prev, curr| { |
| 555 | *curr = curr.wrapping_add(prev / 2); |
| 556 | curr |
| 557 | }); |
| 558 | } |
| 559 | BytesPerPixel::Two => { |
| 560 | let mut prev = [0; 2]; |
| 561 | for chunk in current.chunks_exact_mut(2) { |
| 562 | let new_chunk = [ |
| 563 | chunk[0].wrapping_add(prev[0] / 2), |
| 564 | chunk[1].wrapping_add(prev[1] / 2), |
| 565 | ]; |
| 566 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 567 | prev = new_chunk; |
| 568 | } |
| 569 | } |
| 570 | BytesPerPixel::Three => { |
| 571 | let mut prev = [0; 3]; |
| 572 | for chunk in current.chunks_exact_mut(3) { |
| 573 | let new_chunk = [ |
| 574 | chunk[0].wrapping_add(prev[0] / 2), |
| 575 | chunk[1].wrapping_add(prev[1] / 2), |
| 576 | chunk[2].wrapping_add(prev[2] / 2), |
| 577 | ]; |
| 578 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 579 | prev = new_chunk; |
| 580 | } |
| 581 | } |
| 582 | BytesPerPixel::Four => { |
| 583 | let mut prev = [0; 4]; |
| 584 | for chunk in current.chunks_exact_mut(4) { |
| 585 | let new_chunk = [ |
| 586 | chunk[0].wrapping_add(prev[0] / 2), |
| 587 | chunk[1].wrapping_add(prev[1] / 2), |
| 588 | chunk[2].wrapping_add(prev[2] / 2), |
| 589 | chunk[3].wrapping_add(prev[3] / 2), |
| 590 | ]; |
| 591 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 592 | prev = new_chunk; |
| 593 | } |
| 594 | } |
| 595 | BytesPerPixel::Six => { |
| 596 | let mut prev = [0; 6]; |
| 597 | for chunk in current.chunks_exact_mut(6) { |
| 598 | let new_chunk = [ |
| 599 | chunk[0].wrapping_add(prev[0] / 2), |
| 600 | chunk[1].wrapping_add(prev[1] / 2), |
| 601 | chunk[2].wrapping_add(prev[2] / 2), |
| 602 | chunk[3].wrapping_add(prev[3] / 2), |
| 603 | chunk[4].wrapping_add(prev[4] / 2), |
| 604 | chunk[5].wrapping_add(prev[5] / 2), |
| 605 | ]; |
| 606 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 607 | prev = new_chunk; |
| 608 | } |
| 609 | } |
| 610 | BytesPerPixel::Eight => { |
| 611 | let mut prev = [0; 8]; |
| 612 | for chunk in current.chunks_exact_mut(8) { |
| 613 | let new_chunk = [ |
| 614 | chunk[0].wrapping_add(prev[0] / 2), |
| 615 | chunk[1].wrapping_add(prev[1] / 2), |
| 616 | chunk[2].wrapping_add(prev[2] / 2), |
| 617 | chunk[3].wrapping_add(prev[3] / 2), |
| 618 | chunk[4].wrapping_add(prev[4] / 2), |
| 619 | chunk[5].wrapping_add(prev[5] / 2), |
| 620 | chunk[6].wrapping_add(prev[6] / 2), |
| 621 | chunk[7].wrapping_add(prev[7] / 2), |
| 622 | ]; |
| 623 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 624 | prev = new_chunk; |
| 625 | } |
| 626 | } |
| 627 | }, |
| 628 | Avg => match tbpp { |
| 629 | BytesPerPixel::One => { |
| 630 | let mut lprev = [0; 1]; |
| 631 | for (chunk, above) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) { |
| 632 | let new_chunk = |
| 633 | [chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8)]; |
| 634 | *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk; |
| 635 | lprev = new_chunk; |
| 636 | } |
| 637 | } |
| 638 | BytesPerPixel::Two => { |
| 639 | let mut lprev = [0; 2]; |
| 640 | for (chunk, above) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) { |
| 641 | let new_chunk = [ |
| 642 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 643 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 644 | ]; |
| 645 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 646 | lprev = new_chunk; |
| 647 | } |
| 648 | } |
| 649 | BytesPerPixel::Three => { |
| 650 | let mut lprev = [0; 3]; |
| 651 | for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) { |
| 652 | let new_chunk = [ |
| 653 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 654 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 655 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 656 | ]; |
| 657 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 658 | lprev = new_chunk; |
| 659 | } |
| 660 | } |
| 661 | BytesPerPixel::Four => { |
| 662 | let mut lprev = [0; 4]; |
| 663 | for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) { |
| 664 | let new_chunk = [ |
| 665 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 666 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 667 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 668 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
| 669 | ]; |
| 670 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 671 | lprev = new_chunk; |
| 672 | } |
| 673 | } |
| 674 | BytesPerPixel::Six => { |
| 675 | let mut lprev = [0; 6]; |
| 676 | for (chunk, above) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) { |
| 677 | let new_chunk = [ |
| 678 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 679 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 680 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 681 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
| 682 | chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8), |
| 683 | chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8), |
| 684 | ]; |
| 685 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 686 | lprev = new_chunk; |
| 687 | } |
| 688 | } |
| 689 | BytesPerPixel::Eight => { |
| 690 | let mut lprev = [0; 8]; |
| 691 | for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) { |
| 692 | let new_chunk = [ |
| 693 | chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8), |
| 694 | chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8), |
| 695 | chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8), |
| 696 | chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8), |
| 697 | chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8), |
| 698 | chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8), |
| 699 | chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8), |
| 700 | chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8), |
| 701 | ]; |
| 702 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 703 | lprev = new_chunk; |
| 704 | } |
| 705 | } |
| 706 | }, |
| 707 | #[allow(unreachable_code)] |
| 708 | Paeth => { |
| 709 | // Select the fastest Paeth filter implementation based on the target architecture. |
| 710 | let filter_paeth_decode = if cfg!(target_arch = "x86_64") { |
| 711 | filter_paeth_stbi |
| 712 | } else { |
| 713 | filter_paeth |
| 714 | }; |
| 715 | |
| 716 | // Paeth filter pixels: |
| 717 | // C B D |
| 718 | // A X |
| 719 | match tbpp { |
| 720 | BytesPerPixel::One => { |
| 721 | let mut a_bpp = [0; 1]; |
| 722 | let mut c_bpp = [0; 1]; |
| 723 | for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1)) |
| 724 | { |
| 725 | let new_chunk = [chunk[0] |
| 726 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))]; |
| 727 | *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk; |
| 728 | a_bpp = new_chunk; |
| 729 | c_bpp = b_bpp.try_into().unwrap(); |
| 730 | } |
| 731 | } |
| 732 | BytesPerPixel::Two => { |
| 733 | let mut a_bpp = [0; 2]; |
| 734 | let mut c_bpp = [0; 2]; |
| 735 | for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2)) |
| 736 | { |
| 737 | let new_chunk = [ |
| 738 | chunk[0] |
| 739 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 740 | chunk[1] |
| 741 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 742 | ]; |
| 743 | *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; |
| 744 | a_bpp = new_chunk; |
| 745 | c_bpp = b_bpp.try_into().unwrap(); |
| 746 | } |
| 747 | } |
| 748 | BytesPerPixel::Three => { |
| 749 | // Do not enable this algorithm on ARM; that would be a big performance hit. |
| 750 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 751 | { |
| 752 | simd::unfilter_paeth3(previous, current); |
| 753 | return; |
| 754 | } |
| 755 | |
| 756 | let mut a_bpp = [0; 3]; |
| 757 | let mut c_bpp = [0; 3]; |
| 758 | for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) |
| 759 | { |
| 760 | let new_chunk = [ |
| 761 | chunk[0] |
| 762 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 763 | chunk[1] |
| 764 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 765 | chunk[2] |
| 766 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 767 | ]; |
| 768 | *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; |
| 769 | a_bpp = new_chunk; |
| 770 | c_bpp = b_bpp.try_into().unwrap(); |
| 771 | } |
| 772 | } |
| 773 | BytesPerPixel::Four => { |
| 774 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 775 | { |
| 776 | simd::unfilter_paeth_u8::<4>(previous, current); |
| 777 | return; |
| 778 | } |
| 779 | |
| 780 | let mut a_bpp = [0; 4]; |
| 781 | let mut c_bpp = [0; 4]; |
| 782 | for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) |
| 783 | { |
| 784 | let new_chunk = [ |
| 785 | chunk[0] |
| 786 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 787 | chunk[1] |
| 788 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 789 | chunk[2] |
| 790 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 791 | chunk[3] |
| 792 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
| 793 | ]; |
| 794 | *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; |
| 795 | a_bpp = new_chunk; |
| 796 | c_bpp = b_bpp.try_into().unwrap(); |
| 797 | } |
| 798 | } |
| 799 | BytesPerPixel::Six => { |
| 800 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 801 | { |
| 802 | simd::unfilter_paeth6(previous, current); |
| 803 | return; |
| 804 | } |
| 805 | |
| 806 | let mut a_bpp = [0; 6]; |
| 807 | let mut c_bpp = [0; 6]; |
| 808 | for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) |
| 809 | { |
| 810 | let new_chunk = [ |
| 811 | chunk[0] |
| 812 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 813 | chunk[1] |
| 814 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 815 | chunk[2] |
| 816 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 817 | chunk[3] |
| 818 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
| 819 | chunk[4] |
| 820 | .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), |
| 821 | chunk[5] |
| 822 | .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), |
| 823 | ]; |
| 824 | *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; |
| 825 | a_bpp = new_chunk; |
| 826 | c_bpp = b_bpp.try_into().unwrap(); |
| 827 | } |
| 828 | } |
| 829 | BytesPerPixel::Eight => { |
| 830 | #[cfg(all(feature = "unstable", target_arch = "x86_64"))] |
| 831 | { |
| 832 | simd::unfilter_paeth_u8::<8>(previous, current); |
| 833 | return; |
| 834 | } |
| 835 | |
| 836 | let mut a_bpp = [0; 8]; |
| 837 | let mut c_bpp = [0; 8]; |
| 838 | for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) |
| 839 | { |
| 840 | let new_chunk = [ |
| 841 | chunk[0] |
| 842 | .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), |
| 843 | chunk[1] |
| 844 | .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), |
| 845 | chunk[2] |
| 846 | .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), |
| 847 | chunk[3] |
| 848 | .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), |
| 849 | chunk[4] |
| 850 | .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), |
| 851 | chunk[5] |
| 852 | .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), |
| 853 | chunk[6] |
| 854 | .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])), |
| 855 | chunk[7] |
| 856 | .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])), |
| 857 | ]; |
| 858 | *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; |
| 859 | a_bpp = new_chunk; |
| 860 | c_bpp = b_bpp.try_into().unwrap(); |
| 861 | } |
| 862 | } |
| 863 | } |
| 864 | } |
| 865 | } |
| 866 | } |
| 867 | |
| 868 | fn filter_internal( |
| 869 | method: FilterType, |
| 870 | bpp: usize, |
| 871 | len: usize, |
| 872 | previous: &[u8], |
| 873 | current: &[u8], |
| 874 | output: &mut [u8], |
| 875 | ) -> FilterType { |
| 876 | use self::FilterType::*; |
| 877 | |
| 878 | // This value was chosen experimentally based on what achieved the best performance. The |
| 879 | // Rust compiler does auto-vectorization, and 32 bytes per loop iteration seems to enable |
| 880 | // the fastest code when doing so. |
| 881 | const CHUNK_SIZE: usize = 32; |
| 882 | |
| 883 | match method { |
| 884 | NoFilter => { |
| 885 | output.copy_from_slice(current); |
| 886 | NoFilter |
| 887 | } |
| 888 | Sub => { |
| 889 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
| 890 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
| 891 | let mut prev_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
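// `prev_chunks` walks the same row shifted back by `bpp`, so each output byte at index `i`
// (for `i >= bpp`) is `current[i] - current[i - bpp]`, i.e. the byte minus the corresponding
// byte of the pixel to its left.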
| 892 | |
| 893 | for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) { |
| 894 | for i in 0..CHUNK_SIZE { |
| 895 | out[i] = cur[i].wrapping_sub(prev[i]); |
| 896 | } |
| 897 | } |
| 898 | |
| 899 | for ((out, cur), &prev) in out_chunks |
| 900 | .into_remainder() |
| 901 | .iter_mut() |
| 902 | .zip(cur_chunks.remainder()) |
| 903 | .zip(prev_chunks.remainder()) |
| 904 | { |
| 905 | *out = cur.wrapping_sub(prev); |
| 906 | } |
| 907 | |
| 908 | output[..bpp].copy_from_slice(¤t[..bpp]); |
| 909 | Sub |
| 910 | } |
| 911 | Up => { |
| 912 | let mut out_chunks = output.chunks_exact_mut(CHUNK_SIZE); |
| 913 | let mut cur_chunks = current.chunks_exact(CHUNK_SIZE); |
| 914 | let mut prev_chunks = previous.chunks_exact(CHUNK_SIZE); |
| 915 | |
| 916 | for ((out, cur), prev) in (&mut out_chunks).zip(&mut cur_chunks).zip(&mut prev_chunks) { |
| 917 | for i in 0..CHUNK_SIZE { |
| 918 | out[i] = cur[i].wrapping_sub(prev[i]); |
| 919 | } |
| 920 | } |
| 921 | |
| 922 | for ((out, cur), &prev) in out_chunks |
| 923 | .into_remainder() |
| 924 | .iter_mut() |
| 925 | .zip(cur_chunks.remainder()) |
| 926 | .zip(prev_chunks.remainder()) |
| 927 | { |
| 928 | *out = cur.wrapping_sub(prev); |
| 929 | } |
| 930 | Up |
| 931 | } |
| 932 | Avg => { |
| 933 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
| 934 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
| 935 | let mut cur_minus_bpp_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
| 936 | let mut prev_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE); |
| 937 | |
| 938 | for (((out, cur), cur_minus_bpp), prev) in (&mut out_chunks) |
| 939 | .zip(&mut cur_chunks) |
| 940 | .zip(&mut cur_minus_bpp_chunks) |
| 941 | .zip(&mut prev_chunks) |
| 942 | { |
| 943 | for i in 0..CHUNK_SIZE { |
| 944 | // Bitwise average of two integers without overflow and |
| 945 | // without converting to a wider bit-width. See: |
| 946 | // http://aggregate.org/MAGIC/#Average%20of%20Integers |
| 947 | // If this is unrolled by component, consider reverting to |
| 948 | // `((cur_minus_bpp[i] as u16 + prev[i] as u16) / 2) as u8` |
| 949 | out[i] = cur[i].wrapping_sub( |
| 950 | (cur_minus_bpp[i] & prev[i]) + ((cur_minus_bpp[i] ^ prev[i]) >> 1), |
| 951 | ); |
| 952 | } |
| 953 | } |
| 954 | |
| 955 | for (((out, cur), &cur_minus_bpp), &prev) in out_chunks |
| 956 | .into_remainder() |
| 957 | .iter_mut() |
| 958 | .zip(cur_chunks.remainder()) |
| 959 | .zip(cur_minus_bpp_chunks.remainder()) |
| 960 | .zip(prev_chunks.remainder()) |
| 961 | { |
| 962 | *out = cur.wrapping_sub((cur_minus_bpp & prev) + ((cur_minus_bpp ^ prev) >> 1)); |
| 963 | } |
| 964 | |
| 965 | for i in 0..bpp { |
| 966 | output[i] = current[i].wrapping_sub(previous[i] / 2); |
| 967 | } |
| 968 | Avg |
| 969 | } |
| 970 | Paeth => { |
| 971 | let mut out_chunks = output[bpp..].chunks_exact_mut(CHUNK_SIZE); |
| 972 | let mut cur_chunks = current[bpp..].chunks_exact(CHUNK_SIZE); |
| 973 | let mut a_chunks = current[..len - bpp].chunks_exact(CHUNK_SIZE); |
| 974 | let mut b_chunks = previous[bpp..].chunks_exact(CHUNK_SIZE); |
| 975 | let mut c_chunks = previous[..len - bpp].chunks_exact(CHUNK_SIZE); |
| 976 | |
| 977 | for ((((out, cur), a), b), c) in (&mut out_chunks) |
| 978 | .zip(&mut cur_chunks) |
| 979 | .zip(&mut a_chunks) |
| 980 | .zip(&mut b_chunks) |
| 981 | .zip(&mut c_chunks) |
| 982 | { |
| 983 | for i in 0..CHUNK_SIZE { |
| 984 | out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i])); |
| 985 | } |
| 986 | } |
| 987 | |
| 988 | for ((((out, cur), &a), &b), &c) in out_chunks |
| 989 | .into_remainder() |
| 990 | .iter_mut() |
| 991 | .zip(cur_chunks.remainder()) |
| 992 | .zip(a_chunks.remainder()) |
| 993 | .zip(b_chunks.remainder()) |
| 994 | .zip(c_chunks.remainder()) |
| 995 | { |
| 996 | *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c)); |
| 997 | } |
| 998 | |
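// For the first `bpp` bytes there is no pixel to the left, so `a == c == 0` and the
// Paeth predictor reduces to the byte directly above, `previous[i]`.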
| 999 | for i in 0..bpp { |
| 1000 | output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0)); |
| 1001 | } |
| 1002 | Paeth |
| 1003 | } |
| 1004 | } |
| 1005 | } |
| 1006 | |
| 1007 | pub(crate) fn filter( |
| 1008 | method: FilterType, |
| 1009 | adaptive: AdaptiveFilterType, |
| 1010 | bpp: BytesPerPixel, |
| 1011 | previous: &[u8], |
| 1012 | current: &[u8], |
| 1013 | output: &mut [u8], |
| 1014 | ) -> FilterType { |
| 1015 | use FilterType::*; |
| 1016 | let bpp = bpp.into_usize(); |
| 1017 | let len = current.len(); |
| 1018 | |
| 1019 | match adaptive { |
| 1020 | AdaptiveFilterType::NonAdaptive => { |
| 1021 | filter_internal(method, bpp, len, previous, current, output) |
| 1022 | } |
| 1023 | AdaptiveFilterType::Adaptive => { |
| 1024 | let mut min_sum: u64 = u64::MAX; |
| 1025 | let mut filter_choice = FilterType::NoFilter; |
| 1026 | for &filter in [Sub, Up, Avg, Paeth].iter() { |
| 1027 | filter_internal(filter, bpp, len, previous, current, output); |
| 1028 | let sum = sum_buffer(output); |
| 1029 | if sum <= min_sum { |
| 1030 | min_sum = sum; |
| 1031 | filter_choice = filter; |
| 1032 | } |
| 1033 | } |
| 1034 | |
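// `Paeth` is tried last, so if it won, `output` already holds its bytes; any other
// winner was overwritten by a later attempt and has to be recomputed.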
| 1035 | if filter_choice != Paeth { |
| 1036 | filter_internal(filter_choice, bpp, len, previous, current, output); |
| 1037 | } |
| 1038 | filter_choice |
| 1039 | } |
| 1040 | } |
| 1041 | } |
| 1042 | |
| 1043 | // Helper function for Adaptive filter buffer summation |
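//
// This is the usual "minimum sum of absolute differences" heuristic: each byte is
// reinterpreted as `i8` and its absolute value accumulated, so rows whose filtered
// residuals cluster around zero (e.g. 0x01 or 0xFF = -1) score low and are preferred.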
| 1044 | fn sum_buffer(buf: &[u8]) -> u64 { |
| 1045 | const CHUNK_SIZE: usize = 32; |
| 1046 | |
| 1047 | let mut buf_chunks = buf.chunks_exact(CHUNK_SIZE); |
| 1048 | let mut sum: u64 = 0_u64; |
| 1049 | |
| 1050 | for chunk in &mut buf_chunks { |
| 1051 | // At most, `acc` can be `32 * (i8::MIN as u8) = 32 * 128 = 4096`. |
| 1052 | let mut acc: u64 = 0; |
| 1053 | for &b in chunk { |
| 1054 | acc += u64::from((b as i8).unsigned_abs()); |
| 1055 | } |
| 1056 | sum = sum.saturating_add(acc); |
| 1057 | } |
| 1058 | |
| 1059 | let mut acc: u64 = 0; |
| 1060 | for &b in buf_chunks.remainder() { |
| 1061 | acc += u64::from((b as i8).unsigned_abs()); |
| 1062 | } |
| 1063 | |
| 1064 | sum.saturating_add(acc) |
| 1065 | } |
| 1066 | |
| 1067 | #[cfg(test)] |
| 1068 | mod test { |
| 1069 | use super::*; |
| 1070 | use core::iter; |
| 1071 | |
| 1072 | #[test] |
| 1073 | fn roundtrip() { |
| 1074 | // A multiple of 8, 6, 4, 3, 2, 1 |
| 1075 | const LEN: u8 = 240; |
| 1076 | let previous: Vec<_> = iter::repeat(1).take(LEN.into()).collect(); |
| 1077 | let current: Vec<_> = (0..LEN).collect(); |
| 1078 | let expected = current.clone(); |
| 1079 | let adaptive = AdaptiveFilterType::NonAdaptive; |
| 1080 | |
| 1081 | let roundtrip = |kind, bpp: BytesPerPixel| { |
| 1082 | let mut output = vec![0; LEN.into()]; |
| 1083 | filter(kind, adaptive, bpp, &previous, ¤t, &mut output); |
| 1084 | unfilter(kind, bpp, &previous, &mut output); |
| 1085 | assert_eq!( |
| 1086 | output, expected, |
| 1087 | "Filtering {:?} with {:?} does not roundtrip" , |
| 1088 | bpp, kind |
| 1089 | ); |
| 1090 | }; |
| 1091 | |
| 1092 | let filters = [ |
| 1093 | FilterType::NoFilter, |
| 1094 | FilterType::Sub, |
| 1095 | FilterType::Up, |
| 1096 | FilterType::Avg, |
| 1097 | FilterType::Paeth, |
| 1098 | ]; |
| 1099 | |
| 1100 | let bpps = [ |
| 1101 | BytesPerPixel::One, |
| 1102 | BytesPerPixel::Two, |
| 1103 | BytesPerPixel::Three, |
| 1104 | BytesPerPixel::Four, |
| 1105 | BytesPerPixel::Six, |
| 1106 | BytesPerPixel::Eight, |
| 1107 | ]; |
| 1108 | |
| 1109 | for &filter in filters.iter() { |
| 1110 | for &bpp in bpps.iter() { |
| 1111 | roundtrip(filter, bpp); |
| 1112 | } |
| 1113 | } |
| 1114 | } |
| 1115 | |
| 1116 | #[test] |
| 1117 | #[ignore] // takes ~20s without optimizations |
| 1118 | fn paeth_impls_are_equivalent() { |
| 1119 | for a in 0..=255 { |
| 1120 | for b in 0..=255 { |
| 1121 | for c in 0..=255 { |
| 1122 | let baseline = filter_paeth(a, b, c); |
| 1123 | let fpnge = filter_paeth_fpnge(a, b, c); |
| 1124 | let stbi = filter_paeth_stbi(a, b, c); |
| 1125 | let stbi_i16 = filter_paeth_stbi_i16(a as i16, b as i16, c as i16); |
| 1126 | |
| 1127 | assert_eq!(baseline, fpnge); |
| 1128 | assert_eq!(baseline, stbi); |
| 1129 | assert_eq!(baseline as i16, stbi_i16); |
| 1130 | } |
| 1131 | } |
| 1132 | } |
| 1133 | } |
| 1134 | |
| 1135 | #[test] |
| 1136 | fn roundtrip_ascending_previous_line() { |
| 1137 | // A multiple of 8, 6, 4, 3, 2, 1 |
| 1138 | const LEN: u8 = 240; |
| 1139 | let previous: Vec<_> = (0..LEN).collect(); |
| 1140 | let current: Vec<_> = (0..LEN).collect(); |
| 1141 | let expected = current.clone(); |
| 1142 | let adaptive = AdaptiveFilterType::NonAdaptive; |
| 1143 | |
| 1144 | let roundtrip = |kind, bpp: BytesPerPixel| { |
| 1145 | let mut output = vec![0; LEN.into()]; |
| 1146 | filter(kind, adaptive, bpp, &previous, ¤t, &mut output); |
| 1147 | unfilter(kind, bpp, &previous, &mut output); |
| 1148 | assert_eq!( |
| 1149 | output, expected, |
| 1150 | "Filtering {:?} with {:?} does not roundtrip" , |
| 1151 | bpp, kind |
| 1152 | ); |
| 1153 | }; |
| 1154 | |
| 1155 | let filters = [ |
| 1156 | FilterType::NoFilter, |
| 1157 | FilterType::Sub, |
| 1158 | FilterType::Up, |
| 1159 | FilterType::Avg, |
| 1160 | FilterType::Paeth, |
| 1161 | ]; |
| 1162 | |
| 1163 | let bpps = [ |
| 1164 | BytesPerPixel::One, |
| 1165 | BytesPerPixel::Two, |
| 1166 | BytesPerPixel::Three, |
| 1167 | BytesPerPixel::Four, |
| 1168 | BytesPerPixel::Six, |
| 1169 | BytesPerPixel::Eight, |
| 1170 | ]; |
| 1171 | |
| 1172 | for &filter in filters.iter() { |
| 1173 | for &bpp in bpps.iter() { |
| 1174 | roundtrip(filter, bpp); |
| 1175 | } |
| 1176 | } |
| 1177 | } |
| 1178 | |
| 1179 | #[test] |
| 1180 | // This tests that converting u8 to i8 doesn't overflow when taking the |
| 1181 | // absolute value for adaptive filtering: -128_i8.abs() will panic in debug |
| 1182 | // or produce garbage in release mode. The sum of 0..=255u8 should equal the |
| 1183 | // sum of the absolute values of -128_i8..=127, or abs(-128..=0) + 1..=127. |
| 1184 | fn sum_buffer_test() { |
| 1185 | let sum = (0..=128).sum::<u64>() + (1..=127).sum::<u64>(); |
| 1186 | let buf: Vec<u8> = (0_u8..=255).collect(); |
| 1187 | |
| 1188 | assert_eq!(sum, crate::filter::sum_buffer(&buf)); |
| 1189 | } |
| 1190 | } |
| 1191 | |