//! Contains the compression attribute definition
//! and methods to compress and decompress data.


// private modules make non-breaking changes easier
mod zip;
mod rle;
mod piz;
mod pxr24;
mod b44;


use std::convert::TryInto;
use std::mem::size_of;
use half::f16;
use crate::meta::attribute::{IntegerBounds, SampleType, ChannelList};
use crate::error::{Result, Error, usize_to_i32};
use crate::meta::header::Header;


/// A byte vector.
pub type ByteVec = Vec<u8>;

/// A byte slice.
pub type Bytes<'s> = &'s [u8];

/// Specifies which compression method to use.
/// Use uncompressed data for fastest loading and writing speeds.
/// Use RLE compression for fast loading and writing with slight memory savings.
/// Use ZIP compression for slow processing with large memory savings.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Compression {

    /// Store uncompressed values.
    /// Produces large files that can be read and written very quickly.
    /// Consider using RLE instead, as it provides some compression with almost equivalent speed.
    Uncompressed,

    /// Produces slightly smaller files
    /// that can still be read and written rather quickly.
    /// The compressed file size is usually between 60 and 75 percent of the uncompressed size.
    /// Works best for images with large flat areas, such as masks and abstract graphics.
    /// This compression method is lossless.
    RLE,

    /// Uses ZIP compression to compress each line. Slowly produces small images
    /// which can be read with moderate speed. This compression method is lossless.
    /// Might be slightly faster but larger than `ZIP16`.
    ZIP1, // TODO ZIP { individual_lines: bool, compression_level: Option<u8> } // TODO specify zip compression level?

    /// Uses ZIP compression to compress blocks of 16 lines. Slowly produces small images
    /// which can be read with moderate speed. This compression method is lossless.
    /// Might be slightly slower but smaller than `ZIP1`.
    ZIP16, // TODO collapse with ZIP1

    /// PIZ compression works well for noisy and natural images. Works better with larger tiles.
    /// Only supported for flat images, but not for deep data.
    /// This compression method is lossless.
    // A wavelet transform is applied to the pixel data, and the result is Huffman-
    // encoded. This scheme tends to provide the best compression ratio for the types of
    // images that are typically processed at Industrial Light & Magic. Files are
    // compressed and decompressed at roughly the same speed. For photographic
    // images with film grain, the files are reduced to between 35 and 55 percent of their
    // uncompressed size.
    // PIZ compression works well for scan-line based files, and also for tiled files with
    // large tiles, but small tiles do not shrink much. (PIZ-compressed data start with a
    // relatively long header; if the input to the compressor is short, adding the header
    // tends to offset any size reduction of the input.)
    PIZ,

    /// Like `ZIP1`, but reduces precision of `f32` images to `f24`.
    /// Therefore, this is lossless compression for `f16` and `u32` data, lossy compression for `f32` data.
    /// This compression method works well for depth
    /// buffers and similar images, where the possible range of values is very large, but
    /// where full 32-bit floating-point accuracy is not necessary. Rounding improves
    /// compression significantly by eliminating the pixels' 8 least significant bits, which
    /// tend to be very noisy, and therefore difficult to compress.
    /// This produces really small image files. Only supported for flat images, not for deep data.
    // After reducing 32-bit floating-point data to 24 bits by rounding (while leaving 16-bit
    // floating-point data unchanged), differences between horizontally adjacent pixels
    // are compressed with zlib, similar to ZIP. PXR24 compression preserves image
    // channels of type HALF and UINT exactly, but the relative error of FLOAT data
    // increases to about ???.
    PXR24, // TODO specify zip compression level?

    /// This is a lossy compression method for f16 images.
    /// It's the predecessor of the `B44A` compression,
    /// which has improved compression rates for uniformly colored areas.
    /// You should probably use `B44A` instead of the plain `B44`.
    ///
    /// Only supported for flat images, not for deep data.
    // lossy 4-by-4 pixel block compression,
    // flat fields are compressed more
    // Channels of type HALF are split into blocks of four by four pixels or 32 bytes. Each
    // block is then packed into 14 bytes, reducing the data to 44 percent of their
    // uncompressed size. When B44 compression is applied to RGB images in
    // combination with luminance/chroma encoding (see below), the size of the
    // compressed pixels is about 22 percent of the size of the original RGB data.
    // Channels of type UINT or FLOAT are not compressed.
    // Decoding is fast enough to allow real-time playback of B44-compressed OpenEXR
    // image sequences on commodity hardware.
    // The size of a B44-compressed file depends on the number of pixels in the image,
    // but not on the data in the pixels. All images with the same resolution and the same
    // set of channels have the same size. This can be advantageous for systems that
    // support real-time playback of image sequences; the predictable file size makes it
    // easier to allocate space on storage media efficiently.
    // B44 compression is only supported for flat images.
    B44, // TODO B44 { optimize_uniform_areas: bool }

    /// This is a lossy compression method for f16 images.
    /// All f32 and u32 channels will be stored without compression.
    /// All the f16 pixels are divided into 4x4 blocks.
    /// Each block is then compressed as a whole.
    ///
    /// The 32 bytes of a block will require only ~14 bytes after compression,
    /// independent of the actual pixel contents. With chroma subsampling,
    /// a block will be compressed to ~7 bytes.
    /// Uniformly colored blocks will be compressed to ~3 bytes.
    ///
    /// The 512 bytes of an f32 block will not be compressed at all.
    ///
    /// Should be fast enough for realtime playback.
    /// Only supported for flat images, not for deep data.
    B44A, // TODO collapse with B44

    /// __This lossy compression is not yet supported by this implementation.__
    // lossy DCT based compression, in blocks
    // of 32 scanlines. More efficient for partial buffer access.
    DWAA(Option<f32>), // TODO does this have a default value? make this non optional? default Compression Level setting is 45.0

    /// __This lossy compression is not yet supported by this implementation.__
    // lossy DCT based compression, in blocks
    // of 256 scanlines. More efficient space
    // wise and faster to decode full frames
    // than DWAA_COMPRESSION.
    DWAB(Option<f32>), // TODO collapse with B44. default Compression Level setting is 45.0
}

impl std::fmt::Display for Compression {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(formatter, "{} compression", match self {
            Compression::Uncompressed => "no",
            Compression::RLE => "rle",
            Compression::ZIP1 => "zip line",
            Compression::ZIP16 => "zip block",
            Compression::B44 => "b44",
            Compression::B44A => "b44a",
            Compression::DWAA(_) => "dwaa",
            Compression::DWAB(_) => "dwab",
            Compression::PIZ => "piz",
            Compression::PXR24 => "pxr24",
        })
    }
}
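
// Illustrative example (added for clarity, not part of the original source):
// the `Display` impl above yields short human-readable names, which end up in
// error messages such as "pixels cannot be compressed (zip block compression)".
#[cfg(test)]
mod compression_display_example {
    use super::Compression;

    #[test]
    fn formats_human_readable_name(){
        assert_eq!(format!("{}", Compression::ZIP16), "zip block compression");
        assert_eq!(format!("{}", Compression::Uncompressed), "no compression");
    }
}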


impl Compression {

    /// Compress the image section of bytes.
    pub fn compress_image_section(self, header: &Header, uncompressed_native_endian: ByteVec, pixel_section: IntegerBounds) -> Result<ByteVec> {
        let max_tile_size = header.max_block_pixel_size();

        assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "compress tile coordinate bug");
        if header.deep { assert!(self.supports_deep_data()) }

        use self::Compression::*;
        let compressed_little_endian = match self {
            Uncompressed => {
                return Ok(convert_current_to_little_endian(
                    uncompressed_native_endian, &header.channels, pixel_section
                ))
            },

            // we need to clone here, because we might have to fall back to the uncompressed data later (when the compressed data is larger than the raw data)
            ZIP16 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section),
            ZIP1 => zip::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section),
            RLE => rle::compress_bytes(&header.channels, uncompressed_native_endian.clone(), pixel_section),
            PIZ => piz::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section),
            PXR24 => pxr24::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section),
            B44 => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, false),
            B44A => b44::compress(&header.channels, uncompressed_native_endian.clone(), pixel_section, true),
            _ => return Err(Error::unsupported(format!("yet unimplemented compression method: {}", self)))
        };

        let compressed_little_endian = compressed_little_endian.map_err(|_|
            Error::invalid(format!("pixels cannot be compressed ({})", self))
        )?;

        if self == Uncompressed || compressed_little_endian.len() < uncompressed_native_endian.len() {
            // only write the compressed data if it actually is smaller than the raw data
            Ok(compressed_little_endian)
        }
        else {
            // if compression did not pay off, manually convert the uncompressed data instead
            Ok(convert_current_to_little_endian(uncompressed_native_endian, &header.channels, pixel_section))
        }
    }

    /// Decompress the image section of bytes.
    pub fn decompress_image_section(self, header: &Header, compressed: ByteVec, pixel_section: IntegerBounds, pedantic: bool) -> Result<ByteVec> {
        let max_tile_size = header.max_block_pixel_size();

        assert!(pixel_section.validate(Some(max_tile_size)).is_ok(), "decompress tile coordinate bug");
        if header.deep { assert!(self.supports_deep_data()) }

        let expected_byte_size = pixel_section.size.area() * header.channels.bytes_per_pixel; // FIXME this needs to account for subsampling anywhere

        // note: this is always true when self == Uncompressed
        if compressed.len() == expected_byte_size {
            // the compressed data was larger than the raw data, so the smaller raw data has been written
            Ok(convert_little_endian_to_current(compressed, &header.channels, pixel_section))
        }
        else {
            use self::Compression::*;
            let bytes = match self {
                Uncompressed => Ok(convert_little_endian_to_current(compressed, &header.channels, pixel_section)),
                ZIP16 => zip::decompress_bytes(&header.channels, compressed, pixel_section, expected_byte_size, pedantic),
                ZIP1 => zip::decompress_bytes(&header.channels, compressed, pixel_section, expected_byte_size, pedantic),
                RLE => rle::decompress_bytes(&header.channels, compressed, pixel_section, expected_byte_size, pedantic),
                PIZ => piz::decompress(&header.channels, compressed, pixel_section, expected_byte_size, pedantic),
                PXR24 => pxr24::decompress(&header.channels, compressed, pixel_section, expected_byte_size, pedantic),
                B44 | B44A => b44::decompress(&header.channels, compressed, pixel_section, expected_byte_size, pedantic),
                _ => return Err(Error::unsupported(format!("yet unimplemented compression method: {}", self)))
            };

            // map all errors to compression errors
            let bytes = bytes
                .map_err(|decompression_error| match decompression_error {
                    Error::NotSupported(message) =>
                        Error::unsupported(format!("yet unimplemented compression special case ({})", message)),

                    error => Error::invalid(format!(
                        "compressed {:?} data ({})",
                        self, error.to_string()
                    )),
                })?;

            if bytes.len() != expected_byte_size {
                Err(Error::invalid("decompressed data"))
            }

            else { Ok(bytes) }
        }
    }

    /// For scan line images and deep scan line images, one or more scan lines may be
    /// stored together as a scan line block. The number of scan lines per block
    /// depends on how the pixel data are compressed.
    pub fn scan_lines_per_block(self) -> usize {
        use self::Compression::*;
        match self {
            Uncompressed | RLE | ZIP1 => 1,
            ZIP16 | PXR24 => 16,
            PIZ | B44 | B44A | DWAA(_) => 32,
            DWAB(_) => 256,
        }
    }

    /// Deep data can only be stored uncompressed, or compressed with RLE or per-line ZIP compression.
    pub fn supports_deep_data(self) -> bool {
        use self::Compression::*;
        match self {
            Uncompressed | RLE | ZIP1 => true,
            _ => false,
        }
    }

    /// Most compression methods will reconstruct the exact pixel bytes,
    /// but some might throw away unimportant data for specific types of samples.
    pub fn is_lossless_for(self, sample_type: SampleType) -> bool {
        use self::Compression::*;
        match self {
            PXR24 => sample_type != SampleType::F32, // pxr reduces f32 to f24
            B44 | B44A => sample_type != SampleType::F16, // b44 only compresses f16 values, others are left uncompressed
            Uncompressed | RLE | ZIP1 | ZIP16 | PIZ => true,
            DWAB(_) | DWAA(_) => false,
        }
    }

    /// Most compression methods will reconstruct the exact pixel bytes,
    /// but some might throw away unimportant data in some cases.
    pub fn may_loose_data(self) -> bool {
        use self::Compression::*;
        match self {
            Uncompressed | RLE | ZIP1 | ZIP16 | PIZ => false,
            PXR24 | B44 | B44A | DWAB(_) | DWAA(_) => true,
        }
    }

    /// Most compression methods will reconstruct the exact pixel bytes,
    /// but some might replace NaN with zeroes.
    pub fn supports_nan(self) -> bool {
        use self::Compression::*;
        match self {
            B44 | B44A | DWAB(_) | DWAA(_) => false, // TODO dwa might support it?
            _ => true
        }
    }

}
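
// Illustrative sketch (added for clarity, not part of the original source):
// the property methods above encode the per-method table from the OpenEXR format,
// e.g. the block height of each compression method and which methods may lose data.
#[cfg(test)]
mod compression_properties_example {
    use super::*;

    #[test]
    fn block_sizes_and_losslessness(){
        assert_eq!(Compression::ZIP1.scan_lines_per_block(), 1);
        assert_eq!(Compression::ZIP16.scan_lines_per_block(), 16);
        assert_eq!(Compression::PIZ.scan_lines_per_block(), 32);

        assert!(Compression::RLE.supports_deep_data());
        assert!(!Compression::PIZ.supports_deep_data());

        // PXR24 discards the least significant bits of f32 samples but keeps f16 and u32 exact
        assert!(Compression::PXR24.is_lossless_for(SampleType::F16));
        assert!(!Compression::PXR24.is_lossless_for(SampleType::F32));
        assert!(Compression::PXR24.may_loose_data());
    }
}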


// see https://github.com/AcademySoftwareFoundation/openexr/blob/6a9f8af6e89547bcd370ae3cec2b12849eee0b54/OpenEXR/IlmImf/ImfMisc.cpp#L1456-L1541

#[allow(unused)] // allows the extra parameters to be unused
fn convert_current_to_little_endian(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> ByteVec {
    #[cfg(target_endian = "big")]
    reverse_block_endianness(&mut bytes, channels, rectangle);

    bytes
}

#[allow(unused)] // allows the extra parameters to be unused
fn convert_little_endian_to_current(mut bytes: ByteVec, channels: &ChannelList, rectangle: IntegerBounds) -> ByteVec {
    #[cfg(target_endian = "big")]
    reverse_block_endianness(&mut bytes, channels, rectangle);

    bytes
}

#[allow(unused)] // unused when on a little endian system
fn reverse_block_endianness(bytes: &mut [u8], channels: &ChannelList, rectangle: IntegerBounds){
    let mut remaining_bytes: &mut [u8] = bytes;

    for y in rectangle.position.y() .. rectangle.end().y() {
        for channel in &channels.list {
            let line_is_subsampled = mod_p(y, usize_to_i32(channel.sampling.y())) != 0;
            if line_is_subsampled { continue; }

            let sample_count = rectangle.size.width() / channel.sampling.x();

            match channel.sample_type {
                SampleType::F16 => remaining_bytes = chomp_convert_n::<f16>(reverse_2_bytes, remaining_bytes, sample_count),
                SampleType::F32 => remaining_bytes = chomp_convert_n::<f32>(reverse_4_bytes, remaining_bytes, sample_count),
                SampleType::U32 => remaining_bytes = chomp_convert_n::<u32>(reverse_4_bytes, remaining_bytes, sample_count),
            }
        }
    }

    #[inline]
    fn chomp_convert_n<T>(convert_single_value: fn(&mut [u8]), mut bytes: &mut [u8], count: usize) -> &mut [u8] {
        let type_size = size_of::<T>();
        let (line_bytes, rest) = bytes.split_at_mut(count * type_size);
        let value_byte_chunks = line_bytes.chunks_exact_mut(type_size);

        for value_bytes in value_byte_chunks {
            convert_single_value(value_bytes);
        }

        rest
    }

    debug_assert!(remaining_bytes.is_empty(), "not all bytes were converted to little endian");
}

#[inline]
fn reverse_2_bytes(bytes: &mut [u8]){
    // this code seems like it could be optimized easily by the compiler
    let two_bytes: [u8; 2] = bytes.try_into().expect("invalid byte count");
    bytes.copy_from_slice(&[two_bytes[1], two_bytes[0]]);
}

#[inline]
fn reverse_4_bytes(bytes: &mut [u8]){
    let four_bytes: [u8; 4] = bytes.try_into().expect("invalid byte count");
    bytes.copy_from_slice(&[four_bytes[3], four_bytes[2], four_bytes[1], four_bytes[0]]);
}
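
// Illustrative example (added for clarity, not in the original source):
// the helpers above swap the byte order of a single value in place,
// turning a little-endian encoding into a big-endian one and vice versa.
#[cfg(test)]
mod byte_reversal_example {
    use super::{reverse_2_bytes, reverse_4_bytes};

    #[test]
    fn reverses_in_place(){
        let mut two = [0x12, 0x34];
        reverse_2_bytes(&mut two);
        assert_eq!(two, [0x34, 0x12]);

        let mut four = [0x12, 0x34, 0x56, 0x78];
        reverse_4_bytes(&mut four);
        assert_eq!(four, [0x78, 0x56, 0x34, 0x12]);
    }
}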

#[inline]
fn div_p(x: i32, y: i32) -> i32 {
    if x >= 0 {
        if y >= 0 { x / y }
        else { -(x / -y) }
    }
    else {
        if y >= 0 { -((y-1-x) / y) }
        else { (-y-1-x) / -y }
    }
}

#[inline]
fn mod_p(x: i32, y: i32) -> i32 {
    x - y * div_p(x, y)
}
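
// Illustrative example (added for clarity, not in the original source):
// `div_p` rounds towards negative infinity instead of towards zero, so `mod_p`
// is never negative for a positive divisor. This is what makes the subsampling
// check above (`mod_p(y, sampling) != 0`) work for negative window coordinates.
#[cfg(test)]
mod floor_division_example {
    use super::{div_p, mod_p};

    #[test]
    fn rounds_towards_negative_infinity(){
        assert_eq!(div_p(7, 2), 3);
        assert_eq!(mod_p(7, 2), 1);

        // plain `/` would yield -1 and `%` would yield -1 here
        assert_eq!(div_p(-3, 2), -2);
        assert_eq!(mod_p(-3, 2), 1);
    }
}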

/// A collection of functions used to prepare data for compression.
mod optimize_bytes {

    /// Integrate over all differences to the previous value in order to reconstruct sample values.
    pub fn differences_to_samples(buffer: &mut [u8]) {
        // The naive implementation is very simple:
        //
        // for index in 1..buffer.len() {
        //     buffer[index] = (buffer[index - 1] as i32 + buffer[index] as i32 - 128) as u8;
        // }
        //
        // But we process elements in pairs to take advantage of instruction-level parallelism.
        // When computations within a pair do not depend on each other, they can be processed in parallel.
        // Since this function is responsible for a very large chunk of execution time,
        // this tweak alone improves decoding performance of RLE images by 20%.
        if let Some(first) = buffer.get(0) {
            let mut previous = *first as i16;
            for chunk in &mut buffer[1..].chunks_exact_mut(2) {
                // no bounds checks here due to indices and chunk size being constant
                let diff0 = chunk[0] as i16;
                let diff1 = chunk[1] as i16;
                // these two computations do not depend on each other, unlike in the naive version,
                // so they can be executed by the CPU in parallel via instruction-level parallelism
                let sample0 = (previous + diff0 - 128) as u8;
                let sample1 = (previous + diff0 + diff1 - 128 * 2) as u8;
                chunk[0] = sample0;
                chunk[1] = sample1;
                previous = sample1 as i16;
            }
            // handle the remaining element at the end not processed by the loop over pairs, if present
            for elem in &mut buffer[1..].chunks_exact_mut(2).into_remainder().iter_mut() {
                let sample = (previous + *elem as i16 - 128) as u8;
                *elem = sample;
                previous = sample as i16;
            }
        }
    }

    /// Derive over all values in order to produce differences to the previous value.
    pub fn samples_to_differences(buffer: &mut [u8]){
        // The naive version:
        //
        // for index in (1..buffer.len()).rev() {
        //     buffer[index] = (buffer[index] as i32 - buffer[index - 1] as i32 + 128) as u8;
        // }
        //
        // But we process elements in batches to take advantage of autovectorization.
        // If the target platform has no vector instructions (e.g. 32-bit ARM without `-C target-cpu=native`)
        // this will instead take advantage of instruction-level parallelism.
        if let Some(first) = buffer.get(0) {
            let mut previous = *first as i16;
            // Chunk size is 16 because we process bytes (8 bits),
            // and 8*16 = 128 bits is the size of a typical SIMD register.
            // Even WASM has 128-bit SIMD registers.
            for chunk in &mut buffer[1..].chunks_exact_mut(16) {
                // no bounds checks here due to indices and chunk size being constant
                let sample0 = chunk[0] as i16;
                let sample1 = chunk[1] as i16;
                let sample2 = chunk[2] as i16;
                let sample3 = chunk[3] as i16;
                let sample4 = chunk[4] as i16;
                let sample5 = chunk[5] as i16;
                let sample6 = chunk[6] as i16;
                let sample7 = chunk[7] as i16;
                let sample8 = chunk[8] as i16;
                let sample9 = chunk[9] as i16;
                let sample10 = chunk[10] as i16;
                let sample11 = chunk[11] as i16;
                let sample12 = chunk[12] as i16;
                let sample13 = chunk[13] as i16;
                let sample14 = chunk[14] as i16;
                let sample15 = chunk[15] as i16;
                // Unlike in decoding, the computations here are truly independent from each other,
                // which enables the compiler to vectorize this loop.
                // Even if the target platform has no vector instructions, this costs nothing:
                // using more parallelism doesn't imply doing more work,
                // and we're not really limited in how wide we can go.
                chunk[0] = (sample0 - previous + 128) as u8;
                chunk[1] = (sample1 - sample0 + 128) as u8;
                chunk[2] = (sample2 - sample1 + 128) as u8;
                chunk[3] = (sample3 - sample2 + 128) as u8;
                chunk[4] = (sample4 - sample3 + 128) as u8;
                chunk[5] = (sample5 - sample4 + 128) as u8;
                chunk[6] = (sample6 - sample5 + 128) as u8;
                chunk[7] = (sample7 - sample6 + 128) as u8;
                chunk[8] = (sample8 - sample7 + 128) as u8;
                chunk[9] = (sample9 - sample8 + 128) as u8;
                chunk[10] = (sample10 - sample9 + 128) as u8;
                chunk[11] = (sample11 - sample10 + 128) as u8;
                chunk[12] = (sample12 - sample11 + 128) as u8;
                chunk[13] = (sample13 - sample12 + 128) as u8;
                chunk[14] = (sample14 - sample13 + 128) as u8;
                chunk[15] = (sample15 - sample14 + 128) as u8;
                previous = sample15;
            }
            // Handle the remaining elements at the end not processed by the loop over batches, if present.
            // This is what the iterator-based version of this function would look like without vectorization.
            for elem in &mut buffer[1..].chunks_exact_mut(16).into_remainder().iter_mut() {
                let diff = (*elem as i16 - previous + 128) as u8;
                previous = *elem as i16;
                *elem = diff;
            }
        }
    }

    use std::cell::Cell;
    thread_local! {
        // A buffer that is reused between invocations of interleaving and deinterleaving.
        // Allocating memory is cheap, but zeroing or otherwise initializing it is not.
        // Doing it hundreds of times (once per block) would be expensive.
        // This optimization brings down the time spent in interleaving from 15% to 5%.
        static SCRATCH_SPACE: Cell<Vec<u8>> = Cell::new(Vec::new());
    }

    fn with_reused_buffer<F>(length: usize, mut func: F) where F: FnMut(&mut [u8]) {
        SCRATCH_SPACE.with(|scratch_space| {
            // reuse a buffer if we've already initialized one
            let mut buffer = scratch_space.take();
            if buffer.len() < length {
                // Efficiently create a zeroed Vec by requesting zeroed memory from the OS.
                // This is slightly faster than the `memcpy()` plus `memset()` that would happen otherwise,
                // but it is not a big deal either way since this is not a hot codepath.
                buffer = vec![0u8; length];
            }

            // call the function
            func(&mut buffer[..length]);

            // save the internal buffer for reuse
            scratch_space.set(buffer);
        });
    }

    /// Interleave the bytes such that the second half of the array is every other byte.
    pub fn interleave_byte_blocks(separated: &mut [u8]) {
        with_reused_buffer(separated.len(), |interleaved| {

            // Split the two halves that we are going to interleave.
            let (first_half, second_half) = separated.split_at((separated.len() + 1) / 2);
            // The first half can be 1 byte longer than the second if the length of the input is odd,
            // but the loop below only processes numbers in pairs.
            // To handle it, preserve the last element of the first slice, to be handled after the loop.
            let first_half_last = first_half.last();
            // Truncate the first half to match the length of the second one; more optimizer-friendly
            let first_half_iter = &first_half[..second_half.len()];

            // Main loop that performs the interleaving
            for ((first, second), interleaved) in first_half_iter.iter().zip(second_half.iter())
                .zip(interleaved.chunks_exact_mut(2)) {
                // The length of each chunk is known to be 2 at compile time,
                // and each index is also a constant.
                // This allows the compiler to remove the bounds checks.
                interleaved[0] = *first;
                interleaved[1] = *second;
            }

            // If the length of the slice was odd, restore the last element of the first half that we saved
            if interleaved.len() % 2 == 1 {
                if let Some(value) = first_half_last {
                    // we can unwrap() here because we just checked that the length is non-zero:
                    // `% 2 == 1` will fail for zero
                    *interleaved.last_mut().unwrap() = *value;
                }
            }

            // write out the results
            separated.copy_from_slice(&interleaved);
        });
    }

    /// Separate the bytes such that the second half contains every other byte.
    /// This performs deinterleaving - the inverse of interleaving.
    pub fn separate_bytes_fragments(source: &mut [u8]) {
        with_reused_buffer(source.len(), |separated| {

            // Split the two halves that will receive the deinterleaved bytes.
            let (first_half, second_half) = separated.split_at_mut((source.len() + 1) / 2);
            // The first half can be 1 byte longer than the second if the length of the input is odd,
            // but the loop below only processes numbers in pairs.
            // To handle it, preserve the last element of the input, to be handled after the loop.
            let last = source.last();
            let first_half_iter = &mut first_half[..second_half.len()];

            // Main loop that performs the deinterleaving
            for ((first, second), interleaved) in first_half_iter.iter_mut().zip(second_half.iter_mut())
                .zip(source.chunks_exact(2)) {
                // The length of each chunk is known to be 2 at compile time,
                // and each index is also a constant.
                // This allows the compiler to remove the bounds checks.
                *first = interleaved[0];
                *second = interleaved[1];
            }

            // If the length of the slice was odd, restore the last element of the input that we saved
            if source.len() % 2 == 1 {
                if let Some(value) = last {
                    // we can unwrap() here because we just checked that the length is non-zero:
                    // `% 2 == 1` will fail for zero
                    *first_half.last_mut().unwrap() = *value;
                }
            }

            // write out the results
            source.copy_from_slice(&separated);
        });
    }


    #[cfg(test)]
    pub mod test {

        #[test]
        fn roundtrip_interleave(){
            let source = vec![ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ];
            let mut modified = source.clone();

            super::separate_bytes_fragments(&mut modified);
            super::interleave_byte_blocks(&mut modified);

            assert_eq!(source, modified);
        }

        #[test]
        fn roundtrip_derive(){
            let source = vec![ 0, 1, 2, 7, 4, 5, 6, 7, 13, 9, 10 ];
            let mut modified = source.clone();

            super::samples_to_differences(&mut modified);
            super::differences_to_samples(&mut modified);

            assert_eq!(source, modified);
        }
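
        // Illustrative examples (added for clarity, not in the original test suite):
        // they pin down the concrete byte layout produced by the helpers above.

        #[test]
        fn derive_produces_offset_deltas(){
            // a rising ramp turns into a constant run of `1 + 128`,
            // which compresses much better with RLE or ZIP
            let mut buffer = vec![ 0u8, 1, 2, 3 ];
            super::samples_to_differences(&mut buffer);
            assert_eq!(buffer, vec![ 0, 129, 129, 129 ]);
        }

        #[test]
        fn separate_moves_every_other_byte_to_the_back(){
            let mut buffer = vec![ 0u8, 1, 2, 3, 4 ];
            super::separate_bytes_fragments(&mut buffer);
            assert_eq!(buffer, vec![ 0, 2, 4, 1, 3 ]);
        }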

    }
}


#[cfg(test)]
pub mod test {
    use super::*;
    use crate::meta::attribute::ChannelDescription;
    use crate::block::samples::IntoNativeSample;

    #[test]
    fn roundtrip_endianness_mixed_channels(){
        let a32 = ChannelDescription::new("A", SampleType::F32, true);
        let y16 = ChannelDescription::new("Y", SampleType::F16, true);
        let channels = ChannelList::new(smallvec![ a32, y16 ]);

        let data = vec![
            23582740683_f32.to_ne_bytes().as_slice(),
            35827420683_f32.to_ne_bytes().as_slice(),
            27406832358_f32.to_f16().to_ne_bytes().as_slice(),
            74062358283_f32.to_f16().to_ne_bytes().as_slice(),

            52582740683_f32.to_ne_bytes().as_slice(),
            45827420683_f32.to_ne_bytes().as_slice(),
            15406832358_f32.to_f16().to_ne_bytes().as_slice(),
            65062358283_f32.to_f16().to_ne_bytes().as_slice(),
        ].into_iter().flatten().map(|x| *x).collect();

        roundtrip_convert_endianness(
            data, &channels,
            IntegerBounds::from_dimensions((2, 2))
        );
    }

    fn roundtrip_convert_endianness(
        current_endian: ByteVec, channels: &ChannelList, rectangle: IntegerBounds
    ){
        let little_endian = convert_current_to_little_endian(
            current_endian.clone(), channels, rectangle
        );

        let current_endian_decoded = convert_little_endian_to_current(
            little_endian.clone(), channels, rectangle
        );

        assert_eq!(current_endian, current_endian_decoded, "endianness conversion failed");
    }
}