| 1 | /* |
| 2 | * Copyright (c) 2023. |
| 3 | * |
| 4 | * This software is free software; |
| 5 | * |
| 6 | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
| 7 | */ |
| 8 | |
| 9 | //! Routines for IDCT |
| 10 | //! |
| 11 | //! Essentially we provide 2 routines for IDCT: a scalar implementation and a not-so-optimized |
| 12 | //! AVX2 one. I'll talk about them here. |
| 13 | //! |
| 14 | //! There are 2 reasons why we have the avx one |
| 15 | //! 1. Hardly anyone compiles with `-C target-feature=+avx2`, so binaries probably won't take advantage |
| 16 | //! of AVX2 (even where it is available). |
| 17 | //! 2. AVX employs zero short circuit in a way the scalar code cannot employ it. |
| 18 | //! - AVX does this by checking for MCU's whose 63 AC coefficients are zero and if true, it writes |
| 19 | //! values directly, if false, it goes the long way of calculating. |
| 20 | //! - Although this can be trivially implemented in the scalar version, it generates code |
| 21 | //! I'm not happy with (the scalar version basically loops, and that is too many branches for me). |
| 22 | //! The AVX one does a better job by using bitwise ORs (`_mm256_or_si256`), which is orders of magnitude faster |
| 23 | //! than anything I could come up with. |
| 24 | //! |
| 25 | //! The AVX code also has some cool transpose_u16 instructions which look complicated enough to be cool |
| 26 | //! (spoiler alert: I barely understand how they work, which is why I credited the owner). |
| 27 | //! |
| 28 | #![allow ( |
| 29 | clippy::excessive_precision, |
| 30 | clippy::unreadable_literal, |
| 31 | clippy::module_name_repetitions, |
| 32 | unused_parens, |
| 33 | clippy::wildcard_imports |
| 34 | )] |
| 35 | |
| 36 | use zune_core::log::debug; |
| 37 | use zune_core::options::DecoderOptions; |
| 38 | |
| 39 | use crate::decoder::IDCTPtr; |
| 40 | use crate::idct::scalar::idct_int; |
| 41 | |
| 42 | #[cfg (feature = "x86" )] |
| 43 | pub mod avx2; |
| 44 | #[cfg (feature = "neon" )] |
| 45 | pub mod neon; |
| 46 | |
| 47 | pub mod scalar; |
| 48 | |
| 49 | /// Choose an appropriate IDCT function |
| 50 | #[allow (unused_variables)] |
| 51 | pub fn choose_idct_func(options: &DecoderOptions) -> IDCTPtr { |
| 52 | #[cfg (any(target_arch = "x86" , target_arch = "x86_64" ))] |
| 53 | #[cfg (feature = "x86" )] |
| 54 | { |
| 55 | if options.use_avx2() { |
| 56 | debug!("Using vector integer IDCT" ); |
| 57 | // use avx one |
| 58 | return crate::idct::avx2::idct_avx2; |
| 59 | } |
| 60 | } |
| 61 | #[cfg (target_arch = "aarch64" )] |
| 62 | #[cfg (feature = "neon" )] |
| 63 | { |
| 64 | if options.use_neon() { |
| 65 | debug!("Using vector integer IDCT" ); |
| 66 | return crate::idct::neon::idct_neon; |
| 67 | } |
| 68 | } |
| 69 | debug!("Using scalar integer IDCT" ); |
| 70 | // use generic one |
| 71 | return idct_int; |
| 72 | } |
| 73 | |
// Checks that the platform IDCT and the scalar IDCT produce identical output.
//
// `unreachable_code` / `dead_code` are allowed because `idct_fnc` returns early
// from whichever vector branch is compiled in, leaving the code after it
// unreachable on that target.
#[cfg(test)]
#[allow(unreachable_code)]
#[allow(dead_code)]
mod tests {
    use super::*;

    /// Runs the routine returned by `idct_fnc` and the scalar `idct_int` on two
    /// copies of the same coefficient block (stride 8, 64-element outputs) and
    /// asserts that both outputs are equal.
    ///
    /// Written as a macro so that the element types of the buffers keep being
    /// inferred from the IDCT function signatures.
    macro_rules! assert_matches_scalar {
        ($coefficients:expr) => {{
            let stride = 8;
            let mut vector_input = $coefficients;
            let mut scalar_input = vector_input;
            let mut scalar_output = [0; 64];
            let mut vector_output = [0; 64];

            idct_fnc()(&mut vector_input, &mut vector_output, stride);
            idct_int(&mut scalar_input, &mut scalar_output, stride);

            assert_eq!(scalar_output, vector_output, "IDCT and scalar do not match");
        }};
    }

    /// Every coefficient set to 10.
    #[test]
    fn idct_test0() {
        assert_matches_scalar!([10; 64]);
    }

    /// Every coefficient set to 14.
    #[test]
    fn do_idct_test1() {
        assert_matches_scalar!([14; 64]);
    }

    /// Only the first and the last coefficient are non-zero.
    #[test]
    fn do_idct_test2() {
        let mut coefficients = [0; 64];
        coefficients[0] = 255;
        coefficients[63] = -256;
        assert_matches_scalar!(coefficients);
    }

    /// All-zero coefficient block.
    #[test]
    fn do_idct_zeros() {
        assert_matches_scalar!([0; 64]);
    }

    /// Returns the IDCT routine under test: NEON on `aarch64` builds with the
    /// `neon` feature, AVX2 on `x86`/`x86_64` builds with the `x86` feature,
    /// otherwise the scalar routine (so the comparison is scalar vs. scalar).
    ///
    /// NOTE(review): no runtime CPU-feature check is made here — confirm that
    /// the vector routines are safe to call on every host that runs the tests.
    fn idct_fnc() -> IDCTPtr {
        #[cfg(all(feature = "neon", target_arch = "aarch64"))]
        {
            return crate::idct::neon::idct_neon;
        }

        #[cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
        {
            return crate::idct::avx2::idct_avx2;
        }

        idct_int
    }
}
| 148 | |