1 | /* |
2 | * Copyright (c) 2023. |
3 | * |
4 | * This software is free software; |
5 | * |
6 | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | */ |
8 | |
9 | //! Routines for IDCT |
10 | //! |
//! Essentially we provide 2 routines for IDCT, a scalar implementation and a not super optimized
//! AVX2 one, I'll talk about them here.
13 | //! |
14 | //! There are 2 reasons why we have the avx one |
//! 1. No one compiles with `-C target-feature=+avx2`, hence binaries probably won't take advantage
//! of AVX2 (even if it exists).
17 | //! 2. AVX employs zero short circuit in a way the scalar code cannot employ it. |
18 | //! - AVX does this by checking for MCU's whose 63 AC coefficients are zero and if true, it writes |
19 | //! values directly, if false, it goes the long way of calculating. |
//! - Although this can be trivially implemented in the scalar version, it generates code
//! I'm not happy with (the scalar version basically loops, and that is too many branches for me).
//! The AVX one does a better job by using bitwise ORs (`_mm256_or_si256`), which is orders of
//! magnitude faster than anything I could come up with.
24 | //! |
//! The AVX code also has some cool transpose_u16 instructions which look complicated enough to be cool
//! (spoiler alert: I barely understand how they work, that's why I credited the owner).
27 | //! |
28 | #![allow ( |
29 | clippy::excessive_precision, |
30 | clippy::unreadable_literal, |
31 | clippy::module_name_repetitions, |
32 | unused_parens, |
33 | clippy::wildcard_imports |
34 | )] |
35 | |
36 | use zune_core::log::debug; |
37 | use zune_core::options::DecoderOptions; |
38 | |
39 | use crate::decoder::IDCTPtr; |
40 | use crate::idct::scalar::idct_int; |
41 | |
42 | #[cfg (feature = "x86" )] |
43 | pub mod avx2; |
44 | #[cfg (feature = "neon" )] |
45 | pub mod neon; |
46 | |
47 | pub mod scalar; |
48 | |
49 | /// Choose an appropriate IDCT function |
50 | #[allow (unused_variables)] |
51 | pub fn choose_idct_func(options: &DecoderOptions) -> IDCTPtr { |
52 | #[cfg (any(target_arch = "x86" , target_arch = "x86_64" ))] |
53 | #[cfg (feature = "x86" )] |
54 | { |
55 | if options.use_avx2() { |
56 | debug!("Using vector integer IDCT" ); |
57 | // use avx one |
58 | return crate::idct::avx2::idct_avx2; |
59 | } |
60 | } |
61 | #[cfg (target_arch = "aarch64" )] |
62 | #[cfg (feature = "neon" )] |
63 | { |
64 | if options.use_neon() { |
65 | debug!("Using vector integer IDCT" ); |
66 | return crate::idct::neon::idct_neon; |
67 | } |
68 | } |
69 | debug!("Using scalar integer IDCT" ); |
70 | // use generic one |
71 | return idct_int; |
72 | } |
73 | |
/// Consistency tests: the platform IDCT picked by `idct_fnc` must produce exactly the same
/// output as the scalar `idct_int` for identical coefficient blocks.
#[cfg(test)]
// `idct_fnc` returns early on targets where a vector routine is compiled in, which leaves
// its scalar fallback unreachable there.
#[allow(unreachable_code)]
#[allow(dead_code)]
mod tests {
    use super::*;

    /// Feeds one 64-entry coefficient block to both `idct_fnc()` and `idct_int`
    /// (stride 8, zero-initialised 64-entry outputs) and asserts the outputs are equal.
    ///
    /// Each routine receives its own copy of the coefficients, since both take them
    /// by mutable reference.
    macro_rules! assert_matches_scalar {
        ($coefficients:expr) => {{
            let stride = 8;
            let mut vector_input = $coefficients;
            let mut scalar_input = vector_input;
            let mut vector_output = [0; 64];
            let mut scalar_output = [0; 64];

            idct_fnc()(&mut vector_input, &mut vector_output, stride);
            idct_int(&mut scalar_input, &mut scalar_output, stride);

            assert_eq!(scalar_output, vector_output, "IDCT and scalar do not match");
        }};
    }

    /// Every coefficient set to 10.
    #[test]
    fn idct_test0() {
        assert_matches_scalar!([10; 64]);
    }

    /// Every coefficient set to 14.
    #[test]
    fn do_idct_test1() {
        assert_matches_scalar!([14; 64]);
    }

    /// Only the first (255) and last (-256) coefficients are non-zero.
    #[test]
    fn do_idct_test2() {
        let mut coefficients = [0; 64];
        coefficients[0] = 255;
        coefficients[63] = -256;
        assert_matches_scalar!(coefficients);
    }

    /// All-zero coefficient block.
    #[test]
    fn do_idct_zeros() {
        assert_matches_scalar!([0; 64]);
    }

    /// Returns the IDCT routine under test: the NEON routine on `aarch64` with the `neon`
    /// feature, the AVX2 routine on `x86`/`x86_64` with the `x86` feature, and the scalar
    /// `idct_int` otherwise.
    ///
    /// NOTE(review): the vector routine is returned without any runtime CPU feature check
    /// here — confirm the machines running these tests support it.
    fn idct_fnc() -> IDCTPtr {
        #[cfg(all(feature = "neon", target_arch = "aarch64"))]
        {
            return crate::idct::neon::idct_neon;
        }

        #[cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
        {
            return crate::idct::avx2::idct_avx2;
        }

        idct_int
    }
}
148 | |