1 | /* |
2 | * Copyright (c) 2023. |
3 | * |
4 | * This software is free software; |
5 | * |
6 | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | */ |
8 | |
//! Platform-independent integer IDCT implementation.
//!
//! Slower than the AVX implementation.
12 | |
/// Bias added to every value in the final IDCT pass before the `>> 17` descale.
///
/// * `1 << 16` (65536) is half of `1 << 17`, so the shift rounds to nearest
///   instead of truncating.
/// * `128 << 17` turns into `+128` after the shift, moving samples from
///   `-128..=127` into `0..=255`.
/// * `512` - NOTE(review): same value as the first-pass rounding term; why it
///   is also part of the final-pass bias is not evident from this file.
const SCALE_BITS: i32 = 512 + (1 << 16) + (128 << 17);
14 | |
15 | #[allow (unused_assignments)] |
16 | #[allow ( |
17 | clippy::too_many_lines, |
18 | clippy::op_ref, |
19 | clippy::cast_possible_truncation |
20 | )] |
21 | pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) { |
22 | // Temporary variables. |
23 | |
24 | let mut pos = 0; |
25 | |
26 | let mut i = 0; |
27 | // Don't check for zeroes inside loop, lift it and check outside |
28 | // we want to accelerate the case with 63 0 ac coeff |
29 | if &in_vector[1..] == &[0_i32; 63] { |
30 | // okay then if you work, yay, let's write you really quick |
31 | let coeff = [(((in_vector[0] >> 3) + 128) as i16).clamp(0, 255); 8]; |
32 | |
33 | macro_rules! store { |
34 | ($index:tt) => { |
35 | // position of the MCU |
36 | let mcu_stride: &mut [i16; 8] = out_vector |
37 | .get_mut($index..$index + 8) |
38 | .unwrap() |
39 | .try_into() |
40 | .unwrap(); |
41 | // copy coefficients |
42 | mcu_stride.copy_from_slice(&coeff); |
43 | // increment index |
44 | $index += stride; |
45 | }; |
46 | } |
47 | // write to four positions |
48 | store!(pos); |
49 | store!(pos); |
50 | store!(pos); |
51 | store!(pos); |
52 | |
53 | store!(pos); |
54 | store!(pos); |
55 | store!(pos); |
56 | store!(pos); |
57 | } else { |
58 | // because the compiler fails to see that it can be auto_vectorised so i'll |
59 | // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9 |
60 | for ptr in 0..8 { |
61 | let p2 = in_vector[ptr + 16]; |
62 | let p3 = in_vector[ptr + 48]; |
63 | |
64 | let p1 = (p2 + p3).wrapping_mul(2217); |
65 | |
66 | let t2 = p1 + p3 * -7567; |
67 | let t3 = p1 + p2 * 3135; |
68 | |
69 | let p2 = in_vector[ptr]; |
70 | let p3 = in_vector[32 + ptr]; |
71 | let t0 = fsh(p2 + p3); |
72 | let t1 = fsh(p2 - p3); |
73 | |
74 | let x0 = t0 + t3 + 512; |
75 | let x3 = t0 - t3 + 512; |
76 | let x1 = t1 + t2 + 512; |
77 | let x2 = t1 - t2 + 512; |
78 | |
79 | // odd part |
80 | let mut t0 = in_vector[ptr + 56]; |
81 | let mut t1 = in_vector[ptr + 40]; |
82 | let mut t2 = in_vector[ptr + 24]; |
83 | let mut t3 = in_vector[ptr + 8]; |
84 | |
85 | let p3 = t0 + t2; |
86 | let p4 = t1 + t3; |
87 | let p1 = t0 + t3; |
88 | let p2 = t1 + t2; |
89 | let p5 = (p3 + p4) * 4816; |
90 | |
91 | t0 *= 1223; |
92 | t1 *= 8410; |
93 | t2 *= 12586; |
94 | t3 *= 6149; |
95 | |
96 | let p1 = p5 + p1 * -3685; |
97 | let p2 = p5 + p2 * -10497; |
98 | let p3 = p3 * -8034; |
99 | let p4 = p4 * -1597; |
100 | |
101 | t3 += p1 + p4; |
102 | t2 += p2 + p3; |
103 | t1 += p2 + p4; |
104 | t0 += p1 + p3; |
105 | |
106 | // constants scaled things up by 1<<12; let's bring them back |
107 | // down, but keep 2 extra bits of precision |
108 | in_vector[ptr] = (x0 + t3) >> 10; |
109 | in_vector[ptr + 8] = (x1 + t2) >> 10; |
110 | in_vector[ptr + 16] = (x2 + t1) >> 10; |
111 | in_vector[ptr + 24] = (x3 + t0) >> 10; |
112 | in_vector[ptr + 32] = (x3 - t0) >> 10; |
113 | in_vector[ptr + 40] = (x2 - t1) >> 10; |
114 | in_vector[ptr + 48] = (x1 - t2) >> 10; |
115 | in_vector[ptr + 56] = (x0 - t3) >> 10; |
116 | } |
117 | |
118 | // This is vectorised in architectures supporting SSE 4.1 |
119 | while i < 64 { |
120 | // We won't try to short circuit here because it rarely works |
121 | |
122 | // Even part |
123 | let p2 = in_vector[i + 2]; |
124 | let p3 = in_vector[i + 6]; |
125 | |
126 | let p1 = (p2 + p3) * 2217; |
127 | let t2 = p1 + p3 * -7567; |
128 | let t3 = p1 + p2 * 3135; |
129 | |
130 | let p2 = in_vector[i]; |
131 | let p3 = in_vector[i + 4]; |
132 | |
133 | let t0 = fsh(p2 + p3); |
134 | let t1 = fsh(p2 - p3); |
135 | // constants scaled things up by 1<<12, plus we had 1<<2 from first |
136 | // loop, plus horizontal and vertical each scale by sqrt(8) so together |
137 | // we've got an extra 1<<3, so 1<<17 total we need to remove. |
138 | // so we want to round that, which means adding 0.5 * 1<<17, |
139 | // aka 65536. Also, we'll end up with -128 to 127 that we want |
140 | // to encode as 0..255 by adding 128, so we'll add that before the shift |
141 | let x0 = t0 + t3 + SCALE_BITS; |
142 | let x3 = t0 - t3 + SCALE_BITS; |
143 | let x1 = t1 + t2 + SCALE_BITS; |
144 | let x2 = t1 - t2 + SCALE_BITS; |
145 | // odd part |
146 | let mut t0 = in_vector[i + 7]; |
147 | let mut t1 = in_vector[i + 5]; |
148 | let mut t2 = in_vector[i + 3]; |
149 | let mut t3 = in_vector[i + 1]; |
150 | |
151 | let p3 = t0 + t2; |
152 | let p4 = t1 + t3; |
153 | let p1 = t0 + t3; |
154 | let p2 = t1 + t2; |
155 | let p5 = (p3 + p4) * f2f(1.175875602); |
156 | |
157 | t0 = t0.wrapping_mul(1223); |
158 | t1 = t1.wrapping_mul(8410); |
159 | t2 = t2.wrapping_mul(12586); |
160 | t3 = t3.wrapping_mul(6149); |
161 | |
162 | let p1 = p5 + p1 * -3685; |
163 | let p2 = p5 + p2 * -10497; |
164 | let p3 = p3 * -8034; |
165 | let p4 = p4 * -1597; |
166 | |
167 | t3 += p1 + p4; |
168 | t2 += p2 + p3; |
169 | t1 += p2 + p4; |
170 | t0 += p1 + p3; |
171 | |
172 | let out: &mut [i16; 8] = out_vector |
173 | .get_mut(pos..pos + 8) |
174 | .unwrap() |
175 | .try_into() |
176 | .unwrap(); |
177 | |
178 | out[0] = clamp((x0 + t3) >> 17); |
179 | out[1] = clamp((x1 + t2) >> 17); |
180 | out[2] = clamp((x2 + t1) >> 17); |
181 | out[3] = clamp((x3 + t0) >> 17); |
182 | out[4] = clamp((x3 - t0) >> 17); |
183 | out[5] = clamp((x2 - t1) >> 17); |
184 | out[6] = clamp((x1 - t2) >> 17); |
185 | out[7] = clamp((x0 - t3) >> 17); |
186 | |
187 | i += 8; |
188 | |
189 | pos += stride; |
190 | } |
191 | } |
192 | } |
193 | |
/// Convert a floating-point constant to fixed point with 12 fractional bits.
///
/// The input is scaled by 4096 and biased by 0.5 before the cast, so
/// non-negative values round to nearest; the cast itself truncates toward zero.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    const ONE: f32 = 4096.0;

    let biased = x * ONE + 0.5;
    biased as i32
}
200 | |
/// Scale an integer into 12-bit fixed point, i.e. multiply it by 4096, using a
/// left shift. Bits shifted past the top of the `i32` are discarded.
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTION_BITS: u32 = 12;

    x << FRACTION_BITS
}
206 | |
/// Clamp `a` to the 8-bit sample range `0..=255` and narrow it to `i16`.
#[inline]
// The cast cannot truncate: the value is within 0..=255 once clamped.
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    a.clamp(0, 255) as i16
}
213 | |