1/*
2 * Copyright (c) 2023.
3 *
4 * This software is free software;
5 *
6 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7 */
8
9//! Platform independent IDCT algorithm
10//!
11//! Not as fast as AVX one.
12
/// Bias added to every row-pass value before the final `>> 17`.
///
/// * `1 << 16` is half of `1 << 17`, so the shift rounds to nearest.
/// * `128 << 17` becomes `+128` after the shift (level shift into `0..=255`).
/// * `1 << 9` is an additional small bias; NOTE(review): its purpose is not
///   evident from this file (it matches the column-pass rounding term).
const SCALE_BITS: i32 = (128 << 17) + (1 << 16) + (1 << 9);
14
15#[allow(unused_assignments)]
16#[allow(
17 clippy::too_many_lines,
18 clippy::op_ref,
19 clippy::cast_possible_truncation
20)]
21pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
22 // Temporary variables.
23
24 let mut pos = 0;
25
26 let mut i = 0;
27 // Don't check for zeroes inside loop, lift it and check outside
28 // we want to accelerate the case with 63 0 ac coeff
29 if &in_vector[1..] == &[0_i32; 63] {
30 // okay then if you work, yay, let's write you really quick
31 let coeff = [(((in_vector[0] >> 3) + 128) as i16).clamp(0, 255); 8];
32
33 macro_rules! store {
34 ($index:tt) => {
35 // position of the MCU
36 let mcu_stride: &mut [i16; 8] = out_vector
37 .get_mut($index..$index + 8)
38 .unwrap()
39 .try_into()
40 .unwrap();
41 // copy coefficients
42 mcu_stride.copy_from_slice(&coeff);
43 // increment index
44 $index += stride;
45 };
46 }
47 // write to four positions
48 store!(pos);
49 store!(pos);
50 store!(pos);
51 store!(pos);
52
53 store!(pos);
54 store!(pos);
55 store!(pos);
56 store!(pos);
57 } else {
58 // because the compiler fails to see that it can be auto_vectorised so i'll
59 // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
60 for ptr in 0..8 {
61 let p2 = in_vector[ptr + 16];
62 let p3 = in_vector[ptr + 48];
63
64 let p1 = (p2 + p3).wrapping_mul(2217);
65
66 let t2 = p1 + p3 * -7567;
67 let t3 = p1 + p2 * 3135;
68
69 let p2 = in_vector[ptr];
70 let p3 = in_vector[32 + ptr];
71 let t0 = fsh(p2 + p3);
72 let t1 = fsh(p2 - p3);
73
74 let x0 = t0 + t3 + 512;
75 let x3 = t0 - t3 + 512;
76 let x1 = t1 + t2 + 512;
77 let x2 = t1 - t2 + 512;
78
79 // odd part
80 let mut t0 = in_vector[ptr + 56];
81 let mut t1 = in_vector[ptr + 40];
82 let mut t2 = in_vector[ptr + 24];
83 let mut t3 = in_vector[ptr + 8];
84
85 let p3 = t0 + t2;
86 let p4 = t1 + t3;
87 let p1 = t0 + t3;
88 let p2 = t1 + t2;
89 let p5 = (p3 + p4) * 4816;
90
91 t0 *= 1223;
92 t1 *= 8410;
93 t2 *= 12586;
94 t3 *= 6149;
95
96 let p1 = p5 + p1 * -3685;
97 let p2 = p5 + p2 * -10497;
98 let p3 = p3 * -8034;
99 let p4 = p4 * -1597;
100
101 t3 += p1 + p4;
102 t2 += p2 + p3;
103 t1 += p2 + p4;
104 t0 += p1 + p3;
105
106 // constants scaled things up by 1<<12; let's bring them back
107 // down, but keep 2 extra bits of precision
108 in_vector[ptr] = (x0 + t3) >> 10;
109 in_vector[ptr + 8] = (x1 + t2) >> 10;
110 in_vector[ptr + 16] = (x2 + t1) >> 10;
111 in_vector[ptr + 24] = (x3 + t0) >> 10;
112 in_vector[ptr + 32] = (x3 - t0) >> 10;
113 in_vector[ptr + 40] = (x2 - t1) >> 10;
114 in_vector[ptr + 48] = (x1 - t2) >> 10;
115 in_vector[ptr + 56] = (x0 - t3) >> 10;
116 }
117
118 // This is vectorised in architectures supporting SSE 4.1
119 while i < 64 {
120 // We won't try to short circuit here because it rarely works
121
122 // Even part
123 let p2 = in_vector[i + 2];
124 let p3 = in_vector[i + 6];
125
126 let p1 = (p2 + p3) * 2217;
127 let t2 = p1 + p3 * -7567;
128 let t3 = p1 + p2 * 3135;
129
130 let p2 = in_vector[i];
131 let p3 = in_vector[i + 4];
132
133 let t0 = fsh(p2 + p3);
134 let t1 = fsh(p2 - p3);
135 // constants scaled things up by 1<<12, plus we had 1<<2 from first
136 // loop, plus horizontal and vertical each scale by sqrt(8) so together
137 // we've got an extra 1<<3, so 1<<17 total we need to remove.
138 // so we want to round that, which means adding 0.5 * 1<<17,
139 // aka 65536. Also, we'll end up with -128 to 127 that we want
140 // to encode as 0..255 by adding 128, so we'll add that before the shift
141 let x0 = t0 + t3 + SCALE_BITS;
142 let x3 = t0 - t3 + SCALE_BITS;
143 let x1 = t1 + t2 + SCALE_BITS;
144 let x2 = t1 - t2 + SCALE_BITS;
145 // odd part
146 let mut t0 = in_vector[i + 7];
147 let mut t1 = in_vector[i + 5];
148 let mut t2 = in_vector[i + 3];
149 let mut t3 = in_vector[i + 1];
150
151 let p3 = t0 + t2;
152 let p4 = t1 + t3;
153 let p1 = t0 + t3;
154 let p2 = t1 + t2;
155 let p5 = (p3 + p4) * f2f(1.175875602);
156
157 t0 = t0.wrapping_mul(1223);
158 t1 = t1.wrapping_mul(8410);
159 t2 = t2.wrapping_mul(12586);
160 t3 = t3.wrapping_mul(6149);
161
162 let p1 = p5 + p1 * -3685;
163 let p2 = p5 + p2 * -10497;
164 let p3 = p3 * -8034;
165 let p4 = p4 * -1597;
166
167 t3 += p1 + p4;
168 t2 += p2 + p3;
169 t1 += p2 + p4;
170 t0 += p1 + p3;
171
172 let out: &mut [i16; 8] = out_vector
173 .get_mut(pos..pos + 8)
174 .unwrap()
175 .try_into()
176 .unwrap();
177
178 out[0] = clamp((x0 + t3) >> 17);
179 out[1] = clamp((x1 + t2) >> 17);
180 out[2] = clamp((x2 + t1) >> 17);
181 out[3] = clamp((x3 + t0) >> 17);
182 out[4] = clamp((x3 - t0) >> 17);
183 out[5] = clamp((x2 - t1) >> 17);
184 out[6] = clamp((x1 - t2) >> 17);
185 out[7] = clamp((x0 - t3) >> 17);
186
187 i += 8;
188
189 pos += stride;
190 }
191 }
192}
193
/// Convert a float multiplier to fixed point with 12 fractional bits.
///
/// The value is scaled by 4096, biased by `0.5` and then truncated toward
/// zero by the cast.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    const ONE: f32 = 4096.0;

    let biased = x * ONE + 0.5;
    biased as i32
}
200
/// Scale an integer by 4096 (`1 << 12`) with a left shift.
///
/// Bits shifted past the top of the `i32` are discarded.
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTION_BITS: u32 = 12;

    x << FRACTION_BITS
}
206
/// Clamp a value to `0..=255` and narrow it to `i16`.
///
/// The clamp happens before the cast, so the narrowing can never truncate.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    a.clamp(0, 255) as i16
}
213