/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

//! Platform independent IDCT algorithm
//!
//! Not as fast as the AVX one.

/// Bias added to every row-pass sum in `idct_int` before the final `>> 17`.
///
/// * `128 << 17` turns into `+128` after the shift, moving samples from
///   `-128..=127` to `0..=255`.
/// * `1 << 16` (65536) is half of `1 << 17`, so the shift rounds to nearest
///   instead of truncating.
/// * `1 << 9` (512) is an additional small bias.
///   NOTE(review): its rationale is not documented in this file.
const SCALE_BITS: i32 = (128 << 17) + (1 << 16) + (1 << 9);

| 15 | #[allow (unused_assignments)] |
| 16 | #[allow ( |
| 17 | clippy::too_many_lines, |
| 18 | clippy::op_ref, |
| 19 | clippy::cast_possible_truncation |
| 20 | )] |
| 21 | pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) { |
| 22 | // Temporary variables. |
| 23 | |
| 24 | let mut pos = 0; |
| 25 | |
| 26 | let mut i = 0; |
| 27 | // Don't check for zeroes inside loop, lift it and check outside |
| 28 | // we want to accelerate the case with 63 0 ac coeff |
| 29 | if &in_vector[1..] == &[0_i32; 63] { |
| 30 | // okay then if you work, yay, let's write you really quick |
| 31 | let coeff = [(((in_vector[0] >> 3) + 128) as i16).clamp(0, 255); 8]; |
| 32 | |
| 33 | macro_rules! store { |
| 34 | ($index:tt) => { |
| 35 | // position of the MCU |
| 36 | let mcu_stride: &mut [i16; 8] = out_vector |
| 37 | .get_mut($index..$index + 8) |
| 38 | .unwrap() |
| 39 | .try_into() |
| 40 | .unwrap(); |
| 41 | // copy coefficients |
| 42 | mcu_stride.copy_from_slice(&coeff); |
| 43 | // increment index |
| 44 | $index += stride; |
| 45 | }; |
| 46 | } |
| 47 | // write to four positions |
| 48 | store!(pos); |
| 49 | store!(pos); |
| 50 | store!(pos); |
| 51 | store!(pos); |
| 52 | |
| 53 | store!(pos); |
| 54 | store!(pos); |
| 55 | store!(pos); |
| 56 | store!(pos); |
| 57 | } else { |
| 58 | // because the compiler fails to see that it can be auto_vectorised so i'll |
| 59 | // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9 |
| 60 | for ptr in 0..8 { |
| 61 | let p2 = in_vector[ptr + 16]; |
| 62 | let p3 = in_vector[ptr + 48]; |
| 63 | |
| 64 | let p1 = (p2 + p3).wrapping_mul(2217); |
| 65 | |
| 66 | let t2 = p1 + p3 * -7567; |
| 67 | let t3 = p1 + p2 * 3135; |
| 68 | |
| 69 | let p2 = in_vector[ptr]; |
| 70 | let p3 = in_vector[32 + ptr]; |
| 71 | let t0 = fsh(p2 + p3); |
| 72 | let t1 = fsh(p2 - p3); |
| 73 | |
| 74 | let x0 = t0 + t3 + 512; |
| 75 | let x3 = t0 - t3 + 512; |
| 76 | let x1 = t1 + t2 + 512; |
| 77 | let x2 = t1 - t2 + 512; |
| 78 | |
| 79 | // odd part |
| 80 | let mut t0 = in_vector[ptr + 56]; |
| 81 | let mut t1 = in_vector[ptr + 40]; |
| 82 | let mut t2 = in_vector[ptr + 24]; |
| 83 | let mut t3 = in_vector[ptr + 8]; |
| 84 | |
| 85 | let p3 = t0 + t2; |
| 86 | let p4 = t1 + t3; |
| 87 | let p1 = t0 + t3; |
| 88 | let p2 = t1 + t2; |
| 89 | let p5 = (p3 + p4) * 4816; |
| 90 | |
| 91 | t0 *= 1223; |
| 92 | t1 *= 8410; |
| 93 | t2 *= 12586; |
| 94 | t3 *= 6149; |
| 95 | |
| 96 | let p1 = p5 + p1 * -3685; |
| 97 | let p2 = p5 + p2 * -10497; |
| 98 | let p3 = p3 * -8034; |
| 99 | let p4 = p4 * -1597; |
| 100 | |
| 101 | t3 += p1 + p4; |
| 102 | t2 += p2 + p3; |
| 103 | t1 += p2 + p4; |
| 104 | t0 += p1 + p3; |
| 105 | |
| 106 | // constants scaled things up by 1<<12; let's bring them back |
| 107 | // down, but keep 2 extra bits of precision |
| 108 | in_vector[ptr] = (x0 + t3) >> 10; |
| 109 | in_vector[ptr + 8] = (x1 + t2) >> 10; |
| 110 | in_vector[ptr + 16] = (x2 + t1) >> 10; |
| 111 | in_vector[ptr + 24] = (x3 + t0) >> 10; |
| 112 | in_vector[ptr + 32] = (x3 - t0) >> 10; |
| 113 | in_vector[ptr + 40] = (x2 - t1) >> 10; |
| 114 | in_vector[ptr + 48] = (x1 - t2) >> 10; |
| 115 | in_vector[ptr + 56] = (x0 - t3) >> 10; |
| 116 | } |
| 117 | |
| 118 | // This is vectorised in architectures supporting SSE 4.1 |
| 119 | while i < 64 { |
| 120 | // We won't try to short circuit here because it rarely works |
| 121 | |
| 122 | // Even part |
| 123 | let p2 = in_vector[i + 2]; |
| 124 | let p3 = in_vector[i + 6]; |
| 125 | |
| 126 | let p1 = (p2 + p3) * 2217; |
| 127 | let t2 = p1 + p3 * -7567; |
| 128 | let t3 = p1 + p2 * 3135; |
| 129 | |
| 130 | let p2 = in_vector[i]; |
| 131 | let p3 = in_vector[i + 4]; |
| 132 | |
| 133 | let t0 = fsh(p2 + p3); |
| 134 | let t1 = fsh(p2 - p3); |
| 135 | // constants scaled things up by 1<<12, plus we had 1<<2 from first |
| 136 | // loop, plus horizontal and vertical each scale by sqrt(8) so together |
| 137 | // we've got an extra 1<<3, so 1<<17 total we need to remove. |
| 138 | // so we want to round that, which means adding 0.5 * 1<<17, |
| 139 | // aka 65536. Also, we'll end up with -128 to 127 that we want |
| 140 | // to encode as 0..255 by adding 128, so we'll add that before the shift |
| 141 | let x0 = t0 + t3 + SCALE_BITS; |
| 142 | let x3 = t0 - t3 + SCALE_BITS; |
| 143 | let x1 = t1 + t2 + SCALE_BITS; |
| 144 | let x2 = t1 - t2 + SCALE_BITS; |
| 145 | // odd part |
| 146 | let mut t0 = in_vector[i + 7]; |
| 147 | let mut t1 = in_vector[i + 5]; |
| 148 | let mut t2 = in_vector[i + 3]; |
| 149 | let mut t3 = in_vector[i + 1]; |
| 150 | |
| 151 | let p3 = t0 + t2; |
| 152 | let p4 = t1 + t3; |
| 153 | let p1 = t0 + t3; |
| 154 | let p2 = t1 + t2; |
| 155 | let p5 = (p3 + p4) * f2f(1.175875602); |
| 156 | |
| 157 | t0 = t0.wrapping_mul(1223); |
| 158 | t1 = t1.wrapping_mul(8410); |
| 159 | t2 = t2.wrapping_mul(12586); |
| 160 | t3 = t3.wrapping_mul(6149); |
| 161 | |
| 162 | let p1 = p5 + p1 * -3685; |
| 163 | let p2 = p5 + p2 * -10497; |
| 164 | let p3 = p3 * -8034; |
| 165 | let p4 = p4 * -1597; |
| 166 | |
| 167 | t3 += p1 + p4; |
| 168 | t2 += p2 + p3; |
| 169 | t1 += p2 + p4; |
| 170 | t0 += p1 + p3; |
| 171 | |
| 172 | let out: &mut [i16; 8] = out_vector |
| 173 | .get_mut(pos..pos + 8) |
| 174 | .unwrap() |
| 175 | .try_into() |
| 176 | .unwrap(); |
| 177 | |
| 178 | out[0] = clamp((x0 + t3) >> 17); |
| 179 | out[1] = clamp((x1 + t2) >> 17); |
| 180 | out[2] = clamp((x2 + t1) >> 17); |
| 181 | out[3] = clamp((x3 + t0) >> 17); |
| 182 | out[4] = clamp((x3 - t0) >> 17); |
| 183 | out[5] = clamp((x2 - t1) >> 17); |
| 184 | out[6] = clamp((x1 - t2) >> 17); |
| 185 | out[7] = clamp((x0 - t3) >> 17); |
| 186 | |
| 187 | i += 8; |
| 188 | |
| 189 | pos += stride; |
| 190 | } |
| 191 | } |
| 192 | } |
| 193 | |
/// Convert a floating-point constant to fixed point with 12 fractional bits.
///
/// The value is scaled by 4096 (`1 << 12`) and 0.5 is added before the
/// truncating cast, which rounds non-negative inputs to the nearest integer.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    let scaled = x * 4096.0;
    (scaled + 0.5) as i32
}

/// Scale an integer into the 12-bit fixed-point domain, i.e. multiply it by
/// 4096 with a left shift (high bits that do not fit are discarded).
#[inline]
fn fsh(x: i32) -> i32 {
    const FRACTION_BITS: u32 = 12;
    x << FRACTION_BITS
}

/// Clamp a value to the range `0..=255` and narrow it to `i16`.
///
/// The clamp happens on the `i32` first, so the narrowing cast can never
/// truncate.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    a.clamp(0, 255) as i16
}