#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn idct8(data: &mut [__m128i; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.
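    //
    // For example, the scalar constant 0.5411961 becomes round(0.5411961 * 32768) = 17734 below,
    // and a constant greater than 1.0 such as 2.053119869 is split into its integer part (applied
    // with plain _mm_adds_epi16 additions) and its fractional part 0.053119869, which becomes
    // _mm_set1_epi16(1741).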

    let p2 = data[2];
    let p3 = data[6];
    let p1 = _mm_mulhrs_epi16(_mm_adds_epi16(p2, p3), _mm_set1_epi16(17734)); // 0.5411961
    let t2 = _mm_subs_epi16(
        _mm_subs_epi16(p1, p3),
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(27779)), // 0.847759065
    );
    let t3 = _mm_adds_epi16(p1, _mm_mulhrs_epi16(p2, _mm_set1_epi16(25079))); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = _mm_adds_epi16(p2, p3);
    let t1 = _mm_subs_epi16(p2, p3);

    let x0 = _mm_adds_epi16(t0, t3);
    let x3 = _mm_subs_epi16(t0, t3);
    let x1 = _mm_adds_epi16(t1, t2);
    let x2 = _mm_subs_epi16(t1, t2);

    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = _mm_adds_epi16(t0, t2);
    let p4 = _mm_adds_epi16(t1, t3);
    let p1 = _mm_adds_epi16(t0, t3);
    let p2 = _mm_adds_epi16(t1, t2);
    let p5 = _mm_adds_epi16(p3, p4);
    let p5 = _mm_adds_epi16(p5, _mm_mulhrs_epi16(p5, _mm_set1_epi16(5763))); // 0.175875602

    let t0 = _mm_mulhrs_epi16(t0, _mm_set1_epi16(9786)); // 0.298631336
    let t1 = _mm_adds_epi16(
        _mm_adds_epi16(t1, t1),
        _mm_mulhrs_epi16(t1, _mm_set1_epi16(1741)), // 0.053119869
    );
    let t2 = _mm_adds_epi16(
        _mm_adds_epi16(t2, _mm_adds_epi16(t2, t2)),
        _mm_mulhrs_epi16(t2, _mm_set1_epi16(2383)), // 0.072711026
    );
    let t3 = _mm_adds_epi16(t3, _mm_mulhrs_epi16(t3, _mm_set1_epi16(16427))); // 0.501321110

    let p1 = _mm_subs_epi16(p5, _mm_mulhrs_epi16(p1, _mm_set1_epi16(29490))); // 0.899976223
    let p2 = _mm_subs_epi16(
        _mm_subs_epi16(_mm_subs_epi16(p5, p2), p2),
        _mm_mulhrs_epi16(p2, _mm_set1_epi16(18446)), // 0.562915447
    );

    let p3 = _mm_subs_epi16(
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(-31509)), // -0.961570560
        p3,
    );
    let p4 = _mm_mulhrs_epi16(p4, _mm_set1_epi16(-12785)); // -0.390180644

    let t3 = _mm_adds_epi16(_mm_adds_epi16(p1, p4), t3);
    let t2 = _mm_adds_epi16(_mm_adds_epi16(p2, p3), t2);
    let t1 = _mm_adds_epi16(_mm_adds_epi16(p2, p4), t1);
    let t0 = _mm_adds_epi16(_mm_adds_epi16(p1, p3), t0);

    data[0] = _mm_adds_epi16(x0, t3);
    data[7] = _mm_subs_epi16(x0, t3);
    data[1] = _mm_adds_epi16(x1, t2);
    data[6] = _mm_subs_epi16(x1, t2);
    data[2] = _mm_adds_epi16(x2, t1);
    data[5] = _mm_subs_epi16(x2, t1);
    data[3] = _mm_adds_epi16(x3, t0);
    data[4] = _mm_subs_epi16(x3, t0);
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn transpose8(data: &mut [__m128i; 8]) {
    // Transpose an 8x8 matrix with a sequence of interleaving operations.
    // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
    // A0 B0 A1 B1 ...
    // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved:
    // A0 B0 C0 D0 A1 B1 C1 D1 ...
    let d01l = _mm_unpacklo_epi16(data[0], data[1]);
    let d23l = _mm_unpacklo_epi16(data[2], data[3]);
    let d45l = _mm_unpacklo_epi16(data[4], data[5]);
    let d67l = _mm_unpacklo_epi16(data[6], data[7]);
    let d01h = _mm_unpackhi_epi16(data[0], data[1]);
    let d23h = _mm_unpackhi_epi16(data[2], data[3]);
    let d45h = _mm_unpackhi_epi16(data[4], data[5]);
    let d67h = _mm_unpackhi_epi16(data[6], data[7]);
    // Operating on 32-bits will interleave *consecutive pairs* of 16-bit integers.
    let d0123ll = _mm_unpacklo_epi32(d01l, d23l);
    let d0123lh = _mm_unpackhi_epi32(d01l, d23l);
    let d4567ll = _mm_unpacklo_epi32(d45l, d67l);
    let d4567lh = _mm_unpackhi_epi32(d45l, d67l);
    let d0123hl = _mm_unpacklo_epi32(d01h, d23h);
    let d0123hh = _mm_unpackhi_epi32(d01h, d23h);
    let d4567hl = _mm_unpacklo_epi32(d45h, d67h);
    let d4567hh = _mm_unpackhi_epi32(d45h, d67h);
    // Operating on 64-bits will interleave *consecutive quadruples* of 16-bit integers.
    data[0] = _mm_unpacklo_epi64(d0123ll, d4567ll);
    data[1] = _mm_unpackhi_epi64(d0123ll, d4567ll);
    data[2] = _mm_unpacklo_epi64(d0123lh, d4567lh);
    data[3] = _mm_unpackhi_epi64(d0123lh, d4567lh);
    data[4] = _mm_unpacklo_epi64(d0123hl, d4567hl);
    data[5] = _mm_unpackhi_epi64(d0123hl, d4567hl);
    data[6] = _mm_unpacklo_epi64(d0123hh, d4567hh);
    data[7] = _mm_unpackhi_epi64(d0123hh, d4567hh);
}

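/// Dequantizes an 8x8 block of DCT coefficients and performs the inverse DCT, writing the
/// result as 8 rows of 8 `u8` samples (level-shifted by +128) into `output`.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSSE3 (e.g. by checking
/// `is_x86_feature_detected!("ssse3")`) and that `output` is large enough for the bounds
/// check below to pass.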
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0 <= i < 8. Thus, the last accessed position is at an offset of
    // output_linestride * 7 + 7, and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

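    // The coefficients are scaled up by 1 << SHIFT here so that the 16-bit fixed-point
    // arithmetic in the IDCT keeps a few extra bits of fractional precision; the output stage
    // below shifts this back out together with the factor of 8 produced by the two IDCT passes.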
    const SHIFT: i32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [_mm_setzero_si128(); 8];
    for (i, item) in data.iter_mut().enumerate() {
        *item = _mm_slli_epi16(
            _mm_mullo_epi16(
                _mm_loadu_si128(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
                _mm_loadu_si128(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
            ),
            SHIFT,
        );
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for (i, item) in data.iter_mut().enumerate() {
        let mut buf = [0u8; 16];
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
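        // With SHIFT = 3, OFFSET is 128 << 6 = 8192 and ROUNDING_BIAS is (1 << 6) / 2 = 32.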

        let data_with_offset = _mm_adds_epi16(*item, _mm_set1_epi16(OFFSET + ROUNDING_BIAS));

        _mm_storeu_si128(
            buf.as_mut_ptr() as *mut _,
            _mm_packus_epi16(
                _mm_srai_epi16(data_with_offset, SHIFT + 3),
                _mm_setzero_si128(),
            ),
        );
        std::ptr::copy_nonoverlapping::<u8>(
            buf.as_ptr(),
            output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
            8,
        );
    }
}

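/// Converts a line of YCbCr samples to interleaved RGB, processing pixels eight at a time and
/// returning the number of pixels converted; the remaining pixels are left for the caller's
/// scalar fallback.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSSE3 (e.g. by checking
/// `is_x86_feature_detected!("ssse3")`).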
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    // _mm_loadu_si64 generates incorrect code for Rust <1.58. To work around this, we use a full
    // 128-bit load, which reads 8 bytes beyond the ones we actually need, so we leave an extra
    // vector's worth of pixels to the scalar code as a safety margin. From Rust 1.58 on, the
    // _mm_loadu_si128 can be replaced with _mm_loadu_si64 and this .saturating_sub() can be
    // removed.
    let num_vecs = (num / 8).saturating_sub(1);
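    // For example, with num = 33 this gives num_vecs = 3, so 24 pixels are converted here and
    // the remaining 9 are left to the scalar code.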

    for i in 0..num_vecs {
        const SHIFT: i32 = 6;
        // Load.
        let y = _mm_loadu_si128(y.as_ptr().wrapping_add(i * 8) as *const _);
        let cb = _mm_loadu_si128(cb.as_ptr().wrapping_add(i * 8) as *const _);
        let cr = _mm_loadu_si128(cr.as_ptr().wrapping_add(i * 8) as *const _);

        // Convert to 16 bit.
        let shuf16 = _mm_setr_epi8(
            0, -0x7F, 1, -0x7F, 2, -0x7F, 3, -0x7F, 4, -0x7F, 5, -0x7F, 6, -0x7F, 7, -0x7F,
        );
        let y = _mm_slli_epi16(_mm_shuffle_epi8(y, shuf16), SHIFT);
        let cb = _mm_slli_epi16(_mm_shuffle_epi8(cb, shuf16), SHIFT);
        let cr = _mm_slli_epi16(_mm_shuffle_epi8(cr, shuf16), SHIFT);

        // Add offsets
        let c128 = _mm_set1_epi16(128 << SHIFT);
        let y = _mm_adds_epi16(y, _mm_set1_epi16((1 << SHIFT) >> 1));
        let cb = _mm_subs_epi16(cb, c128);
        let cr = _mm_subs_epi16(cr, c128);

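        // These correspond to the JFIF YCbCr -> RGB equations
        //   R = Y + 1.402 * (Cr - 128)
        //   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
        //   B = Y + 1.772 * (Cb - 128)
        // with the 128 offsets already subtracted from cb and cr above.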
        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr, _mm_set1_epi16(13173)), cr);
        let cb_034414 = _mm_mulhrs_epi16(cb, _mm_set1_epi16(11276));
        let cr_071414 = _mm_mulhrs_epi16(cr, _mm_set1_epi16(23401));
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb, _mm_set1_epi16(25297)), cb);

        // Last conversion step.
        let r = _mm_adds_epi16(y, cr_140200);
        let g = _mm_subs_epi16(y, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y, cb_177200);

        // Shift back and convert to u8.
        let zero = _mm_setzero_si128();
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...

        // Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
        // after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
        // be OR-ed together.
        let shufr = _mm_setr_epi8(
            0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
        );
        let shufg = _mm_setr_epi8(
            -0x7F, 0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F,
            -0x7F,
        );
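        // shufb is shufg rotated by one byte, placing the B samples one position after the
        // G samples.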
        let shufb = _mm_alignr_epi8(shufg, shufg, 15);

        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );

        // For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
        // 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
        // still be 0.
        let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
        let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
        let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        let mut data = [0u8; 32];
        _mm_storeu_si128(data.as_mut_ptr() as *mut _, rgb_low);
        _mm_storeu_si128(data.as_mut_ptr().wrapping_add(16) as *mut _, rgb_hi);
        std::ptr::copy_nonoverlapping::<u8>(
            data.as_ptr(),
            output.as_mut_ptr().wrapping_add(24 * i),
            24,
        );
    }

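    // Return how many pixels were converted; the caller is expected to handle the rest with
    // scalar code.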
    num_vecs * 8
}
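
// A minimal sanity-check sketch for the transpose helper, added here as an illustration rather
// than taken from the original test suite: transposing an 8x8 matrix twice must reproduce the
// input. It assumes std is available for runtime feature detection.
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
mod transpose_sanity_tests {
    use super::*;

    #[test]
    fn transpose8_twice_is_identity() {
        if !std::is_x86_feature_detected!("ssse3") {
            return;
        }
        // Fill an 8x8 i16 matrix with the values 0..64.
        let mut src = [0i16; 64];
        for (i, v) in src.iter_mut().enumerate() {
            *v = i as i16;
        }
        // Safety: we just checked that SSSE3 (and hence SSE2) is available.
        unsafe {
            let mut data = [_mm_setzero_si128(); 8];
            for (i, row) in data.iter_mut().enumerate() {
                *row = _mm_loadu_si128(src.as_ptr().add(i * 8) as *const _);
            }
            transpose8(&mut data);
            transpose8(&mut data);
            let mut out = [0i16; 64];
            for (i, row) in data.iter().enumerate() {
                _mm_storeu_si128(out.as_mut_ptr().add(i * 8) as *mut _, *row);
            }
            assert_eq!(src, out);
        }
    }
}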