#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn idct8(data: &mut [__m128i; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    let p2 = data[2];
    let p3 = data[6];
    let p1 = _mm_mulhrs_epi16(_mm_adds_epi16(p2, p3), _mm_set1_epi16(17734)); // 0.5411961
    let t2 = _mm_subs_epi16(
        _mm_subs_epi16(p1, p3),
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(27779)), // 0.847759065
    );
    let t3 = _mm_adds_epi16(p1, _mm_mulhrs_epi16(p2, _mm_set1_epi16(25079))); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = _mm_adds_epi16(p2, p3);
    let t1 = _mm_subs_epi16(p2, p3);

    let x0 = _mm_adds_epi16(t0, t3);
    let x3 = _mm_subs_epi16(t0, t3);
    let x1 = _mm_adds_epi16(t1, t2);
    let x2 = _mm_subs_epi16(t1, t2);

    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = _mm_adds_epi16(t0, t2);
    let p4 = _mm_adds_epi16(t1, t3);
    let p1 = _mm_adds_epi16(t0, t3);
    let p2 = _mm_adds_epi16(t1, t2);
    let p5 = _mm_adds_epi16(p3, p4);
    let p5 = _mm_adds_epi16(p5, _mm_mulhrs_epi16(p5, _mm_set1_epi16(5763))); // 0.175875602

    let t0 = _mm_mulhrs_epi16(t0, _mm_set1_epi16(9786)); // 0.298631336
    let t1 = _mm_adds_epi16(
        _mm_adds_epi16(t1, t1),
        _mm_mulhrs_epi16(t1, _mm_set1_epi16(1741)), // 0.053119869
    );
    let t2 = _mm_adds_epi16(
        _mm_adds_epi16(t2, _mm_adds_epi16(t2, t2)),
        _mm_mulhrs_epi16(t2, _mm_set1_epi16(2383)), // 0.072711026
    );
    let t3 = _mm_adds_epi16(t3, _mm_mulhrs_epi16(t3, _mm_set1_epi16(16427))); // 0.501321110

    let p1 = _mm_subs_epi16(p5, _mm_mulhrs_epi16(p1, _mm_set1_epi16(29490))); // 0.899976223
    let p2 = _mm_subs_epi16(
        _mm_subs_epi16(_mm_subs_epi16(p5, p2), p2),
        _mm_mulhrs_epi16(p2, _mm_set1_epi16(18446)), // 0.562915447
    );

    let p3 = _mm_subs_epi16(
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(-31509)), // -0.961570560
        p3,
    );
    let p4 = _mm_mulhrs_epi16(p4, _mm_set1_epi16(-12785)); // -0.390180644

    let t3 = _mm_adds_epi16(_mm_adds_epi16(p1, p4), t3);
    let t2 = _mm_adds_epi16(_mm_adds_epi16(p2, p3), t2);
    let t1 = _mm_adds_epi16(_mm_adds_epi16(p2, p4), t1);
    let t0 = _mm_adds_epi16(_mm_adds_epi16(p1, p3), t0);

    data[0] = _mm_adds_epi16(x0, t3);
    data[7] = _mm_subs_epi16(x0, t3);
    data[1] = _mm_adds_epi16(x1, t2);
    data[6] = _mm_subs_epi16(x1, t2);
    data[2] = _mm_adds_epi16(x2, t1);
    data[5] = _mm_subs_epi16(x2, t1);
    data[3] = _mm_adds_epi16(x3, t0);
    data[4] = _mm_subs_epi16(x3, t0);
}
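
// A minimal scalar model (a sketch for illustration only; the helper and test names are
// illustrative and not used by the SIMD path above) of what _mm_mulhrs_epi16 computes per lane:
// multiply, then scale back down by 1 << 15 with rounding. This is why the constants in idct8
// are the fractional parts scaled up by 1 << 15, e.g. 17734 ~= 0.5411961 * (1 << 15).
#[cfg(test)]
fn scalar_mulhrs(a: i16, b: i16) -> i16 {
    ((i32::from(a) * i32::from(b) + (1 << 14)) >> 15) as i16
}

#[cfg(test)]
#[test]
fn scalar_mulhrs_approximates_fractional_scaling() {
    // Multiplying by 17734 should approximate scaling by 0.5411961 to within one unit.
    let x: i16 = 1000;
    let approx = i32::from(scalar_mulhrs(x, 17734));
    let exact = (f64::from(x) * 0.5411961).round() as i32;
    assert!((approx - exact).abs() <= 1);
}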

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn transpose8(data: &mut [__m128i; 8]) {
    // Transpose an 8x8 matrix with a sequence of interleaving operations.
    // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
    // A0 B0 A1 B1 ...
    // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
    // A0 B0 C0 D0 A1 B1 C1 D1 ...
    let d01l = _mm_unpacklo_epi16(data[0], data[1]);
    let d23l = _mm_unpacklo_epi16(data[2], data[3]);
    let d45l = _mm_unpacklo_epi16(data[4], data[5]);
    let d67l = _mm_unpacklo_epi16(data[6], data[7]);
    let d01h = _mm_unpackhi_epi16(data[0], data[1]);
    let d23h = _mm_unpackhi_epi16(data[2], data[3]);
    let d45h = _mm_unpackhi_epi16(data[4], data[5]);
    let d67h = _mm_unpackhi_epi16(data[6], data[7]);
    // Operating on 32-bits will interleave *consecutive pairs* of 16-bit integers.
    let d0123ll = _mm_unpacklo_epi32(d01l, d23l);
    let d0123lh = _mm_unpackhi_epi32(d01l, d23l);
    let d4567ll = _mm_unpacklo_epi32(d45l, d67l);
    let d4567lh = _mm_unpackhi_epi32(d45l, d67l);
    let d0123hl = _mm_unpacklo_epi32(d01h, d23h);
    let d0123hh = _mm_unpackhi_epi32(d01h, d23h);
    let d4567hl = _mm_unpacklo_epi32(d45h, d67h);
    let d4567hh = _mm_unpackhi_epi32(d45h, d67h);
    // Operating on 64-bits will interleave *consecutive quadruples* of 16-bit integers.
    data[0] = _mm_unpacklo_epi64(d0123ll, d4567ll);
    data[1] = _mm_unpackhi_epi64(d0123ll, d4567ll);
    data[2] = _mm_unpacklo_epi64(d0123lh, d4567lh);
    data[3] = _mm_unpackhi_epi64(d0123lh, d4567lh);
    data[4] = _mm_unpacklo_epi64(d0123hl, d4567hl);
    data[5] = _mm_unpackhi_epi64(d0123hl, d4567hl);
    data[6] = _mm_unpacklo_epi64(d0123hh, d4567hh);
    data[7] = _mm_unpackhi_epi64(d0123hh, d4567hh);
}
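
// A small sanity check (a sketch only; the test name is illustrative): transpose8 should move
// the element at (row, col) to (col, row), which is easy to verify against the obvious scalar
// definition of a transpose. The runtime feature check is needed because transpose8 is compiled
// with #[target_feature].
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
#[test]
fn transpose8_matches_scalar_transpose() {
    if !is_x86_feature_detected!("ssse3") {
        return;
    }
    // Fill an 8x8 matrix with distinct values so every position is distinguishable.
    let mut rows = [[0i16; 8]; 8];
    for (r, row) in rows.iter_mut().enumerate() {
        for (c, v) in row.iter_mut().enumerate() {
            *v = (r * 8 + c) as i16;
        }
    }
    unsafe {
        let mut vecs = [_mm_setzero_si128(); 8];
        for r in 0..8 {
            vecs[r] = _mm_loadu_si128(rows[r].as_ptr() as *const _);
        }
        transpose8(&mut vecs);
        for (r, vec) in vecs.iter().enumerate() {
            let mut out = [0i16; 8];
            _mm_storeu_si128(out.as_mut_ptr() as *mut _, *vec);
            for (c, &v) in out.iter().enumerate() {
                // Element (r, c) of the transposed matrix is element (c, r) of the original.
                assert_eq!(v, rows[c][r]);
            }
        }
    }
}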

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
    // and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    const SHIFT: i32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [_mm_setzero_si128(); 8];
    for (i, item) in data.iter_mut().enumerate() {
        *item = _mm_slli_epi16(
            _mm_mullo_epi16(
                _mm_loadu_si128(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
                _mm_loadu_si128(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
            ),
            SHIFT,
        );
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for (i, item) in data.iter_mut().enumerate() {
        let mut buf = [0u8; 16];
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want a rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;

        let data_with_offset = _mm_adds_epi16(*item, _mm_set1_epi16(OFFSET + ROUNDING_BIAS));

        _mm_storeu_si128(
            buf.as_mut_ptr() as *mut _,
            _mm_packus_epi16(
                _mm_srai_epi16(data_with_offset, SHIFT + 3),
                _mm_setzero_si128(),
            ),
        );
        std::ptr::copy_nonoverlapping::<u8>(
            buf.as_ptr(),
            output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
            8,
        );
    }
}
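
// A small sanity check (a sketch only; the test name and the 8-byte line stride are illustrative
// choices): an all-zero coefficient block should decode to a flat block of 128s, since the IDCT
// of all zeros is all zeros and the output is re-centered around 128 before being packed into u8.
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
#[test]
fn zero_coefficients_decode_to_flat_128_block() {
    if !is_x86_feature_detected!("ssse3") {
        return;
    }
    let coefficients = [0i16; 64];
    let quantization_table = [1u16; 64];
    let mut output = [0u8; 64];
    unsafe {
        dequantize_and_idct_block_8x8(&coefficients, &quantization_table, 8, &mut output);
    }
    assert!(output.iter().all(|&v| v == 128));
}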

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    // _mm_loadu_si64 generates incorrect code for Rust <1.58. To circumvent this, we use a full
    // 128-bit load, but that requires leaving an extra vector's worth of pixels as a border for
    // the scalar code. From Rust 1.58 on, the _mm_loadu_si128 can be replaced with _mm_loadu_si64
    // and this .saturating_sub() can be removed.
    let num_vecs = (num / 8).saturating_sub(1);

    for i in 0..num_vecs {
        const SHIFT: i32 = 6;
        // Load.
        let y = _mm_loadu_si128(y.as_ptr().wrapping_add(i * 8) as *const _);
        let cb = _mm_loadu_si128(cb.as_ptr().wrapping_add(i * 8) as *const _);
        let cr = _mm_loadu_si128(cr.as_ptr().wrapping_add(i * 8) as *const _);

        // Convert to 16 bit.
        let shuf16 = _mm_setr_epi8(
            0, -0x7F, 1, -0x7F, 2, -0x7F, 3, -0x7F, 4, -0x7F, 5, -0x7F, 6, -0x7F, 7, -0x7F,
        );
        let y = _mm_slli_epi16(_mm_shuffle_epi8(y, shuf16), SHIFT);
        let cb = _mm_slli_epi16(_mm_shuffle_epi8(cb, shuf16), SHIFT);
        let cr = _mm_slli_epi16(_mm_shuffle_epi8(cr, shuf16), SHIFT);

        // Add offsets.
        let c128 = _mm_set1_epi16(128 << SHIFT);
        let y = _mm_adds_epi16(y, _mm_set1_epi16((1 << SHIFT) >> 1));
        let cb = _mm_subs_epi16(cb, c128);
        let cr = _mm_subs_epi16(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772.
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr, _mm_set1_epi16(13173)), cr);
        let cb_034414 = _mm_mulhrs_epi16(cb, _mm_set1_epi16(11276));
        let cr_071414 = _mm_mulhrs_epi16(cr, _mm_set1_epi16(23401));
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb, _mm_set1_epi16(25297)), cb);

        // Last conversion step.
        let r = _mm_adds_epi16(y, cr_140200);
        let g = _mm_subs_epi16(y, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y, cb_177200);

        // Shift back and convert to u8.
        let zero = _mm_setzero_si128();
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...

        // Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
        // after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
        // be OR-ed together.
        let shufr = _mm_setr_epi8(
            0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
        );
        let shufg = _mm_setr_epi8(
            -0x7F, 0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F,
            -0x7F,
        );
        let shufb = _mm_alignr_epi8(shufg, shufg, 15);

        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );

        // For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
        // 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
        // still be 0.
        let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
        let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
        let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        let mut data = [0u8; 32];
        _mm_storeu_si128(data.as_mut_ptr() as *mut _, rgb_low);
        _mm_storeu_si128(data.as_mut_ptr().wrapping_add(16) as *mut _, rgb_hi);
        std::ptr::copy_nonoverlapping::<u8>(
            data.as_ptr(),
            output.as_mut_ptr().wrapping_add(24 * i),
            24,
        );
    }

    num_vecs * 8
}
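
// A small sanity check (a sketch only; the test name and input lengths are illustrative): with
// Cb = Cr = 128 the chroma contributions vanish, so every converted RGB triple should equal the
// corresponding luma value. Only the pixel count returned by the function is checked, since the
// SIMD path deliberately leaves a tail of pixels for the scalar fallback.
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
#[test]
fn grey_ycbcr_converts_to_grey_rgb() {
    if !is_x86_feature_detected!("ssse3") {
        return;
    }
    let y: Vec<u8> = (0u8..16).map(|i| i * 16).collect();
    let cb = [128u8; 16];
    let cr = [128u8; 16];
    let mut output = [0u8; 48];
    let converted = unsafe { color_convert_line_ycbcr(&y, &cb, &cr, &mut output) };
    for i in 0..converted {
        assert_eq!(output[3 * i], y[i]);
        assert_eq!(output[3 * i + 1], y[i]);
        assert_eq!(output[3 * i + 2], y[i]);
    }
}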