ssse3.rs source code [crates/jpeg_decoder/src/arch/ssse3.rs]

1	#[cfg(target_arch = "x86")]
2	use std::arch::x86::*;
3	#[cfg(target_arch = "x86_64")]
4	use std::arch::x86_64::*;
5
/// Runs the 8-point inverse DCT butterfly over `data`, in place.
///
/// Every operation is lane-wise on eight `i16` lanes, so the eight lane positions are
/// transformed independently and in parallel, with `data[0]..data[7]` acting as the eight
/// inputs (and afterwards the eight outputs) of each transform. Additions and subtractions
/// saturate, and no further scaling is applied to the result.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSSE3.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn idct8(data: &mut [__m128i; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    // Even-indexed inputs: 2 and 6 first, then 0 and 4.
    let p2 = data[2];
    let p3 = data[6];
    let p1 = _mm_mulhrs_epi16(_mm_adds_epi16(p2, p3), _mm_set1_epi16(17734)); // 0.5411961
    let t2 = _mm_subs_epi16(
        _mm_subs_epi16(p1, p3),
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(27779)), // 0.847759065
    );
    let t3 = _mm_adds_epi16(p1, _mm_mulhrs_epi16(p2, _mm_set1_epi16(25079))); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = _mm_adds_epi16(p2, p3);
    let t1 = _mm_subs_epi16(p2, p3);

    let x0 = _mm_adds_epi16(t0, t3);
    let x3 = _mm_subs_epi16(t0, t3);
    let x1 = _mm_adds_epi16(t1, t2);
    let x2 = _mm_subs_epi16(t1, t2);

    // Odd-indexed inputs.
    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = _mm_adds_epi16(t0, t2);
    let p4 = _mm_adds_epi16(t1, t3);
    let p1 = _mm_adds_epi16(t0, t3);
    let p2 = _mm_adds_epi16(t1, t2);
    let p5 = _mm_adds_epi16(p3, p4);
    // Multipliers above 1 are applied as "x + x * fractional part", because only the fractional
    // part fits the 1<<15 fixed-point scale used by _mm_mulhrs_epi16.
    let p5 = _mm_adds_epi16(p5, _mm_mulhrs_epi16(p5, _mm_set1_epi16(5763))); // 0.175875602

    let t0 = _mm_mulhrs_epi16(t0, _mm_set1_epi16(9786)); // 0.298631336
    let t1 = _mm_adds_epi16(
        _mm_adds_epi16(t1, t1),
        _mm_mulhrs_epi16(t1, _mm_set1_epi16(1741)), // 0.053119869
    );
    let t2 = _mm_adds_epi16(
        _mm_adds_epi16(t2, _mm_adds_epi16(t2, t2)),
        _mm_mulhrs_epi16(t2, _mm_set1_epi16(2383)), // 0.072711026
    );
    let t3 = _mm_adds_epi16(t3, _mm_mulhrs_epi16(t3, _mm_set1_epi16(16427))); // 0.501321110

    let p1 = _mm_subs_epi16(p5, _mm_mulhrs_epi16(p1, _mm_set1_epi16(29490))); // 0.899976223
    let p2 = _mm_subs_epi16(
        _mm_subs_epi16(_mm_subs_epi16(p5, p2), p2),
        _mm_mulhrs_epi16(p2, _mm_set1_epi16(18446)), // 0.562915447
    );

    let p3 = _mm_subs_epi16(
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(-31509)), // -0.961570560
        p3,
    );
    let p4 = _mm_mulhrs_epi16(p4, _mm_set1_epi16(-12785)); // -0.390180644

    let t3 = _mm_adds_epi16(_mm_adds_epi16(p1, p4), t3);
    let t2 = _mm_adds_epi16(_mm_adds_epi16(p2, p3), t2);
    let t1 = _mm_adds_epi16(_mm_adds_epi16(p2, p4), t1);
    let t0 = _mm_adds_epi16(_mm_adds_epi16(p1, p3), t0);

    // Final butterfly: outputs k and 7 - k are the sum and difference of the same pair.
    data[0] = _mm_adds_epi16(x0, t3);
    data[7] = _mm_subs_epi16(x0, t3);
    data[1] = _mm_adds_epi16(x1, t2);
    data[6] = _mm_subs_epi16(x1, t2);
    data[2] = _mm_adds_epi16(x2, t1);
    data[5] = _mm_subs_epi16(x2, t1);
    data[3] = _mm_adds_epi16(x3, t0);
    data[4] = _mm_subs_epi16(x3, t0);
}
85
/// Transposes, in place, the 8x8 matrix of `i16` values whose row `r` is stored in `data[r]`.
///
/// After the call, lane `c` of `data[r]` holds what was lane `r` of `data[c]` before the call.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSSE3.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn transpose8(data: &mut [__m128i; 8]) {
    // Transpose a 8x8 matrix with a sequence of interleaving operations.
    // Naming: dABl contains elements from the *lower* halves of vectors A and B, interleaved, i.e.
    // A0 B0 A1 B1 ...
    // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
    // A0 B0 C0 D0 A1 B1 C1 D1 ...
    let d01l = _mm_unpacklo_epi16(data[0], data[1]);
    let d23l = _mm_unpacklo_epi16(data[2], data[3]);
    let d45l = _mm_unpacklo_epi16(data[4], data[5]);
    let d67l = _mm_unpacklo_epi16(data[6], data[7]);
    let d01h = _mm_unpackhi_epi16(data[0], data[1]);
    let d23h = _mm_unpackhi_epi16(data[2], data[3]);
    let d45h = _mm_unpackhi_epi16(data[4], data[5]);
    let d67h = _mm_unpackhi_epi16(data[6], data[7]);
    // Operating on 32-bits will interleave *consecutive pairs* of 16-bit integers.
    let d0123ll = _mm_unpacklo_epi32(d01l, d23l);
    let d0123lh = _mm_unpackhi_epi32(d01l, d23l);
    let d4567ll = _mm_unpacklo_epi32(d45l, d67l);
    let d4567lh = _mm_unpackhi_epi32(d45l, d67l);
    let d0123hl = _mm_unpacklo_epi32(d01h, d23h);
    let d0123hh = _mm_unpackhi_epi32(d01h, d23h);
    let d4567hl = _mm_unpacklo_epi32(d45h, d67h);
    let d4567hh = _mm_unpackhi_epi32(d45h, d67h);
    // Operating on 64-bits will interleave *consecutive quadruples* of 16-bit integers.
    data[0] = _mm_unpacklo_epi64(d0123ll, d4567ll);
    data[1] = _mm_unpackhi_epi64(d0123ll, d4567ll);
    data[2] = _mm_unpacklo_epi64(d0123lh, d4567lh);
    data[3] = _mm_unpackhi_epi64(d0123lh, d4567lh);
    data[4] = _mm_unpacklo_epi64(d0123hl, d4567hl);
    data[5] = _mm_unpackhi_epi64(d0123hl, d4567hl);
    data[6] = _mm_unpacklo_epi64(d0123hh, d4567hh);
    data[7] = _mm_unpackhi_epi64(d0123hh, d4567hh);
}
121
122	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
123	#[target_feature(enable = "ssse3")]
124	pub unsafe fn dequantize_and_idct_block_8x8(
125	coefficients: &[i16; `64`],
126	quantization_table: &[u16; `64`],
127	output_linestride: usize,
128	output: &mut [u8],
129	) {
130	// The loop below will write to positions [output_linestride i, output_linestride * i + 8)*
131	// for 0<=i<8. Thus, the last accessed position is at an offset of output_linestrade 7 + 7,*
132	// and if that position is in-bounds, so are all other accesses.
133	assert!(
134	output.len()
135	> output_linestride
136	.checked_mul(`7`)
137	.unwrap()
138	.checked_add(`7`)
139	.unwrap()
140	);
141
142	#[cfg(target_arch = "x86")]
143	use std::arch::x86::*;
144	#[cfg(target_arch = "x86_64")]
145	use std::arch::x86_64::*;
146
147	const SHIFT: i32 = `3`;
148
149	// Read the DCT coefficients, scale them up and dequantize them.
150	let mut data = [_mm_setzero_si128(); `8`];
151	for (i, item) in data.iter_mut().enumerate() {
152	*item = _mm_slli_epi16(
153	_mm_mullo_epi16(
154	_mm_loadu_si128(coefficients.as_ptr().wrapping_add(i * `8`) as *const _),
155	_mm_loadu_si128(quantization_table.as_ptr().wrapping_add(i * `8`) as *const _),
156	),
157	SHIFT,
158	);
159	}
160
161	// Usual column IDCT - transpose - column IDCT - transpose approach.
162	idct8(&mut data);
163	transpose8(&mut data);
164	idct8(&mut data);
165	transpose8(&mut data);
166
167	for (i, item) in data.iter_mut().enumerate() {
168	let mut buf = [`0u8`; `16`];
169	// The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
170	// increased by 3.
171	// As values will be stored in a u8, they need to be 128-centered and not 0-centered.
172	// We add 128 with the appropriate shift for that purpose.
173	const OFFSET: i16 = `128` << (SHIFT + `3`);
174	// We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
175	const ROUNDING_BIAS: i16 = (`1` << (SHIFT + `3`)) >> `1`;
176
177	let data_with_offset = _mm_adds_epi16(*item, _mm_set1_epi16(OFFSET + ROUNDING_BIAS));
178
179	_mm_storeu_si128(
180	buf.as_mut_ptr() as *mut _,
181	_mm_packus_epi16(
182	_mm_srai_epi16(data_with_offset, SHIFT + `3`),
183	_mm_setzero_si128(),
184	),
185	);
186	std::ptr::copy_nonoverlapping::<u8>(
187	buf.as_ptr(),
188	output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
189	`8`,
190	);
191	}
192	}
193
/// Converts the start of a line of planar YCbCr samples to interleaved RGB.
///
/// The number of pixels in the line is `output.len() / 3`. Pixels are processed in groups of
/// eight, and the last full group is deliberately left unprocessed (see the comment on
/// `num_vecs` below). Pixel `i` of the converted prefix is written to
/// `output[3 * i..3 * i + 3]` as `R, G, B`; the rest of `output` is left untouched.
///
/// Returns the number of pixels that were converted. The remaining pixels are not handled by
/// this function.
///
/// # Panics
///
/// Panics if `output.len()` is not a multiple of 3, or if any of `y`, `cb`, `cr` holds fewer
/// than `output.len() / 3` samples.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports SSSE3.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    // _mm_loadu_si64 generates incorrect code for Rust <1.58. To circumvent this, we use a full
    // 128-bit load, but that requires leaving an extra vector of border to the scalar code.
    // From Rust 1.58 on, the _mm_loadu_si128 can be replaced with _mm_loadu_si64 and this
    // .saturating_sub() can be removed.
    let num_vecs = (num / 8).saturating_sub(1);

    // Number of fractional bits used for the fixed-point arithmetic below.
    const SHIFT: i32 = 6;

    // Everything up to the loop is loop-invariant and therefore built only once.

    // Control vector that zero-extends the low 8 bytes of a vector to 16 bits.
    let shuf16 = _mm_setr_epi8(
        0, -0x7F, 1, -0x7F, 2, -0x7F, 3, -0x7F, 4, -0x7F, 5, -0x7F, 6, -0x7F, 7, -0x7F,
    );

    // Offsets: 128 to center the chroma samples on 0, and 1/2 so that the final right shift
    // rounds instead of truncating.
    let c128 = _mm_set1_epi16(128 << SHIFT);
    let half = _mm_set1_epi16((1 << SHIFT) >> 1);

    // Fractional parts of the conversion factors, scaled by 1<<15 for _mm_mulhrs_epi16.
    let k_140200 = _mm_set1_epi16(13173);
    let k_034414 = _mm_set1_epi16(11276);
    let k_071414 = _mm_set1_epi16(23401);
    let k_177200 = _mm_set1_epi16(25297);

    let zero = _mm_setzero_si128();

    // Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
    // after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
    // be OR-ed together.
    let shufr = _mm_setr_epi8(
        0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
    );
    let shufg = _mm_setr_epi8(
        -0x7F, 0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F,
        -0x7F,
    );
    // Same pattern as shufg, moved one byte further.
    let shufb = _mm_alignr_epi8(shufg, shufg, 15);

    // For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
    // 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
    // still be 0.
    let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
    let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
    let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

    for i in 0..num_vecs {
        // Load. Each load reads 16 bytes starting at pixel i * 8; only the low 8 are used.
        let y = _mm_loadu_si128(y.as_ptr().wrapping_add(i * 8) as *const _);
        let cb = _mm_loadu_si128(cb.as_ptr().wrapping_add(i * 8) as *const _);
        let cr = _mm_loadu_si128(cr.as_ptr().wrapping_add(i * 8) as *const _);

        // Convert to 16 bit.
        let y = _mm_slli_epi16(_mm_shuffle_epi8(y, shuf16), SHIFT);
        let cb = _mm_slli_epi16(_mm_shuffle_epi8(cb, shuf16), SHIFT);
        let cr = _mm_slli_epi16(_mm_shuffle_epi8(cr, shuf16), SHIFT);

        // Add offsets
        let y = _mm_adds_epi16(y, half);
        let cb = _mm_subs_epi16(cb, c128);
        let cr = _mm_subs_epi16(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr, k_140200), cr);
        let cb_034414 = _mm_mulhrs_epi16(cb, k_034414);
        let cr_071414 = _mm_mulhrs_epi16(cr, k_071414);
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb, k_177200), cb);

        // Last conversion step.
        let r = _mm_adds_epi16(y, cr_140200);
        let g = _mm_subs_epi16(y, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y, cb_177200);

        // Shift back and convert to u8.
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );
        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        // Stage both vectors in a scratch buffer and copy out only the 24 bytes (8 pixels * 3
        // channels) that carry data.
        let mut data = [0u8; 32];
        _mm_storeu_si128(data.as_mut_ptr() as *mut _, rgb_low);
        _mm_storeu_si128(data.as_mut_ptr().wrapping_add(16) as *mut _, rgb_hi);
        std::ptr::copy_nonoverlapping::<u8>(
            data.as_ptr(),
            output.as_mut_ptr().wrapping_add(24 * i),
            24,
        );
    }

    num_vecs * 8
}
289