/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it and/or modify it under the terms of the MIT, Apache-2.0 or Zlib license.
 */

#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
//! AVX optimised IDCT.
//!
//! Okay, not *that* optimised.
//!
//! # The implementation
//! The implementation is neatly broken down into two operations.
//!
//! 1. Test for zeroes.
//!    There is a shortcut for the IDCT when all AC values are zero: we get the answer
//!    really quickly by scaling the DC coefficient of the block (1/8th of it) to the
//!    whole block and level shifting.
//!
//! 2. If the above fails, we carry out the IDCT as a two-pass, one-dimensional algorithm.
//!    It does two whole scans, applying the 1D IDCT to all items. After each scan, the
//!    data is transposed in registers (thank you, x86 SIMD powers) and the second pass
//!    is carried out.
//!
//! The code is not super optimised, but it produces bit-identical results with the scalar
//! code, and it also has the advantage of making this implementation easy to maintain.
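//!
//! # Usage sketch
//! A minimal, hypothetical call site (names are placeholders; see the safety notes on
//! [`idct_avx2`] for the required CPU-feature check):
//!
//! ```rust,ignore
//! // 64 dequantized coefficients of one 8x8 block, kept in i32 precision.
//! let mut coeff = [0_i32; 64];
//! // Output pixels; `stride` is the distance between the starts of successive rows.
//! let mut pixels = [0_i16; 64];
//! idct_avx2(&mut coeff, &mut pixels, 8);
//! ```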

#![cfg(feature = "x86")]
#![allow(dead_code)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::unsafe_utils::{transpose, YmmRegister};

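// Rounding / level-shift bias for the second (column) IDCT pass, which shifts right by 17
// (see `dct_pass!` below): 65536 is the 0.5 * (1 << 17) rounding term, and `128 << 17`
// folds the +128 level shift into the same addition.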
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);

/// SAFETY
/// ------
///
/// It is the responsibility of the CALLER to ensure that this function is
/// only called in contexts where the CPU supports AVX2.
///
/// For documentation see the module docs.
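///
/// For example, a caller might gate the call on a runtime feature check (sketch only;
/// `coeff`, `pixels` and `stride` are placeholders):
///
/// ```rust,ignore
/// if is_x86_feature_detected!("avx2") {
///     idct_avx2(&mut coeff, &mut pixels, stride);
/// } else {
///     // fall back to the scalar IDCT
/// }
/// ```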
pub fn idct_avx2(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    unsafe {
        // We don't call the inner function directly because we need to flag it
        // with #[target_feature] so that the compiler doesn't do weird stuff with
        // it.
        idct_int_avx2_inner(in_vector, out_vector, stride);
    }
}

#[target_feature(enable = "avx2")]
#[allow(
    clippy::too_many_lines,
    clippy::cast_possible_truncation,
    clippy::similar_names,
    clippy::op_ref,
    unused_assignments,
    clippy::zero_prefixed_literal
)]
pub unsafe fn idct_int_avx2_inner(
    in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
) {
    let mut pos = 0;

    // load into registers
    //
    // We sign-extend the i16's to i32's and calculate with extended precision, then
    // reduce the values back to i16's when we are done carrying out the IDCT.

    let rw0 = _mm256_loadu_si256(in_vector[00..].as_ptr().cast());
    let rw1 = _mm256_loadu_si256(in_vector[08..].as_ptr().cast());
    let rw2 = _mm256_loadu_si256(in_vector[16..].as_ptr().cast());
    let rw3 = _mm256_loadu_si256(in_vector[24..].as_ptr().cast());
    let rw4 = _mm256_loadu_si256(in_vector[32..].as_ptr().cast());
    let rw5 = _mm256_loadu_si256(in_vector[40..].as_ptr().cast());
    let rw6 = _mm256_loadu_si256(in_vector[48..].as_ptr().cast());
    let rw7 = _mm256_loadu_si256(in_vector[56..].as_ptr().cast());

    // Forward DCT and quantization may cause all the AC terms to be zero; for such
    // cases we can try to accelerate it.
    //
    // The point is that whenever the array has 63 zeroes (all the AC terms), its IDCT is
    // simply (arr[0] >> 3), i.e. (arr[0] / 8), propagated to all the elements.
    // We first test whether the AC terms are all zero, and if they are, we go the
    // short way.
    //
    // This reduces IDCT overhead from about 39% to 18%, almost half.
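    //
    // A scalar sketch of that shortcut (illustration only; it assumes `out_vector` holds
    // at least 8 rows of `stride` elements):
    //
    //     let dc = ((in_vector[0] >> 3) + 128).clamp(0, 255) as i16;
    //     for row in out_vector.chunks_mut(stride).take(8) {
    //         row[..8].fill(dc);
    //     }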

    // Do another load for the first row, we don't want to check the DC value, because
    // we only care about the AC terms
    let rw8 = _mm256_loadu_si256(in_vector[1..].as_ptr().cast());

    let zero = _mm256_setzero_si256();

    let mut non_zero = 0;

    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw8, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw1, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw2, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw3, zero));

    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw4, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw5, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw6, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw7, zero));

    if non_zero == -8 {
        // All AC terms are zero: each `_mm256_movemask_epi8` above returns -1 (all bits
        // set) only when its whole register compares equal to zero, so eight all-zero
        // registers sum to -8.
        //
        // The IDCT of the block is then (coeff[0] * qt[0]) / 8 + 128 (the level-shift
        // bias), clamped to 255, replicated over the whole block.
        let idct_value = _mm_set1_epi16(((in_vector[0] >> 3) + 128).clamp(0, 255) as i16);

        macro_rules! store {
            ($pos:tt,$value:tt) => {
                // store
                _mm_storeu_si128(
                    out_vector
                        .get_mut($pos..$pos + 8)
                        .unwrap()
                        .as_mut_ptr()
                        .cast(),
                    $value
                );
                $pos += stride;
            };
        }
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);

        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);

        return;
    }

    let mut row0 = YmmRegister { mm256: rw0 };
    let mut row1 = YmmRegister { mm256: rw1 };
    let mut row2 = YmmRegister { mm256: rw2 };
    let mut row3 = YmmRegister { mm256: rw3 };

    let mut row4 = YmmRegister { mm256: rw4 };
    let mut row5 = YmmRegister { mm256: rw5 };
    let mut row6 = YmmRegister { mm256: rw6 };
    let mut row7 = YmmRegister { mm256: rw7 };

    macro_rules! dct_pass {
        ($SCALE_BITS:tt,$scale:tt) => {
            // There are a lot of ways to do this, but to keep it simple (and beautiful),
            // I'll make a direct translation of the scalar code. That also keeps this code
            // fully transparent (this version and the non-AVX one should produce identical
            // results).

            // even part
            let p1 = (row2 + row6) * 2217;

            let mut t2 = p1 + row6 * -7567;
            let mut t3 = p1 + row2 * 3135;

            let mut t0 = YmmRegister {
                mm256: _mm256_slli_epi32((row0 + row4).mm256, 12)
            };
            let mut t1 = YmmRegister {
                mm256: _mm256_slli_epi32((row0 - row4).mm256, 12)
            };

            let x0 = t0 + t3 + $SCALE_BITS;
            let x3 = t0 - t3 + $SCALE_BITS;
            let x1 = t1 + t2 + $SCALE_BITS;
            let x2 = t1 - t2 + $SCALE_BITS;

            let p3 = row7 + row3;
            let p4 = row5 + row1;
            let p1 = row7 + row1;
            let p2 = row5 + row3;
            let p5 = (p3 + p4) * 4816;

            t0 = row7 * 1223;
            t1 = row5 * 8410;
            t2 = row3 * 12586;
            t3 = row1 * 6149;

            let p1 = p5 + p1 * -3685;
            let p2 = p5 + (p2 * -10497);
            let p3 = p3 * -8034;
            let p4 = p4 * -1597;

            t3 += p1 + p4;
            t2 += p2 + p3;
            t1 += p2 + p4;
            t0 += p1 + p3;

            row0.mm256 = _mm256_srai_epi32((x0 + t3).mm256, $scale);
            row1.mm256 = _mm256_srai_epi32((x1 + t2).mm256, $scale);
            row2.mm256 = _mm256_srai_epi32((x2 + t1).mm256, $scale);
            row3.mm256 = _mm256_srai_epi32((x3 + t0).mm256, $scale);

            row4.mm256 = _mm256_srai_epi32((x3 - t0).mm256, $scale);
            row5.mm256 = _mm256_srai_epi32((x2 - t1).mm256, $scale);
            row6.mm256 = _mm256_srai_epi32((x1 - t2).mm256, $scale);
            row7.mm256 = _mm256_srai_epi32((x0 - t3).mm256, $scale);
        };
    }
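
    // The 1D kernel above is applied twice: once to the rows and, after an in-register
    // transpose, once to what were originally the columns. The second transpose restores
    // the block's orientation before the results are packed and stored.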
    // Process rows
    dct_pass!(512, 10);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );

    // process columns
    dct_pass!(SCALE_BITS, 17);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );

    // Pack the i32's into i16's,
    // clamp them to be between 0-255,
    // undo the shuffling caused by packing,
    // and store back to the output array.
    macro_rules! permute_store {
        ($x:tt,$y:tt,$index:tt,$out:tt) => {
            let a = _mm256_packs_epi32($x, $y);

            // Clamp the values after packing; we can clamp more values at once
            let b = clamp_avx(a);

            // Undo shuffling
            let c = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(b);

            // store first vector
            _mm_storeu_si128(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                _mm256_extractf128_si256::<0>(c)
            );
            $index += stride;
            // second vector
            _mm_storeu_si128(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                _mm256_extractf128_si256::<1>(c)
            );
            $index += stride;
        };
    }
    // Pack and write the values back to the array
    permute_store!((row0.mm256), (row1.mm256), pos, out_vector);
    permute_store!((row2.mm256), (row3.mm256), pos, out_vector);
    permute_store!((row4.mm256), (row5.mm256), pos, out_vector);
    permute_store!((row6.mm256), (row7.mm256), pos, out_vector);
}

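/// Clamp each signed 16-bit lane of `reg` to the `0..=255` range,
/// i.e. `min(max(x, 0), 255)`.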
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    let min_s: __m256i = _mm256_set1_epi16(0);
    let max_s: __m256i = _mm256_set1_epi16(255);

    let max_v: __m256i = _mm256_max_epi16(reg, min_s); // max(a, 0)
    let min_v: __m256i = _mm256_min_epi16(max_v, max_s); // min(max(a, 0), 255)
    return min_v;
}

/// A copy of `_MM_SHUFFLE()` that doesn't require
/// a nightly compiler.
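///
/// For example, `shuffle(3, 1, 2, 0)` evaluates to `0b11_01_10_00`, which tells
/// `_mm256_permute4x64_epi64` to pick source lanes 0, 2, 1, 3 (low to high) and thereby
/// undo the per-128-bit-lane interleaving of `_mm256_packs_epi32` in `permute_store!`.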
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}