avx.rs source code [crates/zune_jpeg/src/color_convert/avx.rs]

1	/*
2	* Copyright (c) 2023.
3	*
4	* This software is free software;
5	*
6	* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7	*/
8
9	//! AVX color conversion routines
10	//!
11	//! Okay these codes are cool
12	//!
13	//! Herein lies super optimized codes to do color conversions.
14	//!
15	//!
//! 1. The YCbCr to RGB conversions use integer approximations and not the floating point equivalent.
//! That means we may be within +- 2 of the pixel values generated by libjpeg-turbo jpeg decoding
//! (also libjpeg uses routines like `Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
19	//!
//! Firstly, we use integers (fun fact: there is no part of this code base where we're dealing with
//! floating points..., fun fact: the first fun fact wasn't even fun.)
//!
//! Secondly, we have cool clamping code, especially for rgba, where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions should work like 2 128-bit vectors (the compiler optimizes
//! it out to something cool).
26	//!
27	//! There isn't a lot here (not as fun as bitstream ) but I hope you find what you're looking for.
28	//!
29	//! O and ~~subscribe to my youtube channel~~
30
31	#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
32	#![cfg(feature = "x86")]
33	#![allow(
34	clippy::wildcard_imports,
35	clippy::cast_possible_truncation,
36	clippy::too_many_arguments,
37	clippy::inline_always,
38	clippy::doc_markdown,
39	dead_code
40	)]
41
42	#[cfg(target_arch = "x86")]
43	use core::arch::x86::*;
44	#[cfg(target_arch = "x86_64")]
45	use core::arch::x86_64::*;
46
/// A 256-bit AVX register that can also be viewed as sixteen `i16` lanes.
///
/// Values are written through `mm256` by the SIMD code and read back lane by
/// lane through `array`.
///
/// `#[repr(C)]` is required for that type punning to be well defined: with the
/// default representation the fields of a union are not guaranteed to start
/// at offset 0.
#[repr(C)]
pub union YmmRegister {
    // both are 32 when using std::mem::size_of
    mm256: __m256i,
    // for avx color conversion
    array: [i16; 16]
}
53
54	//--------------------------------------------------------------------------------------------------
55	// AVX conversion routines
56	//--------------------------------------------------------------------------------------------------
57
58	///
59	/// Convert YCBCR to RGB using AVX instructions
60	///
61	/// # Note
62	///IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING
63	/// AVX2 OTHERWISE THIS IS UB**
64	///
65	/// Peace
66	///
67	/// This library itself will ensure that it's never called in CPU's not
68	/// supporting AVX2
69	///
70	/// # Arguments
71	/// - `y`,`cb`,`cr`: A reference of 8 i32's
72	/// - `out`: The output array where we store our converted items
73	/// - `offset`: The position from 0 where we write these RGB values
74	#[inline(always)]
75	pub fn ycbcr_to_rgb_avx2(
76	y: &[i16; `16`], cb: &[i16; `16`], cr: &[i16; `16`], out: &mut [u8], offset: &mut usize
77	) {
78	// call this in another function to tell RUST to vectorize this
79	// storing
80	unsafe {
81	ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
82	}
83	}
84
85	#[inline]
86	#[target_feature(enable = "avx2")]
87	#[target_feature(enable = "avx")]
88	unsafe fn ycbcr_to_rgb_avx2_1(
89	y: &[i16; `16`], cb: &[i16; `16`], cr: &[i16; `16`], out: &mut [u8], offset: &mut usize
90	) {
91	// Load output buffer
92	let tmp: &mut [u8; `48`] = outResult<&mut [u8; 48], TryFromSliceError>
93	.get_mut(offset..offset + `48`)
94	.expect(msg:"Slice to small cannot write")
95	.try_into()
96	.unwrap();
97
98	let (r: YmmRegister, g: YmmRegister, b: YmmRegister) = ycbcr_to_rgb_baseline(y, cb, cr);
99
100	let mut j: usize = `0`;
101	let mut i: usize = `0`;
102	while i < `48` {
103	tmp[i] = r.array[j] as u8;
104
105	tmp[i + `1`] = g.array[j] as u8;
106	tmp[i + `2`] = b.array[j] as u8;
107	i += `3`;
108	j += `1`;
109	}
110
111	*offset += `48`;
112	}
113
114	/// Baseline implementation of YCBCR to RGB for avx,
115	///
116	/// It uses integer operations as opposed to floats, the approximation is
117	/// difficult for the eye to see, but this means that it may produce different
118	/// values with libjpeg_turbo. if accuracy is of utmost importance, use that.
119	///
120	/// this function should be called for most implementations, including
121	/// - ycbcr->rgb
122	/// - ycbcr->rgba
123	/// - ycbcr->brga
124	/// - ycbcr->rgbx
125	#[inline]
126	#[target_feature(enable = "avx2")]
127	#[target_feature(enable = "avx")]
128	unsafe fn ycbcr_to_rgb_baseline(
129	y: &[i16; `16`], cb: &[i16; `16`], cr: &[i16; `16`]
130	) -> (YmmRegister, YmmRegister, YmmRegister) {
131	// Load values into a register
132	//
133	// dst[127:0] := MEM[loaddr+127:loaddr]
134	// dst[255:128] := MEM[hiaddr+127:hiaddr]
135	let y_c = _mm256_loadu_si256(y.as_ptr().cast());
136
137	let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
138
139	let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
140
141	// AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb
142
143	// Cb = Cb-128;
144	let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(`128`));
145
146	// cr = Cb -128;
147	let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(`128`));
148
149	// Calculate Y->R
150	// r = Y + 45 Cr / 32*
151	// 45cr*
152	let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(`45`), cr_r);
153
154	// r1>>5
155	let r2 = _mm256_srai_epi16::<`5`>(r1);
156
157	//y+r2
158
159	let r = YmmRegister {
160	mm256: clamp_avx(_mm256_add_epi16(y_c, r2))
161	};
162
163	// g = Y - (11 Cb + 23 * Cr) / 32 ;*
164
165	// 11cb*
166	let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(`11`), cb_r);
167
168	// 23cr*
169	let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(`23`), cr_r);
170
171	//(11
172	//(11 Cb + 23 * Cr)*
173	let g3 = _mm256_add_epi16(g1, g2);
174
175	// (11 Cb + 23 * Cr) / 32*
176	let g4 = _mm256_srai_epi16::<`5`>(g3);
177
178	// Y - (11 Cb + 23 * Cr) / 32 ;*
179	let g = YmmRegister {
180	mm256: clamp_avx(_mm256_sub_epi16(y_c, g4))
181	};
182
183	// b = Y + 113 Cb / 64*
184	// 113 cb*
185	let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(`113`), cb_r);
186
187	//113 Cb / 64*
188	let b2 = _mm256_srai_epi16::<`6`>(b1);
189
190	// b = Y + 113 Cb / 64 ;*
191	let b = YmmRegister {
192	mm256: clamp_avx(_mm256_add_epi16(b2, y_c))
193	};
194
195	return (r, g, b);
196	}
197
/// A baseline implementation of YCbCr to RGB conversion which does not carry
/// out clamping
///
/// This is used by the `ycbcr_to_rgba_avx` and `ycbcr_to_rgbx` conversion
/// routines
///
/// # Returns
/// The `(r, g, b)` channels as vectors of sixteen `i16` lanes. Lanes may fall
/// outside `0..=255`; narrowing them is left to the caller.
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (__m256i, __m256i, __m256i) {
    // Load the 16 samples of each component into a register (unaligned loads,
    // the arrays carry no 32-byte alignment guarantee).
    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());

    // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb

    // Chroma samples are stored with a +128 bias, remove it.
    let bias = _mm256_set1_epi16(128);
    // Cb = Cb - 128
    let cb_r = _mm256_sub_epi16(cb_c, bias);
    // Cr = Cr - 128
    let cr_r = _mm256_sub_epi16(cr_c, bias);

    // r = Y + 45 * Cr / 32
    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
    // the division by 32 is an arithmetic shift right by 5
    let r2 = _mm256_srai_epi16::<5>(r1);
    let r = _mm256_add_epi16(y_c, r2);

    // g = Y - (11 * Cb + 23 * Cr) / 32
    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
    let g3 = _mm256_add_epi16(g1, g2);
    let g4 = _mm256_srai_epi16::<5>(g3);
    let g = _mm256_sub_epi16(y_c, g4);

    // b = Y + 113 * Cb / 64
    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
    // the division by 64 is an arithmetic shift right by 6
    let b2 = _mm256_srai_epi16::<6>(b1);
    let b = _mm256_add_epi16(b2, y_c);

    (r, g, b)
}
266
267	#[inline(always)]
268	pub fn ycbcr_to_rgba_avx2(
269	y: &[i16; `16`], cb: &[i16; `16`], cr: &[i16; `16`], out: &mut [u8], offset: &mut usize
270	) {
271	unsafe {
272	ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
273	}
274	}
275
276	#[inline]
277	#[target_feature(enable = "avx2")]
278	#[rustfmt::skip]
279	unsafe fn ycbcr_to_rgba_unsafe(
280	y: &[i16; `16`], cb: &[i16; `16`], cr: &[i16; `16`],
281	out: &mut [u8],
282	offset: &mut usize,
283	)
284	{
285	// check if we have enough space to write.
286	let tmp:& mut [u8; `64`] = out.get_mut(offset..offset + `64`).expect("Slice to small cannot write").try_into().unwrap();
287
288	let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
289
290	// set alpha channel to 255 for opaque
291
292	// And no these comments were not from me pressing the keyboard
293
294	// Pack the integers into u8's using signed saturation.
295	let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb
296	let d = _mm256_packus_epi16(b, _mm256_set1_epi16(`255`)); // cccccc_dddddd_ccccccc_ddddd
297	// transpose_u16 and interleave channels
298	let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab
299	let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd
300	// final transpose_u16
301	let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd
302	let h = _mm256_unpackhi_epi8(e, f);
303
304
305	// undo packus shuffling...
306	let i = _mm256_permute2x128_si256::<{ shuffle(`3`, `2`, `1`, `0`) }>(g, h);
307
308	let j = _mm256_permute2x128_si256::<{ shuffle(`1`, `2`, `3`, `0`) }>(g, h);
309
310	let k = _mm256_permute2x128_si256::<{ shuffle(`3`, `2`, `0`, `1`) }>(g, h);
311
312	let l = _mm256_permute2x128_si256::<{ shuffle(`0`, `3`, `2`, `1`) }>(g, h);
313
314	let m = _mm256_blend_epi32::<`0b1111_0000`>(i, j);
315
316	let n = _mm256_blend_epi32::<`0b1111_0000`>(k, l);
317
318
319	// Store
320	// Use streaming instructions to prevent polluting the cache?
321	_mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
322
323	_mm256_storeu_si256(tmp[`32`..].as_mut_ptr().cast(), n);
324
325	*offset += `64`;
326	}
327
/// Clamp values between 0 and 255
///
/// This function clamps all sixteen `i16` lanes in `reg` to be between 0 and 255
/// (the accepted values for RGB)
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    // the lowest value
    let floor = _mm256_set1_epi16(0);
    // the highest value
    let ceiling = _mm256_set1_epi16(255);

    // min(max(a, 0), 255), lane by lane on signed 16-bit integers
    let raised = _mm256_max_epi16(reg, floor);
    _mm256_min_epi16(raised, ceiling)
}
346
/// Build an 8-bit shuffle/permute control immediate from four 2-bit selectors
///
/// Same layout as the `_MM_SHUFFLE(z, y, x, w)` macro: `z` ends up in bits
/// 7:6, `y` in bits 5:4, `x` in bits 3:2 and `w` in bits 1:0.
///
/// The arguments are not masked, callers are expected to pass values in `0..=3`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}
351