1/*
2 * Copyright (c) 2023.
3 *
4 * This software is free software;
5 *
6 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7 */
8
9//! AVX color conversion routines
10//!
11//! Okay these codes are cool
12//!
13//! Herein lies super optimized codes to do color conversions.
14//!
15//!
16//! 1. The YCbCr to RGB use integer approximations and not the floating point equivalent.
17//! That means we may be +- 2 of pixels generated by libjpeg-turbo jpeg decoding
18//! (also libjpeg uses routines like `Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
19//!
//! Firstly, we use integers (fun fact: there is no part of this code base where we're dealing with
//! floating points... fun fact: the first fun fact wasn't even fun.)
22//!
//! Secondly, we have cool clamping code, especially for RGBA, where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions should work like two 128-bit vectors (the compiler optimizes
//! it out to something cool).
26//!
27//! There isn't a lot here (not as fun as bitstream ) but I hope you find what you're looking for.
28//!
29//! O and ~~subscribe to my youtube channel~~
30
31#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
32#![cfg(feature = "x86")]
33#![allow(
34 clippy::wildcard_imports,
35 clippy::cast_possible_truncation,
36 clippy::too_many_arguments,
37 clippy::inline_always,
38 clippy::doc_markdown,
39 dead_code
40)]
41
42#[cfg(target_arch = "x86")]
43use core::arch::x86::*;
44#[cfg(target_arch = "x86_64")]
45use core::arch::x86_64::*;
46
/// A 256-bit value that can be viewed either as an AVX register or as sixteen
/// `i16` lanes.
///
/// The colour-conversion routines write the `mm256` field and read the result
/// back through `array`. `#[repr(C)]` guarantees both fields start at offset 0,
/// which that kind of reinterpretation relies on; the default representation
/// leaves the layout of a union unspecified.
#[repr(C)]
pub union YmmRegister {
    // both fields are 32 bytes wide according to `size_of`
    mm256: __m256i,
    // lane-wise view used by the AVX colour conversion
    array: [i16; 16]
}
53
54//--------------------------------------------------------------------------------------------------
55// AVX conversion routines
56//--------------------------------------------------------------------------------------------------
57
58///
59/// Convert YCBCR to RGB using AVX instructions
60///
61/// # Note
62///**IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING
63/// AVX2 OTHERWISE THIS IS UB**
64///
65/// *Peace*
66///
67/// This library itself will ensure that it's never called in CPU's not
68/// supporting AVX2
69///
70/// # Arguments
71/// - `y`,`cb`,`cr`: A reference of 8 i32's
72/// - `out`: The output array where we store our converted items
73/// - `offset`: The position from 0 where we write these RGB values
74#[inline(always)]
75pub fn ycbcr_to_rgb_avx2(
76 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
77) {
78 // call this in another function to tell RUST to vectorize this
79 // storing
80 unsafe {
81 ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
82 }
83}
84
85#[inline]
86#[target_feature(enable = "avx2")]
87#[target_feature(enable = "avx")]
88unsafe fn ycbcr_to_rgb_avx2_1(
89 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
90) {
91 // Load output buffer
92 let tmp: &mut [u8; 48] = outResult<&mut [u8; 48], TryFromSliceError>
93 .get_mut(*offset..*offset + 48)
94 .expect(msg:"Slice to small cannot write")
95 .try_into()
96 .unwrap();
97
98 let (r: YmmRegister, g: YmmRegister, b: YmmRegister) = ycbcr_to_rgb_baseline(y, cb, cr);
99
100 let mut j: usize = 0;
101 let mut i: usize = 0;
102 while i < 48 {
103 tmp[i] = r.array[j] as u8;
104
105 tmp[i + 1] = g.array[j] as u8;
106 tmp[i + 2] = b.array[j] as u8;
107 i += 3;
108 j += 1;
109 }
110
111 *offset += 48;
112}
113
114/// Baseline implementation of YCBCR to RGB for avx,
115///
116/// It uses integer operations as opposed to floats, the approximation is
117/// difficult for the eye to see, but this means that it may produce different
118/// values with libjpeg_turbo. if accuracy is of utmost importance, use that.
119///
120/// this function should be called for most implementations, including
121/// - ycbcr->rgb
122/// - ycbcr->rgba
123/// - ycbcr->brga
124/// - ycbcr->rgbx
125#[inline]
126#[target_feature(enable = "avx2")]
127#[target_feature(enable = "avx")]
128unsafe fn ycbcr_to_rgb_baseline(
129 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
130) -> (YmmRegister, YmmRegister, YmmRegister) {
131 // Load values into a register
132 //
133 // dst[127:0] := MEM[loaddr+127:loaddr]
134 // dst[255:128] := MEM[hiaddr+127:hiaddr]
135 let y_c = _mm256_loadu_si256(y.as_ptr().cast());
136
137 let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
138
139 let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
140
141 // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb
142
143 // Cb = Cb-128;
144 let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));
145
146 // cr = Cb -128;
147 let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));
148
149 // Calculate Y->R
150 // r = Y + 45 * Cr / 32
151 // 45*cr
152 let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
153
154 // r1>>5
155 let r2 = _mm256_srai_epi16::<5>(r1);
156
157 //y+r2
158
159 let r = YmmRegister {
160 mm256: clamp_avx(_mm256_add_epi16(y_c, r2))
161 };
162
163 // g = Y - (11 * Cb + 23 * Cr) / 32 ;
164
165 // 11*cb
166 let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
167
168 // 23*cr
169 let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
170
171 //(11
172 //(11 * Cb + 23 * Cr)
173 let g3 = _mm256_add_epi16(g1, g2);
174
175 // (11 * Cb + 23 * Cr) / 32
176 let g4 = _mm256_srai_epi16::<5>(g3);
177
178 // Y - (11 * Cb + 23 * Cr) / 32 ;
179 let g = YmmRegister {
180 mm256: clamp_avx(_mm256_sub_epi16(y_c, g4))
181 };
182
183 // b = Y + 113 * Cb / 64
184 // 113 * cb
185 let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
186
187 //113 * Cb / 64
188 let b2 = _mm256_srai_epi16::<6>(b1);
189
190 // b = Y + 113 * Cb / 64 ;
191 let b = YmmRegister {
192 mm256: clamp_avx(_mm256_add_epi16(b2, y_c))
193 };
194
195 return (r, g, b);
196}
197
/// Baseline YCbCr to RGB conversion that performs no clamping.
///
/// Per 16-bit lane, with `Cb` and `Cr` first re-centred by subtracting 128
/// and `>>` being an arithmetic shift:
///
/// - `r = Y + ((45 * Cr) >> 5)`
/// - `g = Y - ((11 * Cb + 23 * Cr) >> 5)`
/// - `b = Y + ((113 * Cb) >> 6)`
///
/// This is the integer variant described in
/// https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb
///
/// Results may fall outside `0..=255`; `ycbcr_to_rgba_unsafe` relies on the
/// saturating pack it performs afterwards instead of an explicit clamp.
///
/// # Safety
/// The executing CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (__m256i, __m256i, __m256i) {
    let bias = _mm256_set1_epi16(128);

    // Unaligned loads of the sixteen samples of each component.
    let luma = _mm256_loadu_si256(y.as_ptr().cast());
    // Cb - 128 and Cr - 128
    let cb_centered = _mm256_sub_epi16(_mm256_loadu_si256(cb.as_ptr().cast()), bias);
    let cr_centered = _mm256_sub_epi16(_mm256_loadu_si256(cr.as_ptr().cast()), bias);

    // r = Y + ((45 * Cr) >> 5)
    let red_scaled = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_centered);
    let red = _mm256_add_epi16(luma, _mm256_srai_epi16::<5>(red_scaled));

    // g = Y - ((11 * Cb + 23 * Cr) >> 5)
    let green_scaled = _mm256_add_epi16(
        _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_centered),
        _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_centered)
    );
    let green = _mm256_sub_epi16(luma, _mm256_srai_epi16::<5>(green_scaled));

    // b = Y + ((113 * Cb) >> 6)
    let blue_scaled = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_centered);
    let blue = _mm256_add_epi16(_mm256_srai_epi16::<6>(blue_scaled), luma);

    (red, green, blue)
}
266
267#[inline(always)]
268pub fn ycbcr_to_rgba_avx2(
269 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
270) {
271 unsafe {
272 ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
273 }
274}
275
276#[inline]
277#[target_feature(enable = "avx2")]
278#[rustfmt::skip]
279unsafe fn ycbcr_to_rgba_unsafe(
280 y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
281 out: &mut [u8],
282 offset: &mut usize,
283)
284{
285 // check if we have enough space to write.
286 let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();
287
288 let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
289
290 // set alpha channel to 255 for opaque
291
292 // And no these comments were not from me pressing the keyboard
293
294 // Pack the integers into u8's using signed saturation.
295 let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb
296 let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); // cccccc_dddddd_ccccccc_ddddd
297 // transpose_u16 and interleave channels
298 let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab
299 let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd
300 // final transpose_u16
301 let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd
302 let h = _mm256_unpackhi_epi8(e, f);
303
304
305 // undo packus shuffling...
306 let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
307
308 let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
309
310 let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
311
312 let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
313
314 let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
315
316 let n = _mm256_blend_epi32::<0b1111_0000>(k, l);
317
318
319 // Store
320 // Use streaming instructions to prevent polluting the cache?
321 _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
322
323 _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);
324
325 *offset += 64;
326}
327
/// Clamp values between 0 and 255
///
/// Every signed 16-bit lane of `reg` is limited to the range `0..=255`
/// (the accepted values for RGB): lanes below 0 become 0, lanes above 255
/// become 255, and everything else passes through unchanged.
///
/// # Safety
/// The executing CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    // the lowest value
    let floor = _mm256_setzero_si256();
    // the highest value
    let ceiling = _mm256_set1_epi16(255);

    // min(max(reg, 0), 255)
    _mm256_min_epi16(_mm256_max_epi16(reg, floor), ceiling)
}
346
/// Build an immediate operand from four selectors.
///
/// `z` is shifted to bit 6, `y` to bit 4, `x` to bit 2 and `w` stays at
/// bit 0; the four pieces are then OR-ed together. Nothing is masked, so
/// an argument wider than two bits spills into its neighbour's field.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    let upper = (z << 6) | (y << 4);
    let lower = (x << 2) | w;
    upper | lower
}
351