1 | /* |
2 | * Copyright (c) 2023. |
3 | * |
4 | * This software is free software; |
5 | * |
6 | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | */ |
8 | |
9 | //! AVX color conversion routines |
10 | //! |
11 | //! Okay these codes are cool |
12 | //! |
13 | //! Herein lies super optimized codes to do color conversions. |
14 | //! |
15 | //! |
//! 1. The YCbCr to RGB conversion uses integer approximations and not the floating point equivalent.
//! That means we may be within ±2 of the pixels generated by libjpeg-turbo jpeg decoding
//! (also libjpeg uses routines like `Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
//!
//! Firstly, we use integers (fun fact: there is no part of this code base where we're dealing with
//! floating points... fun fact: the first fun fact wasn't even fun.)
//!
//! Secondly, we have cool clamping code, especially for RGBA, where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions should work like two 128-bit vectors
//! (the compiler optimizes it out to something cool).
//!
//! There isn't a lot here (not as fun as bitstream) but I hope you find what you're looking for.
//!
//! Oh, and ~~subscribe to my youtube channel~~
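//!
//! For reference, here is a minimal scalar sketch of the same integer approximation the AVX
//! routines below implement (the shifts mirror `_mm256_srai_epi16`; the function name is only
//! for illustration and is not part of this module's API):
//!
//! ```
//! fn ycbcr_to_rgb_scalar(y: i16, cb: i16, cr: i16) -> (u8, u8, u8) {
//!     // centre the chroma channels around zero
//!     let (cb, cr) = (cb - 128, cr - 128);
//!     // r = Y + 45 * Cr / 32
//!     let r = y + ((45 * cr) >> 5);
//!     // g = Y - (11 * Cb + 23 * Cr) / 32
//!     let g = y - ((11 * cb + 23 * cr) >> 5);
//!     // b = Y + 113 * Cb / 64
//!     let b = y + ((113 * cb) >> 6);
//!     // clamp each channel to the valid 0..=255 RGB range
//!     (r.clamp(0, 255) as u8, g.clamp(0, 255) as u8, b.clamp(0, 255) as u8)
//! }
//!
//! // mid-grey luma with neutral chroma stays mid-grey
//! assert_eq!(ycbcr_to_rgb_scalar(128, 128, 128), (128, 128, 128));
//! ```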
30 | |
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(
34 | clippy::wildcard_imports, |
35 | clippy::cast_possible_truncation, |
36 | clippy::too_many_arguments, |
37 | clippy::inline_always, |
38 | clippy::doc_markdown, |
39 | dead_code |
40 | )] |
41 | |
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
46 | |
/// Viewed either as one `__m256i` SIMD vector or as 16 `i16` lanes, so the same
/// register can be filled with AVX intrinsics and then read back as plain integers.
pub union YmmRegister {
    // both are 32 bytes when using std::mem::size_of
    mm256: __m256i,
    // for AVX color conversion
    array: [i16; 16]
}
53 | |
54 | //-------------------------------------------------------------------------------------------------- |
55 | // AVX conversion routines |
56 | //-------------------------------------------------------------------------------------------------- |
57 | |
58 | /// |
59 | /// Convert YCBCR to RGB using AVX instructions |
60 | /// |
61 | /// # Note |
62 | ///**IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING |
63 | /// AVX2 OTHERWISE THIS IS UB** |
64 | /// |
65 | /// *Peace* |
66 | /// |
67 | /// This library itself will ensure that it's never called in CPU's not |
68 | /// supporting AVX2 |
69 | /// |
70 | /// # Arguments |
71 | /// - `y`,`cb`,`cr`: A reference of 8 i32's |
72 | /// - `out`: The output array where we store our converted items |
73 | /// - `offset`: The position from 0 where we write these RGB values |
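///
/// # Example
/// A minimal usage sketch (not compiled as a doc-test), assuming this function is visible to
/// the caller; per the note above, the call is guarded by a runtime AVX2 check:
/// ```ignore
/// let y  = [128_i16; 16];
/// let cb = [128_i16; 16];
/// let cr = [128_i16; 16];
/// let mut out = [0_u8; 48]; // 16 pixels * 3 channels
/// let mut offset = 0_usize;
/// if is_x86_feature_detected!("avx2") {
///     ycbcr_to_rgb_avx2(&y, &cb, &cr, &mut out, &mut offset);
/// }
/// ```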
#[inline(always)]
75 | pub fn ycbcr_to_rgb_avx2( |
76 | y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize |
77 | ) { |
    // Call the target_feature annotated function so that the compiler is allowed
    // to vectorize the conversion and the stores with AVX2.
80 | unsafe { |
81 | ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset); |
82 | } |
83 | } |
84 | |
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
88 | unsafe fn ycbcr_to_rgb_avx2_1( |
89 | y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize |
90 | ) { |
91 | // Load output buffer |
    let tmp: &mut [u8; 48] = out
        .get_mut(*offset..*offset + 48)
        .expect("Slice too small, cannot write")
        .try_into()
        .unwrap();
97 | |
    let (r, g, b) = ycbcr_to_rgb_baseline(y, cb, cr);
99 | |
    // Interleave the 16 R, G and B lanes into RGB byte triples.
    let mut j: usize = 0;
101 | let mut i: usize = 0; |
102 | while i < 48 { |
103 | tmp[i] = r.array[j] as u8; |
104 | |
105 | tmp[i + 1] = g.array[j] as u8; |
106 | tmp[i + 2] = b.array[j] as u8; |
107 | i += 3; |
108 | j += 1; |
109 | } |
110 | |
111 | *offset += 48; |
112 | } |
113 | |
/// Baseline implementation of YCbCr to RGB for AVX.
///
/// It uses integer operations as opposed to floats; the approximation is
/// difficult for the eye to see, but it means that this may produce different
/// values from libjpeg-turbo. If accuracy is of utmost importance, use that.
///
/// This function should be called for most implementations, including
/// - ycbcr->rgb
/// - ycbcr->rgba
/// - ycbcr->bgra
/// - ycbcr->rgbx
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
128 | unsafe fn ycbcr_to_rgb_baseline( |
129 | y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16] |
130 | ) -> (YmmRegister, YmmRegister, YmmRegister) { |
131 | // Load values into a register |
132 | // |
133 | // dst[127:0] := MEM[loaddr+127:loaddr] |
134 | // dst[255:128] := MEM[hiaddr+127:hiaddr] |
135 | let y_c = _mm256_loadu_si256(y.as_ptr().cast()); |
136 | |
137 | let cb_c = _mm256_loadu_si256(cb.as_ptr().cast()); |
138 | |
139 | let cr_c = _mm256_loadu_si256(cr.as_ptr().cast()); |
140 | |
141 | // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb |
142 | |
143 | // Cb = Cb-128; |
144 | let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128)); |
145 | |
    // Cr = Cr - 128;
147 | let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128)); |
148 | |
149 | // Calculate Y->R |
150 | // r = Y + 45 * Cr / 32 |
151 | // 45*cr |
152 | let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r); |
153 | |
154 | // r1>>5 |
155 | let r2 = _mm256_srai_epi16::<5>(r1); |
156 | |
157 | //y+r2 |
158 | |
159 | let r = YmmRegister { |
160 | mm256: clamp_avx(_mm256_add_epi16(y_c, r2)) |
161 | }; |
162 | |
163 | // g = Y - (11 * Cb + 23 * Cr) / 32 ; |
164 | |
165 | // 11*cb |
166 | let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r); |
167 | |
168 | // 23*cr |
169 | let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r); |
170 | |
171 | //(11 |
172 | //(11 * Cb + 23 * Cr) |
173 | let g3 = _mm256_add_epi16(g1, g2); |
174 | |
175 | // (11 * Cb + 23 * Cr) / 32 |
176 | let g4 = _mm256_srai_epi16::<5>(g3); |
177 | |
178 | // Y - (11 * Cb + 23 * Cr) / 32 ; |
179 | let g = YmmRegister { |
180 | mm256: clamp_avx(_mm256_sub_epi16(y_c, g4)) |
181 | }; |
182 | |
183 | // b = Y + 113 * Cb / 64 |
184 | // 113 * cb |
185 | let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r); |
186 | |
187 | //113 * Cb / 64 |
188 | let b2 = _mm256_srai_epi16::<6>(b1); |
189 | |
190 | // b = Y + 113 * Cb / 64 ; |
191 | let b = YmmRegister { |
192 | mm256: clamp_avx(_mm256_add_epi16(b2, y_c)) |
193 | }; |
194 | |
195 | return (r, g, b); |
196 | } |
197 | |
#[inline]
#[target_feature(enable = "avx2")]
200 | /// A baseline implementation of YCbCr to RGB conversion which does not carry |
201 | /// out clamping |
202 | /// |
203 | /// This is used by the `ycbcr_to_rgba_avx` and `ycbcr_to_rgbx` conversion |
204 | /// routines |
205 | unsafe fn ycbcr_to_rgb_baseline_no_clamp( |
206 | y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16] |
207 | ) -> (__m256i, __m256i, __m256i) { |
208 | // Load values into a register |
209 | // |
210 | let y_c = _mm256_loadu_si256(y.as_ptr().cast()); |
211 | |
212 | let cb_c = _mm256_loadu_si256(cb.as_ptr().cast()); |
213 | |
214 | let cr_c = _mm256_loadu_si256(cr.as_ptr().cast()); |
215 | |
216 | // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb |
217 | |
218 | // Cb = Cb-128; |
219 | let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128)); |
220 | |
    // Cr = Cr - 128;
222 | let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128)); |
223 | |
224 | // Calculate Y->R |
225 | // r = Y + 45 * Cr / 32 |
226 | // 45*cr |
227 | let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r); |
228 | |
229 | // r1>>5 |
230 | let r2 = _mm256_srai_epi16::<5>(r1); |
231 | |
232 | //y+r2 |
233 | |
234 | let r = _mm256_add_epi16(y_c, r2); |
235 | |
236 | // g = Y - (11 * Cb + 23 * Cr) / 32 ; |
237 | |
238 | // 11*cb |
239 | let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r); |
240 | |
241 | // 23*cr |
242 | let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r); |
243 | |
244 | //(11 |
245 | //(11 * Cb + 23 * Cr) |
246 | let g3 = _mm256_add_epi16(g1, g2); |
247 | |
248 | // (11 * Cb + 23 * Cr) / 32 |
249 | let g4 = _mm256_srai_epi16::<5>(g3); |
250 | |
251 | // Y - (11 * Cb + 23 * Cr) / 32 ; |
252 | let g = _mm256_sub_epi16(y_c, g4); |
253 | |
254 | // b = Y + 113 * Cb / 64 |
255 | // 113 * cb |
256 | let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r); |
257 | |
258 | //113 * Cb / 64 |
259 | let b2 = _mm256_srai_epi16::<6>(b1); |
260 | |
261 | // b = Y + 113 * Cb / 64 ; |
262 | let b = _mm256_add_epi16(b2, y_c); |
263 | |
264 | return (r, g, b); |
265 | } |
266 | |
/// Convert YCbCr to RGBA using AVX instructions, writing 64 bytes
/// (16 pixels, 4 channels each) at `offset` with the alpha channel set to 255.
///
/// # Note
/// The same caveat as `ycbcr_to_rgb_avx2` applies: the caller must ensure the
/// CPU supports AVX2 before calling this, otherwise this is UB.
#[inline(always)]
268 | pub fn ycbcr_to_rgba_avx2( |
269 | y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize |
270 | ) { |
271 | unsafe { |
272 | ycbcr_to_rgba_unsafe(y, cb, cr, out, offset); |
273 | } |
274 | } |
275 | |
#[inline]
#[target_feature(enable = "avx2")]
278 | #[rustfmt::skip] |
279 | unsafe fn ycbcr_to_rgba_unsafe( |
280 | y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], |
281 | out: &mut [u8], |
282 | offset: &mut usize, |
283 | ) |
284 | { |
285 | // check if we have enough space to write. |
    let tmp: &mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice too small, cannot write").try_into().unwrap();
287 | |
288 | let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr); |
289 | |
290 | // set alpha channel to 255 for opaque |
291 | |
292 | // And no these comments were not from me pressing the keyboard |
293 | |
    // Pack the integers into u8's using unsigned saturation.
295 | let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb |
296 | let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); // cccccc_dddddd_ccccccc_ddddd |
297 | // transpose_u16 and interleave channels |
298 | let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab |
299 | let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd |
300 | // final transpose_u16 |
301 | let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd |
302 | let h = _mm256_unpackhi_epi8(e, f); |
303 | |
304 | |
    // undo packus/unpack lane shuffling: the permutes and blends below reassemble the
    // 128-bit lanes so that the two stores at the end write the pixels in source order.
306 | let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h); |
307 | |
308 | let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h); |
309 | |
310 | let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h); |
311 | |
312 | let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h); |
313 | |
314 | let m = _mm256_blend_epi32::<0b1111_0000>(i, j); |
315 | |
316 | let n = _mm256_blend_epi32::<0b1111_0000>(k, l); |
317 | |
318 | |
319 | // Store |
320 | // Use streaming instructions to prevent polluting the cache? |
321 | _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m); |
322 | |
323 | _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n); |
324 | |
325 | *offset += 64; |
326 | } |
327 | |
328 | /// Clamp values between 0 and 255 |
329 | /// |
330 | /// This function clamps all values in `reg` to be between 0 and 255 |
/// (the accepted values for RGB)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
335 | unsafe fn clamp_avx(reg: __m256i) -> __m256i { |
336 | // the lowest value |
337 | let min_s: __m256i = _mm256_set1_epi16(0); |
338 | |
339 | // Highest value |
340 | let max_s: __m256i = _mm256_set1_epi16(255); |
341 | |
    let max_v: __m256i = _mm256_max_epi16(reg, min_s); //max(a,0)
    let min_v: __m256i = _mm256_min_epi16(max_v, max_s); //min(max(a,0),255)
344 | return min_v; |
345 | } |
346 | |
/// Pack four 2-bit selectors into a shuffle immediate, `_MM_SHUFFLE`-style:
/// `(z << 6) | (y << 4) | (x << 2) | w`. Used as the const generic for the
/// `_mm256_permute2x128_si256` calls above.
#[inline]
348 | const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 { |
349 | (z << 6) | (y << 4) | (x << 2) | w |
350 | } |
351 | |