use alloc::vec::*;

#[cfg(not(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd")))]
pub fn get_bitmap(a: &Vec<f32>, length: usize) -> Vec<u8> {
    use crate::platform::{abs, clamp};
    use alloc::vec;
    let mut height = 0.0;
    assert!(length <= a.len());
    let mut output = vec![0; length];
    for i in 0..length {
        unsafe {
            height += a.get_unchecked(i);
            // Clamp before the cast: float-to-int `as` casts were undefined for
            // out-of-range values before rustc 1.45 made them saturate.
            *(output.get_unchecked_mut(i)) = clamp(abs(height) * 255.9, 0.0, 255.0) as u8;
        }
    }
    output
}

#[allow(clippy::uninit_vec)]
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
pub fn get_bitmap(a: &Vec<f32>, length: usize) -> Vec<u8> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    unsafe {
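        // The loop below reads `a` four floats at a time, so it may read up to
        // `length` rounded up to a multiple of 4 entries. Assert that much is
        // available, mirroring the bounds assert in the scalar path above.
        assert!(a.len() >= ((length + 3) & !3));
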
        // Allocate a 4-byte-aligned vector of bytes, and skip zeroing it. Turns out
        // zeroing takes a while on very large sizes.
        let mut output = {
            // Aligned length is ceil(length / 4).
            let aligned_length = (length + 3) >> 2;
            let mut aligned: Vec<u32> = Vec::with_capacity(aligned_length);
            let ptr = aligned.as_mut_ptr();
            let cap = aligned.capacity() << 2;
            core::mem::forget(aligned);
            Vec::from_raw_parts(ptr as *mut u8, aligned_length << 2, cap)
        };
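        // Every one of those `aligned_length * 4` bytes is written by the loop below
        // before it is ever read, and the tail past `length` is removed by the final
        // `truncate`, so the skipped zeroing is never observable.
        debug_assert!(output.len() % 4 == 0 && output.len() >= length);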
        // offset carries the running height between iterations; start it at zero.
        let mut offset = _mm_setzero_ps();
        // Negative zero is important here: its only set bit is the sign bit
        // (0x8000_0000), which is exactly the mask andnot needs to compute abs below.
        let nzero = _mm_castps_si128(_mm_set1_ps(-0.0));
        for i in (0..output.len()).step_by(4) {
            // x = Read 4 floats from a
            let mut x = _mm_loadu_ps(a.get_unchecked(i));
            // x += (0.0, x[0], x[1], x[2])
            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
            // x += (0.0, 0.0, x[0], x[1])
            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 8)));
            // x += offset
            x = _mm_add_ps(x, offset);
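            // Each lane of x is now the running height: lane k holds offset plus the
            // sum of the first k + 1 loaded deltas (an inclusive prefix sum).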

            // y = x * 255.9
            let y = _mm_mul_ps(x, _mm_set1_ps(255.9));
            // y = abs(y)
            let y = _mm_andnot_ps(_mm_castsi128_ps(nzero), y);
            // y = Convert y to i32s and truncate
            let mut y = _mm_cvttps_epi32(y);
            // y = Pack the 4 i32 lanes of y into its first 4 bytes, saturating each
            // lane to 0..=255; this saturation replaces the scalar path's clamp.
            y = _mm_packus_epi16(_mm_packs_epi32(y, nzero), nzero);

            // Store the first 4 u8s from y in output.
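            // This 4-byte store is aligned: the buffer came from a Vec<u32>
            // allocation and `i` is always a multiple of 4.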
            let pointer: &mut i32 =
                core::mem::transmute::<&mut u8, &mut i32>(output.get_unchecked_mut(i));
            *pointer = core::mem::transmute::<__m128i, [i32; 4]>(y)[0];
            // offset = (x[3], x[3], x[3], x[3])
            offset = _mm_set1_ps(core::mem::transmute::<__m128, [f32; 4]>(x)[3]);
        }
        output.truncate(length);
        output
    }
}
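
// Illustrative sanity check (a sketch, not part of the original source): either
// cfg variant of `get_bitmap` accumulates the deltas, scales by 255.9, and
// saturates to 0..=255. The length is a multiple of 4 so the SIMD variant's
// 4-at-a-time loads stay within bounds.
#[cfg(test)]
mod tests {
    use super::get_bitmap;
    use alloc::vec;

    #[test]
    fn accumulates_scales_and_saturates() {
        // Running sums of the deltas are 0.5, 1.0, 0.75, 0.75.
        let a = vec![0.5, 0.5, -0.25, 0.0];
        // 0.5 * 255.9 = 127.95 -> 127; 1.0 * 255.9 clamps/saturates to 255;
        // 0.75 * 255.9 = 191.925 -> 191.
        assert_eq!(get_bitmap(&a, 4), vec![127, 255, 191, 191]);
    }
}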