1 | use alloc::vec::*; |
2 | |
3 | #[cfg (not(all(any(target_arch = "x86" , target_arch = "x86_64" ), feature = "simd" )))] |
4 | pub fn get_bitmap(a: &Vec<f32>, length: usize) -> Vec<u8> { |
5 | use crate::platform::{abs, clamp}; |
6 | use alloc::vec; |
7 | let mut height = 0.0; |
8 | assert!(length <= a.len()); |
9 | let mut output = vec![0; length]; |
10 | for i in 0..length { |
11 | unsafe { |
12 | height += a.get_unchecked(i); |
13 | // Clamping because as u8 is undefined outside of its range in rustc. |
14 | *(output.get_unchecked_mut(i)) = clamp(abs(height) * 255.9, 0.0, 255.0) as u8; |
15 | } |
16 | } |
17 | output |
18 | } |
19 | |
/// SSE2 variant of `get_bitmap`: converts the accumulated coverage deltas
/// in `a` into an 8-bit alpha bitmap of `length` bytes, 4 pixels at a time.
///
/// A lane-wise shift-and-add turns the per-pixel deltas into a running
/// prefix sum (the absolute coverage), which is then scaled by 255.9, made
/// positive, and packed down to unsigned bytes with saturation.
///
/// NOTE(review): the loop below loads `a` in whole 4-float chunks up to the
/// 4-aligned length, so `a` must be padded to at least `ceil(length / 4) * 4`
/// floats — confirm at the call sites; this view of the file cannot.
#[allow (clippy::uninit_vec)]
#[cfg (all(any(target_arch = "x86" , target_arch = "x86_64" ), feature = "simd" ))]
pub fn get_bitmap(a: &Vec<f32>, length: usize) -> Vec<u8> {
    #[cfg (target_arch = "x86" )]
    use core::arch::x86::*;
    #[cfg (target_arch = "x86_64" )]
    use core::arch::x86_64::*;

    unsafe {
        // Allocate a 4 byte aligned vector of bytes, and skip zeroing it. Turns out zeroing takes a
        // while on very large sizes.
        let mut output = {
            // Aligned length is ceil(length / 4).
            let aligned_length = (length + 3) >> 2;
            // SAFETY: the backing storage comes from a `Vec<u32>`, so `ptr`
            // is 4-byte aligned and `cap` (converted from u32 units to
            // bytes) covers the `aligned_length << 2` byte length. The bytes
            // start uninitialized, but every group of 4 is written by the
            // loop below before `output` is read or truncated.
            let mut aligned: Vec<u32> = Vec::with_capacity(aligned_length);
            let ptr = aligned.as_mut_ptr();
            let cap = aligned.capacity() << 2;
            core::mem::forget(aligned);
            Vec::from_raw_parts(ptr as *mut u8, aligned_length << 2, cap)
        };
        // offset = Zeroed out lanes; carries the running prefix sum between
        // 4-pixel chunks.
        let mut offset = _mm_setzero_ps();
        // Negative zero is important here: its bit pattern (only the sign
        // bit set) doubles as the mask for the andnot-abs trick below.
        let nzero = _mm_castps_si128(_mm_set1_ps(-0.0));
        for i in (0..output.len()).step_by(4) {
            // x = Read 4 floats from self.a
            // (unaligned load; see the padding NOTE in the doc comment)
            let mut x = _mm_loadu_ps(a.get_unchecked(i));
            // x += (0.0, x[0], x[1], x[2])
            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
            // x += (0.0, 0.0, x[0], x[1])
            // After these two shifted adds, each lane holds the prefix sum
            // of the chunk up to and including itself.
            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 8)));
            // x += offset
            x = _mm_add_ps(x, offset);

            // y = x * 255.9
            let y = _mm_mul_ps(x, _mm_set1_ps(255.9));
            // y = abs(y)  (clears the sign bit via andnot with -0.0's mask)
            let y = _mm_andnot_ps(_mm_castsi128_ps(nzero), y);
            // y = Convert y to i32s and truncate
            let mut y = _mm_cvttps_epi32(y);
            // y = Take the first byte of each of the 4 values in y and pack them into
            // the first 4 bytes of y. The signed i32->i16 pack then the
            // unsigned i16->u8 pack both saturate, which is what clamps the
            // scaled coverage into 0..=255 (mirroring the scalar path).
            y = _mm_packus_epi16(_mm_packs_epi32(y, nzero), nzero);

            // Store the first 4 u8s from y in output.
            // SAFETY: `output`'s buffer is 4-byte aligned (built from a
            // Vec<u32>) and `i` is a multiple of 4 with `i + 3 <
            // output.len()`, so viewing `&mut output[i]` as an `&mut i32`
            // is aligned and in-bounds.
            let pointer: &mut i32 = core::mem::transmute(output.get_unchecked_mut(i));
            *pointer = core::mem::transmute::<__m128i, [i32; 4]>(y)[0];
            // offset = (x[3], x[3], x[3], x[3])
            // i.e. broadcast the chunk's final prefix sum into every lane
            // so the next chunk continues from it.
            offset = _mm_set1_ps(core::mem::transmute::<__m128, [f32; 4]>(x)[3]);
        }
        // Drop the up-to-3 padding bytes introduced by the 4-alignment.
        output.truncate(length);
        output
    }
}
74 | |