// This file contains a set of fairly generic utility functions for working
// with SIMD vectors.
//
// SAFETY: All of the routines below are unsafe to call because they assume
// the necessary CPU target features in order to use particular vendor
// intrinsics. Calling these routines when the underlying CPU does not support
// the appropriate target features is NOT safe. Callers must ensure this
// themselves.
//
// Note that it may not look like this safety invariant is being upheld when
// these routines are called. Namely, the CPU feature check is typically pretty
// far away from when these routines are used. Instead, we rely on the fact
// that certain types serve as a guaranteed receipt that pertinent target
// features are enabled. For example, the only way TeddySlim3Mask256 can be
// constructed is if the AVX2 CPU feature is available. Thus, any code running
// inside of TeddySlim3Mask256 can use any of the functions below without any
// additional checks: its very existence *is* the check.
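//
// A minimal sketch of that "receipt" pattern (with a hypothetical type name;
// TeddySlim3Mask256's real definition lives elsewhere):
//
//     struct Avx2Receipt(());
//
//     impl Avx2Receipt {
//         fn new() -> Option<Avx2Receipt> {
//             if is_x86_feature_detected!("avx2") {
//                 Some(Avx2Receipt(()))
//             } else {
//                 None
//             }
//         }
//     }
//
// Since an Avx2Receipt can only be built after a successful runtime check,
// merely holding one is proof that the avx2 routines below are safe to call.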

use core::arch::x86_64::*;

/// Shift `a` to the left by two bytes (removing its two most significant
/// bytes), and concatenate it with the two most significant bytes of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
    // Credit goes to jneem for figuring this out:
    // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
    //
    // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
    // instructions, which is not what we want, so we need to do some extra
    // shuffling.

    // This permute gives us the low 16 bytes of a concatenated with the high
    // 16 bytes of b, in order of most significant to least significant. So
    // `v = a[15:0] b[31:16]`.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    // This effectively does this (where we deal in terms of byte-indexing
    // and byte-shifting, and use inclusive ranges):
    //
    //   ret[15:0]  := ((a[15:0] << 16) | v[15:0]) >> 14
    //              =  ((a[15:0] << 16) | b[31:16]) >> 14
    //   ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
    //              =  ((a[31:16] << 16) | a[15:0]) >> 14
    //
    // Which therefore results in:
    //
    //   ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30]
    //
    // The end result is that we've effectively done this:
    //
    //   (a << 2) | (b >> 30)
    //
    // When `A` and `B` are strings---where the beginning of the string is in
    // the least significant bits---this corresponds to the following semantic
    // operation:
    //
    //   (A >> 2) | (B << 30)
    //
    // The reversal comes from the fact that the machine is little-endian.
    _mm256_alignr_epi8(a, v, 14)
}
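
// A small sanity check of the semantics described above. This is a sketch,
// not part of the original module; it assumes `std` (and hence runtime
// feature detection) is available under `cfg(test)`.
#[cfg(test)]
#[test]
fn alignr256_14_semantics() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let mut abytes = [0u8; 32];
        let mut bbytes = [0u8; 32];
        for i in 0..32 {
            abytes[i] = i as u8;
            bbytes[i] = 0x40 + i as u8;
        }
        // Byte 0 of each array lands in the least significant lane.
        let a = _mm256_loadu_si256(abytes.as_ptr() as *const __m256i);
        let b = _mm256_loadu_si256(bbytes.as_ptr() as *const __m256i);
        let r = alignr256_14(a, b);
        let mut got = [0u8; 32];
        _mm256_storeu_si256(got.as_mut_ptr() as *mut __m256i, r);
        // (a << 2) | (b >> 30): the two most significant bytes of `b` land
        // in the two least significant lanes, followed by the 30 least
        // significant bytes of `a`.
        assert_eq!(&got[..2], &bbytes[30..]);
        assert_eq!(&got[2..], &abytes[..30]);
    }
}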

/// Shift `a` to the left by three bytes (removing its three most significant
/// bytes), and concatenate it with the three most significant bytes of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_13(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 13)
}

/// Shift `a` to the left by one byte (removing its most significant byte),
/// and concatenate it with the most significant byte of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 15)
}

/// Unpack the given 128-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
#[target_feature(enable = "ssse3")]
pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
    [
        _mm_cvtsi128_si64(a) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64,
    ]
}
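
// An illustrative check of the lane ordering documented above (a sketch;
// assumes runtime feature detection is available in tests): the first
// returned element is the little-endian u64 built from bytes 0..8.
#[cfg(test)]
#[test]
fn unpack64x128_lane_order() {
    if !is_x86_feature_detected!("ssse3") {
        return;
    }
    unsafe {
        let bytes: [u8; 16] =
            [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0];
        let v = _mm_loadu_si128(bytes.as_ptr() as *const __m128i);
        assert_eq!([1u64, 2], unpack64x128(v));
    }
}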

/// Unpack the given 256-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
    // Using transmute here is precisely equivalent, but actually slower. It's
    // not quite clear why.
    let lo = _mm256_extracti128_si256(a, 0);
    let hi = _mm256_extracti128_si256(a, 1);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}
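
// For reference, the transmute formulation mentioned in the comment above
// would look something like this (shown only as a sketch, since it
// reportedly benchmarks slower):
//
//     core::mem::transmute::<__m256i, [u64; 4]>(a)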

/// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit
/// integers.
///
/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element
/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits,
/// then the return value is `b2 b1 a2 a1`.
#[target_feature(enable = "avx2")]
pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
    let lo = _mm256_castsi256_si128(a);
    let hi = _mm256_castsi256_si128(b);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}
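
// An illustrative check of the `b2 b1 a2 a1` ordering documented above (a
// sketch; assumes runtime feature detection is available in tests). Note
// that index 0 of the returned array is the least significant element.
#[cfg(test)]
#[test]
fn unpacklo64x256_ordering() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        // With setr, arguments are given least significant first, so
        // a = a4 a3 a2 a1 = 4 3 2 1 and b = b4 b3 b2 b1 = 8 7 6 5.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        // Expect `b2 b1 a2 a1`, i.e. [a1, a2, b1, b2].
        assert_eq!([1u64, 2, 5, 6], unpacklo64x256(a, b));
    }
}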

/// Returns true if and only if all bits in the given 128-bit vector are 0.
#[target_feature(enable = "ssse3")]
pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
    let cmp = _mm_cmpeq_epi8(a, zeroes128());
    _mm_movemask_epi8(cmp) as u32 == 0xFFFF
}

/// Returns true if and only if all bits in the given 256-bit vector are 0.
#[target_feature(enable = "avx2")]
pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
    let cmp = _mm256_cmpeq_epi8(a, zeroes256());
    _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF
}
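
// A tiny check tying the zero tests to the constant vectors below (a
// sketch; assumes runtime feature detection is available in tests):
#[cfg(test)]
#[test]
fn is_all_zeroes256_basics() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        assert!(is_all_zeroes256(zeroes256()));
        assert!(!is_all_zeroes256(ones256()));
    }
}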

/// Load a 128-bit vector from `slice` at the given position. The slice does
/// not need to be aligned, but `slice[at..]` must be at least 16 bytes long,
/// since no bounds check is performed.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
#[target_feature(enable = "sse2")]
pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm_loadu_si128(ptr as *const __m128i)
}
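
// A small demonstration of the endianness note above (a sketch; assumes
// runtime feature detection is available in tests): shifting the loaded
// vector right by one byte drops the byte at the front of the slice.
#[cfg(test)]
#[test]
fn loadu128_shift_semantics() {
    if !is_x86_feature_detected!("sse2") {
        return;
    }
    unsafe {
        let mut slice = [0u8; 17];
        for i in 0..17 {
            slice[i] = i as u8;
        }
        // v holds slice[1..17], with slice[1] in the least significant lane.
        let v = loadu128(&slice, 1);
        // Shifting the vector right by one byte logically shifts the bytes
        // of the slice left by one: slice[1] falls off the front and a zero
        // fills in at the most significant lane.
        let shifted = _mm_srli_si128(v, 1);
        let mut got = [0u8; 16];
        _mm_storeu_si128(got.as_mut_ptr() as *mut __m128i, shifted);
        assert_eq!(&got[..15], &slice[2..17]);
        assert_eq!(0, got[15]);
    }
}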

/// Load a 256-bit vector from `slice` at the given position. The slice does
/// not need to be aligned, but `slice[at..]` must be at least 32 bytes long,
/// since no bounds check is performed.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
#[target_feature(enable = "avx2")]
pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm256_loadu_si256(ptr as *const __m256i)
}

/// Returns a 128-bit vector with all bits set to 0.
#[target_feature(enable = "sse2")]
pub unsafe fn zeroes128() -> __m128i {
    _mm_set1_epi8(0)
}

/// Returns a 256-bit vector with all bits set to 0.
#[target_feature(enable = "avx2")]
pub unsafe fn zeroes256() -> __m256i {
    _mm256_set1_epi8(0)
}

/// Returns a 128-bit vector with all bits set to 1.
#[target_feature(enable = "sse2")]
pub unsafe fn ones128() -> __m128i {
    _mm_set1_epi8(0xFF as u8 as i8)
}

/// Returns a 256-bit vector with all bits set to 1.
#[target_feature(enable = "avx2")]
pub unsafe fn ones256() -> __m256i {
    _mm256_set1_epi8(0xFF as u8 as i8)
}