// This file contains a set of fairly generic utility functions when working
// with SIMD vectors.
//
// SAFETY: All of the routines below are unsafe to call because they assume
// the necessary CPU target features in order to use particular vendor
// intrinsics. Calling these routines when the underlying CPU does not support
// the appropriate target features is NOT safe. Callers must ensure this
// themselves.
//
// Note that it may not look like this safety invariant is being upheld when
// these routines are called. Namely, the CPU feature check is typically pretty
// far away from when these routines are used. Instead, we rely on the fact
// that certain types serve as a guaranteed receipt that pertinent target
// features are enabled. For example, the only way TeddySlim3Mask256 can be
// constructed is if the AVX2 CPU feature is available. Thus, any code running
// inside of TeddySlim3Mask256 can use any of the functions below without any
// additional checks: its very existence *is* the check.
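//
// As an illustrative sketch (the type and names below are hypothetical, not
// the actual Teddy types), the pattern looks roughly like this:
//
//     struct Avx2Receipt(());
//
//     impl Avx2Receipt {
//         fn new() -> Option<Avx2Receipt> {
//             if is_x86_feature_detected!("avx2") {
//                 Some(Avx2Receipt(()))
//             } else {
//                 None
//             }
//         }
//     }
//
// A value of such a type can only exist if the runtime feature check
// succeeded, so any method taking `&self` on it may soundly call the `avx2`
// routines below.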

use core::arch::x86_64::*;

/// Shift `a` to the left by two bytes (removing its two most significant
/// bytes), and concatenate it with the two most significant bytes of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
    // Credit goes to jneem for figuring this out:
    // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
    //
    // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
    // instructions, which is not what we want, so we need to do some extra
    // shuffling.

    // This permute gives us the low 16 bytes of a concatenated with the high
    // 16 bytes of b, in order of most significant to least significant. So
    // `v = a[15:0] b[31:16]`.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    // This effectively does this (where we deal in terms of byte-indexing
    // and byte-shifting, and use inclusive ranges):
    //
    //   ret[15:0]  := ((a[15:0] << 16) | v[15:0]) >> 14
    //               = ((a[15:0] << 16) | b[31:16]) >> 14
    //   ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
    //               = ((a[31:16] << 16) | a[15:0]) >> 14
    //
    // Which therefore results in:
    //
    //   ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30]
    //
    // The end result is that we've effectively done this:
    //
    //   (a << 2) | (b >> 30)
    //
    // When `A` and `B` are strings---where the beginning of the string is in
    // the least significant bits---this is equivalent to the following
    // semantic operation:
    //
    //   (A >> 2) | (B << 30)
    //
    // The reversal comes from the fact that we are little-endian.
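    //
    // As an illustrative example (not part of the original derivation), write
    // each input as 32 bytes, most significant first, so `a = a31 .. a1 a0`
    // and `b = b31 .. b1 b0`. Then the result is:
    //
    //   ret = a29 a28 .. a1 a0 b31 b30
    //
    // That is, the two most significant bytes of `b` end up in the two least
    // significant bytes of the result, and everything else is `a` shifted up
    // by two bytes.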
    _mm256_alignr_epi8(a, v, 14)
}

/// Shift `a` to the left by three bytes (removing its three most significant
/// bytes), and concatenate it with the three most significant bytes of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_13(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 13)
}

/// Shift `a` to the left by one byte (removing its most significant byte),
/// and concatenate it with the most significant byte of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 15)
}

/// Unpack the given 128-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
#[target_feature(enable = "ssse3")]
pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
    [
        _mm_cvtsi128_si64(a) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64,
    ]
}

/// Unpack the given 256-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
    // Using transmute here is precisely equivalent, but actually slower. It's
    // not quite clear why.
    let lo = _mm256_extracti128_si256(a, 0);
    let hi = _mm256_extracti128_si256(a, 1);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}

/// Unpack the low 128 bits of `a` and `b`, and return them as 4 64-bit
/// integers.
///
/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element
/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits,
/// then the return value is `b2 b1 a2 a1`.
#[target_feature(enable = "avx2")]
pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
    let lo = _mm256_castsi256_si128(a);
    let hi = _mm256_castsi256_si128(b);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}

/// Returns true if and only if all bits in the given 128-bit vector are 0.
#[target_feature(enable = "ssse3")]
pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
    let cmp = _mm_cmpeq_epi8(a, zeroes128());
    _mm_movemask_epi8(cmp) as u32 == 0xFFFF
}

/// Returns true if and only if all bits in the given 256-bit vector are 0.
#[target_feature(enable = "avx2")]
pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
    let cmp = _mm256_cmpeq_epi8(a, zeroes256());
    _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF
}

/// Load a 128-bit vector from slice at the given position. The slice does
/// not need to be aligned.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
#[target_feature(enable = "sse2")]
pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm_loadu_si128(ptr as *const __m128i)
}

/// Load a 256-bit vector from slice at the given position. The slice does
/// not need to be aligned.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
#[target_feature(enable = "avx2")]
pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm256_loadu_si256(ptr as *const __m256i)
}

/// Returns a 128-bit vector with all bits set to 0.
#[target_feature(enable = "sse2")]
pub unsafe fn zeroes128() -> __m128i {
    _mm_set1_epi8(0)
}

/// Returns a 256-bit vector with all bits set to 0.
#[target_feature(enable = "avx2")]
pub unsafe fn zeroes256() -> __m256i {
    _mm256_set1_epi8(0)
}

/// Returns a 128-bit vector with all bits set to 1.
#[target_feature(enable = "sse2")]
pub unsafe fn ones128() -> __m128i {
    _mm_set1_epi8(0xFF as u8 as i8)
}

/// Returns a 256-bit vector with all bits set to 1.
#[target_feature(enable = "avx2")]
pub unsafe fn ones256() -> __m256i {
    _mm256_set1_epi8(0xFF as u8 as i8)
}
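
// The tests below are an editor-added sketch, not part of the original
// module. They assume `std` is available under `cfg(test)`, so that
// `std::is_x86_feature_detected!` can gate the unsafe calls at runtime
// before exercising the AVX2 routines above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn alignr256_14_shifts_left_by_two_bytes() {
        if !std::is_x86_feature_detected!("avx2") {
            return;
        }
        unsafe {
            // Bytes 0..64, so `a` holds bytes 0..32 and `b` holds bytes
            // 32..64 (least significant byte first).
            let mut haystack = [0u8; 64];
            for i in 0..64 {
                haystack[i] = i as u8;
            }
            let a = loadu256(&haystack, 0);
            let b = loadu256(&haystack, 32);
            let r = alignr256_14(a, b);

            let mut got = [0u8; 32];
            _mm256_storeu_si256(got.as_mut_ptr() as *mut __m256i, r);

            // The two most significant bytes of `b` land in the two least
            // significant bytes of the result; the rest is `a` shifted up by
            // two bytes.
            let mut expected = [0u8; 32];
            expected[0] = haystack[62];
            expected[1] = haystack[63];
            for i in 2..32 {
                expected[i] = haystack[i - 2];
            }
            assert_eq!(got, expected);
        }
    }

    #[test]
    fn unpacklo64x256_returns_low_64_bit_lanes() {
        if !std::is_x86_feature_detected!("avx2") {
            return;
        }
        unsafe {
            // `_mm256_set_epi64x` takes its arguments from most to least
            // significant lane, so `a = a4 a3 a2 a1` with `a1 = 1`.
            let a = _mm256_set_epi64x(4, 3, 2, 1);
            let b = _mm256_set_epi64x(8, 7, 6, 5);
            assert_eq!(unpacklo64x256(a, b), [1, 2, 5, 6]);
            assert_eq!(unpack64x256(a), [1, 2, 3, 4]);
        }
    }
}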