// This file contains a set of fairly generic utility functions for working
// with SIMD vectors.
//
// SAFETY: All of the routines below are unsafe to call because they assume
// the necessary CPU target features in order to use particular vendor
// intrinsics. Calling these routines when the underlying CPU does not support
// the appropriate target features is NOT safe. Callers must ensure this
// themselves.
//
// Note that it may not look like this safety invariant is being upheld when
// these routines are called. Namely, the CPU feature check is typically pretty
// far away from when these routines are used. Instead, we rely on the fact
// that certain types serve as a guaranteed receipt that pertinent target
// features are enabled. For example, the only way TeddySlim3Mask256 can be
// constructed is if the AVX2 CPU feature is available. Thus, any code running
// inside of TeddySlim3Mask256 can use any of the functions below without any
// additional checks: its very existence *is* the check.
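//
// A minimal sketch of that "receipt" pattern (with a hypothetical type name;
// TeddySlim3Mask256's real definition lives elsewhere):
//
//     struct Avx2Receipt(());
//
//     impl Avx2Receipt {
//         fn new() -> Option<Avx2Receipt> {
//             if is_x86_feature_detected!("avx2") {
//                 Some(Avx2Receipt(()))
//             } else {
//                 None
//             }
//         }
//     }
//
// Since an Avx2Receipt can only be built after a successful runtime check,
// merely holding one is proof that the avx2 routines below are safe to call.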

use core::arch::x86_64::*;

/// Shift `a` to the left by two bytes (removing its two most significant
/// bytes), and concatenate it with the two most significant bytes of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
    // Credit goes to jneem for figuring this out:
    // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
    //
    // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
    // instructions, which is not what we want, so we need to do some extra
    // shuffling.

    // This permute gives us the low 16 bytes of a concatenated with the high
    // 16 bytes of b, in order of most significant to least significant. So
    // `v = a[15:0] b[31:16]`.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    // This effectively does this (where we deal in terms of byte-indexing
    // and byte-shifting, and use inclusive ranges):
    //
    //   ret[15:0]  := ((a[15:0] << 16) | v[15:0]) >> 14
    //              =  ((a[15:0] << 16) | b[31:16]) >> 14
    //   ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
    //              =  ((a[31:16] << 16) | a[15:0]) >> 14
    //
    // Which therefore results in:
    //
    //   ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30]
    //
    // The end result is that we've effectively done this:
    //
    //   (a << 2) | (b >> 30)
    //
    // When `A` and `B` are strings---where the beginning of the string is in
    // the least significant bits---this corresponds to the following semantic
    // operation:
    //
    //   (A >> 2) | (B << 30)
    //
    // The reversal comes from the fact that the machine is little-endian.
    _mm256_alignr_epi8(a, v, 14)
}
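
// A small sanity check of the semantics described above. This is a sketch,
// not part of the original module; it assumes `std` (and hence runtime
// feature detection) is available under `cfg(test)`.
#[cfg(test)]
#[test]
fn alignr256_14_semantics() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let mut abytes = [0u8; 32];
        let mut bbytes = [0u8; 32];
        for i in 0..32 {
            abytes[i] = i as u8;
            bbytes[i] = 0x40 + i as u8;
        }
        // Byte 0 of each array lands in the least significant lane.
        let a = _mm256_loadu_si256(abytes.as_ptr() as *const __m256i);
        let b = _mm256_loadu_si256(bbytes.as_ptr() as *const __m256i);
        let r = alignr256_14(a, b);
        let mut got = [0u8; 32];
        _mm256_storeu_si256(got.as_mut_ptr() as *mut __m256i, r);
        // (a << 2) | (b >> 30): the two most significant bytes of `b` land
        // in the two least significant lanes, followed by the 30 least
        // significant bytes of `a`.
        assert_eq!(&got[..2], &bbytes[30..]);
        assert_eq!(&got[2..], &abytes[..30]);
    }
}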

/// Shift `a` to the left by three bytes (removing its three most significant
/// bytes), and concatenate it with the three most significant bytes of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_13(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 13)
}

/// Shift `a` to the left by one byte (removing its most significant byte),
/// and concatenate it with the most significant byte of `b`.
#[target_feature(enable = "avx2")]
pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
    // For explanation, see alignr256_14.
    let v = _mm256_permute2x128_si256(b, a, 0x21);
    _mm256_alignr_epi8(a, v, 15)
}

/// Unpack the given 128-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
#[target_feature(enable = "ssse3")]
pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] {
    [
        _mm_cvtsi128_si64(a) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64,
    ]
}
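
// An illustrative check of the lane ordering documented above (a sketch;
// assumes runtime feature detection is available in tests): the first
// returned element is the little-endian u64 built from bytes 0..8.
#[cfg(test)]
#[test]
fn unpack64x128_lane_order() {
    if !is_x86_feature_detected!("ssse3") {
        return;
    }
    unsafe {
        let bytes: [u8; 16] =
            [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0];
        let v = _mm_loadu_si128(bytes.as_ptr() as *const __m128i);
        assert_eq!([1u64, 2], unpack64x128(v));
    }
}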

/// Unpack the given 256-bit vector into its 64-bit components. The first
/// element of the array returned corresponds to the least significant 64-bit
/// lane in `a`.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
    // Using transmute here is precisely equivalent, but actually slower. It's
    // not quite clear why.
    let lo = _mm256_extracti128_si256(a, 0);
    let hi = _mm256_extracti128_si256(a, 1);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}
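
// For reference, the transmute formulation mentioned in the comment above
// would look something like this (shown only as a sketch, since it
// reportedly benchmarks slower):
//
//     core::mem::transmute::<__m256i, [u64; 4]>(a)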

/// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit
/// integers.
///
/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element
/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits,
/// then the return value is `b2 b1 a2 a1`.
#[target_feature(enable = "avx2")]
pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] {
    let lo = _mm256_castsi256_si128(a);
    let hi = _mm256_castsi256_si128(b);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}
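
// An illustrative check of the `b2 b1 a2 a1` ordering documented above (a
// sketch; assumes runtime feature detection is available in tests). Note
// that index 0 of the returned array is the least significant element.
#[cfg(test)]
#[test]
fn unpacklo64x256_ordering() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        // With setr, arguments are given least significant first, so
        // a = a4 a3 a2 a1 = 4 3 2 1 and b = b4 b3 b2 b1 = 8 7 6 5.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        // Expect `b2 b1 a2 a1`, i.e. [a1, a2, b1, b2].
        assert_eq!([1u64, 2, 5, 6], unpacklo64x256(a, b));
    }
}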

/// Returns true if and only if all bits in the given 128-bit vector are 0.
#[target_feature(enable = "ssse3")]
pub unsafe fn is_all_zeroes128(a: __m128i) -> bool {
    let cmp = _mm_cmpeq_epi8(a, zeroes128());
    _mm_movemask_epi8(cmp) as u32 == 0xFFFF
}

/// Returns true if and only if all bits in the given 256-bit vector are 0.
#[target_feature(enable = "avx2")]
pub unsafe fn is_all_zeroes256(a: __m256i) -> bool {
    let cmp = _mm256_cmpeq_epi8(a, zeroes256());
    _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF
}
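
// A tiny check tying the zero tests to the constant vectors below (a
// sketch; assumes runtime feature detection is available in tests):
#[cfg(test)]
#[test]
fn is_all_zeroes256_basics() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        assert!(is_all_zeroes256(zeroes256()));
        assert!(!is_all_zeroes256(ones256()));
    }
}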

/// Load a 128-bit vector from `slice` at the given position. The slice does
/// not need to be aligned, but `slice[at..]` must be at least 16 bytes long,
/// since no bounds check is performed.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
#[target_feature(enable = "sse2")]
pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i {
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm_loadu_si128(ptr as *const __m128i)
}
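
// A small demonstration of the endianness note above (a sketch; assumes
// runtime feature detection is available in tests): shifting the loaded
// vector right by one byte drops the byte at the front of the slice.
#[cfg(test)]
#[test]
fn loadu128_shift_semantics() {
    if !is_x86_feature_detected!("sse2") {
        return;
    }
    unsafe {
        let mut slice = [0u8; 17];
        for i in 0..17 {
            slice[i] = i as u8;
        }
        // v holds slice[1..17], with slice[1] in the least significant lane.
        let v = loadu128(&slice, 1);
        // Shifting the vector right by one byte logically shifts the bytes
        // of the slice left by one: slice[1] falls off the front and a zero
        // fills in at the most significant lane.
        let shifted = _mm_srli_si128(v, 1);
        let mut got = [0u8; 16];
        _mm_storeu_si128(got.as_mut_ptr() as *mut __m128i, shifted);
        assert_eq!(&got[..15], &slice[2..17]);
        assert_eq!(0, got[15]);
    }
}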

/// Load a 256-bit vector from `slice` at the given position. The slice does
/// not need to be aligned, but `slice[at..]` must be at least 32 bytes long,
/// since no bounds check is performed.
///
/// Since this code assumes little-endian (there is no big-endian x86), the
/// bytes starting in `slice[at..]` will be at the least significant bits of
/// the returned vector. This is important for the surrounding code, since for
/// example, shifting the resulting vector right is equivalent to logically
/// shifting the bytes in `slice` left.
#[target_feature(enable = "avx2")]
pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i {
    let ptr = slice.get_unchecked(at..).as_ptr();
    _mm256_loadu_si256(ptr as *const __m256i)
}

/// Returns a 128-bit vector with all bits set to 0.
#[target_feature(enable = "sse2")]
pub unsafe fn zeroes128() -> __m128i {
    _mm_set1_epi8(0)
}

/// Returns a 256-bit vector with all bits set to 0.
#[target_feature(enable = "avx2")]
pub unsafe fn zeroes256() -> __m256i {
    _mm256_set1_epi8(0)
}

/// Returns a 128-bit vector with all bits set to 1.
#[target_feature(enable = "sse2")]
pub unsafe fn ones128() -> __m128i {
    _mm_set1_epi8(0xFF as u8 as i8)
}

/// Returns a 256-bit vector with all bits set to 1.
#[target_feature(enable = "avx2")]
pub unsafe fn ones256() -> __m256i {
    _mm256_set1_epi8(0xFF as u8 as i8)
}