vector.rs source code [crates/aho_corasick/src/packed/vector.rs]

1	// NOTE: The descriptions for each of the vector methods on the traits below
2	// are pretty inscrutable. For this reason, there are tests for every method
3	// on every trait impl below. If you're confused about what an op does,
4	// consult its test. (They probably should be doc tests, but I couldn't figure
5	// out how to write them in a non-annoying way.)
6
7	use core::{
8	fmt::Debug,
9	panic::{RefUnwindSafe, UnwindSafe},
10	};
11
12	/// A trait for describing vector operations used by vectorized searchers.
13	///
14	/// The trait is highly constrained to low level vector operations needed for
15	/// the specific algorithms used in this crate. In general, it was invented
16	/// mostly to be generic over x86's __m128i and __m256i types. At time of
17	/// writing, it also supports wasm and aarch64 128-bit vector types as well.
18	///
19	/// # Safety
20	///
21	/// All methods are not safe since they are intended to be implemented using
22	/// vendor intrinsics, which are also not safe. Callers must ensure that
23	/// the appropriate target features are enabled in the calling function,
24	/// and that the current CPU supports them. All implementations should
25	/// avoid marking the routines with `#[target_feature]` and instead mark
26	/// them as `#[inline(always)]` to ensure they get appropriately inlined.
27	/// (`inline(always)` cannot be used with target_feature.)
28	pub(crate) trait Vector:
29	Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
30	{
31	/// The number of bits in the vector.
32	const BITS: usize;
33	/// The number of bytes in the vector. That is, this is the size of the
34	/// vector in memory.
35	const BYTES: usize;
36
37	/// Create a vector with 8-bit lanes with the given byte repeated into each
38	/// lane.
39	///
40	/// # Safety
41	///
42	/// Callers must ensure that this is okay to call in the current target for
43	/// the current CPU.
44	unsafe fn splat(byte: u8) -> Self;
45
46	/// Read a vector-size number of bytes from the given pointer. The pointer
47	/// does not need to be aligned.
48	///
49	/// # Safety
50	///
51	/// Callers must ensure that this is okay to call in the current target for
52	/// the current CPU.
53	///
54	/// Callers must guarantee that at least `BYTES` bytes are readable from
55	/// `data`.
56	unsafe fn load_unaligned(data: *const u8) -> Self;
57
58	/// Returns true if and only if this vector has zero in all of its lanes.
59	///
60	/// # Safety
61	///
62	/// Callers must ensure that this is okay to call in the current target for
63	/// the current CPU.
64	unsafe fn is_zero(self) -> bool;
65
66	/// Do an 8-bit pairwise equality check. If lane `i` is equal in this
67	/// vector and the one given, then lane `i` in the resulting vector is set
68	/// to `0xFF`. Otherwise, it is set to `0x00`.
69	///
70	/// # Safety
71	///
72	/// Callers must ensure that this is okay to call in the current target for
73	/// the current CPU.
74	unsafe fn cmpeq(self, vector2: Self) -> Self;
75
76	/// Perform a bitwise 'and' of this vector and the one given and return
77	/// the result.
78	///
79	/// # Safety
80	///
81	/// Callers must ensure that this is okay to call in the current target for
82	/// the current CPU.
83	unsafe fn and(self, vector2: Self) -> Self;
84
85	/// Perform a bitwise 'or' of this vector and the one given and return
86	/// the result.
87	///
88	/// # Safety
89	///
90	/// Callers must ensure that this is okay to call in the current target for
91	/// the current CPU.
92	#[allow(dead_code)] // unused, but useful enough to keep around?
93	unsafe fn or(self, vector2: Self) -> Self;
94
95	/// Shift each 8-bit lane in this vector to the right by the number of
96	/// bits indicated by the `BITS` const parameter.
97	///
98	/// # Safety
99	///
100	/// Callers must ensure that this is okay to call in the current target for
101	/// the current CPU.
102	unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;
103
104	/// Shift this vector to the left by one byte and shift the most
105	/// significant byte of `vector2` into the least significant position of
106	/// this vector.
107	///
108	/// Stated differently, this behaves as if `self` and `vector2` were
109	/// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
110	/// right by `Self::BYTES - 1` bytes.
111	///
112	/// With respect to the Teddy algorithm, `vector2` is usually a previous
113	/// `Self::BYTES` chunk from the haystack and `self` is the chunk
114	/// immediately following it. This permits combining the last byte
115	/// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
116	/// bytes from the current chunk. This permits aligning the result of
117	/// various shuffles so that they can be and-ed together and a possible
118	/// candidate discovered.
119	///
120	/// # Safety
121	///
122	/// Callers must ensure that this is okay to call in the current target for
123	/// the current CPU.
124	unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;
125
126	/// Shift this vector to the left by two bytes and shift the two most
127	/// significant bytes of `vector2` into the least significant position of
128	/// this vector.
129	///
130	/// Stated differently, this behaves as if `self` and `vector2` were
131	/// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
132	/// right by `Self::BYTES - 2` bytes.
133	///
134	/// With respect to the Teddy algorithm, `vector2` is usually a previous
135	/// `Self::BYTES` chunk from the haystack and `self` is the chunk
136	/// immediately following it. This permits combining the last two bytes
137	/// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
138	/// bytes from the current chunk. This permits aligning the result of
139	/// various shuffles so that they can be and-ed together and a possible
140	/// candidate discovered.
141	///
142	/// # Safety
143	///
144	/// Callers must ensure that this is okay to call in the current target for
145	/// the current CPU.
146	unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;
147
148	/// Shift this vector to the left by three bytes and shift the three most
149	/// significant bytes of `vector2` into the least significant position of
150	/// this vector.
151	///
152	/// Stated differently, this behaves as if `self` and `vector2` were
153	/// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
154	/// right by `Self::BYTES - 3` bytes.
155	///
156	/// With respect to the Teddy algorithm, `vector2` is usually a previous
157	/// `Self::BYTES` chunk from the haystack and `self` is the chunk
158	/// immediately following it. This permits combining the last three bytes
159	/// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
160	/// bytes from the current chunk. This permits aligning the result of
161	/// various shuffles so that they can be and-ed together and a possible
162	/// candidate discovered.
163	///
164	/// # Safety
165	///
166	/// Callers must ensure that this is okay to call in the current target for
167	/// the current CPU.
168	unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;
169
170	/// Shuffles the bytes in this vector according to the indices in each of
171	/// the corresponding lanes in `indices`.
172	///
173	/// If `i` is the index of corresponding lanes, `A` is this vector, `B` is
174	/// indices and `C` is the resulting vector, then `C[i] = A[B[i]]`.
175	///
176	/// # Safety
177	///
178	/// Callers must ensure that this is okay to call in the current target for
179	/// the current CPU.
180	unsafe fn shuffle_bytes(self, indices: Self) -> Self;
181
182	/// Call the provided function for each 64-bit lane in this vector. The
183	/// given function is provided the lane index and lane value as a `u64`.
184	///
185	/// If `f` returns `Some`, then iteration over the lanes is stopped and the
186	/// value is returned. Otherwise, this returns `None`.
187	///
188	/// # Notes
189	///
190	/// Conceptually it would be nice if we could have a
191	/// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
192	/// tricky given Rust's [current support for const generics][support].
193	/// And even if we could, it would be tricky to write generic code over
194	/// it. (Not impossible. We could introduce another layer that requires
195	/// `AsRef<[u64]>` or something.)
196	///
197	/// [support]: https://github.com/rust-lang/rust/issues/60551
198	///
199	/// # Safety
200	///
201	/// Callers must ensure that this is okay to call in the current target for
202	/// the current CPU.
203	unsafe fn for_each_64bit_lane<T>(
204	self,
205	f: impl FnMut(usize, u64) -> Option<T>,
206	) -> Option<T>;
207	}
208
209	/// This trait extends the `Vector` trait with additional operations to support
210	/// Fat Teddy.
211	///
212	/// Fat Teddy uses 16 buckets instead of 8, but reads half as many bytes (as
213	/// the vector size) instead of the full size of a vector per iteration. For
214	/// example, when using a 256-bit vector, Slim Teddy reads 32 bytes at a time
215	/// but Fat Teddy reads 16 bytes at a time.
216	///
217	/// Fat Teddy is useful when searching for a large number of literals.
218	/// The extra number of buckets spreads the literals out more and reduces
219	/// verification time.
220	///
221	/// Currently we only implement this for AVX on x86_64. It would be nice to
222	/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
223	/// only reading 8 bytes at a time. It's not clear how well it would work, but
224	/// there are some tricky things to figure out in terms of implementation. The
225	/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
226	/// the trickiest of the bunch. For AVX2, these are implemented by taking
227	/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
228	/// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8`
229	/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't
230	/// do a careful survey of NEON to see if it could easily support these
231	/// operations.
232	pub(crate) trait FatVector: Vector {
233	type Half: Vector;
234
235	/// Read a half-vector-size number of bytes from the given pointer, and
236	/// broadcast it across both halves of a full vector. The pointer does not
237	/// need to be aligned.
238	///
239	/// # Safety
240	///
241	/// Callers must ensure that this is okay to call in the current target for
242	/// the current CPU.
243	///
244	/// Callers must guarantee that at least `Self::Half::BYTES` bytes are
245	/// readable from `data`.
246	unsafe fn load_half_unaligned(data: *const u8) -> Self;
247
248	/// Like `Vector::shift_in_one_byte`, except this is done for each half
249	/// of the vector instead.
250	///
251	/// # Safety
252	///
253	/// Callers must ensure that this is okay to call in the current target for
254	/// the current CPU.
255	unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;
256
257	/// Like `Vector::shift_in_two_bytes`, except this is done for each half
258	/// of the vector instead.
259	///
260	/// # Safety
261	///
262	/// Callers must ensure that this is okay to call in the current target for
263	/// the current CPU.
264	unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;
265
266	/// Like `Vector::shift_in_three_bytes`, except this is done for each half
267	/// of the vector instead.
268	///
269	/// # Safety
270	///
271	/// Callers must ensure that this is okay to call in the current target for
272	/// the current CPU.
273	unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;
274
275	/// Swap the 128-bit lanes in this vector.
276	///
277	/// # Safety
278	///
279	/// Callers must ensure that this is okay to call in the current target for
280	/// the current CPU.
281	unsafe fn swap_halves(self) -> Self;
282
283	/// Unpack and interleave the 8-bit lanes from the low 128 bits of each
284	/// vector and return the result.
285	///
286	/// # Safety
287	///
288	/// Callers must ensure that this is okay to call in the current target for
289	/// the current CPU.
290	unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;
291
292	/// Unpack and interleave the 8-bit lanes from the high 128 bits of each
293	/// vector and return the result.
294	///
295	/// # Safety
296	///
297	/// Callers must ensure that this is okay to call in the current target for
298	/// the current CPU.
299	unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;
300
301	/// Call the provided function for each 64-bit lane in the lower half
302	/// of this vector and then in the other vector. The given function is
303	/// provided the lane index and lane value as a `u64`. (The high 128-bits
304	/// of each vector are ignored.)
305	///
306	/// If `f` returns `Some`, then iteration over the lanes is stopped and the
307	/// value is returned. Otherwise, this returns `None`.
308	///
309	/// # Safety
310	///
311	/// Callers must ensure that this is okay to call in the current target for
312	/// the current CPU.
313	unsafe fn for_each_low_64bit_lane<T>(
314	self,
315	vector2: Self,
316	f: impl FnMut(usize, u64) -> Option<T>,
317	) -> Option<T>;
318	}
319
320	#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
321	mod x86_64_ssse3 {
322	use core::arch::x86_64::*;
323
324	use crate::util::int::{I32, I8};
325
326	use super::Vector;
327
328	impl Vector for __m128i {
329	const BITS: usize = `128`;
330	const BYTES: usize = `16`;
331
332	#[inline(always)]
333	unsafe fn splat(byte: u8) -> __m128i {
334	_mm_set1_epi8(i8::from_bits(byte))
335	}
336
337	#[inline(always)]
338	unsafe fn load_unaligned(data: *const u8) -> __m128i {
339	_mm_loadu_si128(data.cast::<__m128i>())
340	}
341
342	#[inline(always)]
343	unsafe fn is_zero(self) -> bool {
344	let cmp = self.cmpeq(Self::splat(`0`));
345	_mm_movemask_epi8(cmp).to_bits() == `0xFFFF`
346	}
347
348	#[inline(always)]
349	unsafe fn cmpeq(self, vector2: Self) -> __m128i {
350	_mm_cmpeq_epi8(self, vector2)
351	}
352
353	#[inline(always)]
354	unsafe fn and(self, vector2: Self) -> __m128i {
355	_mm_and_si128(self, vector2)
356	}
357
358	#[inline(always)]
359	unsafe fn or(self, vector2: Self) -> __m128i {
360	_mm_or_si128(self, vector2)
361	}
362
363	#[inline(always)]
364	unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
365	// Apparently there is no _mm_srli_epi8, so we emulate it by
366	// shifting 16-bit integers and masking out the high nybble of each
367	// 8-bit lane (since that nybble will contain bits from the low
368	// nybble of the previous lane).
369	let lomask = Self::splat(`0xF`);
370	_mm_srli_epi16(self, BITS).and(lomask)
371	}
372
373	#[inline(always)]
374	unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
375	_mm_alignr_epi8(self, vector2, `15`)
376	}
377
378	#[inline(always)]
379	unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
380	_mm_alignr_epi8(self, vector2, `14`)
381	}
382
383	#[inline(always)]
384	unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
385	_mm_alignr_epi8(self, vector2, `13`)
386	}
387
388	#[inline(always)]
389	unsafe fn shuffle_bytes(self, indices: Self) -> Self {
390	_mm_shuffle_epi8(self, indices)
391	}
392
393	#[inline(always)]
394	unsafe fn for_each_64bit_lane<T>(
395	self,
396	mut f: impl FnMut(usize, u64) -> Option<T>,
397	) -> Option<T> {
398	// We could just use _mm_extract_epi64 here, but that requires
399	// SSE 4.1. It isn't necessarily a problem to just require SSE 4.1,
400	// but everything else works with SSSE3 so we stick to that subset.
401	let lanes: [u64; `2`] = core::mem::transmute(self);
402	if let Some(t) = f(`0`, lanes[`0`]) {
403	return Some(t);
404	}
405	if let Some(t) = f(`1`, lanes[`1`]) {
406	return Some(t);
407	}
408	None
409	}
410	}
411	}
412
413	#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
414	mod x86_64_avx2 {
415	use core::arch::x86_64::*;
416
417	use crate::util::int::{I32, I64, I8};
418
419	use super::{FatVector, Vector};
420
421	impl Vector for __m256i {
422	const BITS: usize = `256`;
423	const BYTES: usize = `32`;
424
425	#[inline(always)]
426	unsafe fn splat(byte: u8) -> __m256i {
427	_mm256_set1_epi8(i8::from_bits(byte))
428	}
429
430	#[inline(always)]
431	unsafe fn load_unaligned(data: *const u8) -> __m256i {
432	_mm256_loadu_si256(data.cast::<__m256i>())
433	}
434
435	#[inline(always)]
436	unsafe fn is_zero(self) -> bool {
437	let cmp = self.cmpeq(Self::splat(`0`));
438	_mm256_movemask_epi8(cmp).to_bits() == `0xFFFFFFFF`
439	}
440
441	#[inline(always)]
442	unsafe fn cmpeq(self, vector2: Self) -> __m256i {
443	_mm256_cmpeq_epi8(self, vector2)
444	}
445
446	#[inline(always)]
447	unsafe fn and(self, vector2: Self) -> __m256i {
448	_mm256_and_si256(self, vector2)
449	}
450
451	#[inline(always)]
452	unsafe fn or(self, vector2: Self) -> __m256i {
453	_mm256_or_si256(self, vector2)
454	}
455
456	#[inline(always)]
457	unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
458	let lomask = Self::splat(`0xF`);
459	_mm256_srli_epi16(self, BITS).and(lomask)
460	}
461
462	#[inline(always)]
463	unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
464	// Credit goes to jneem for figuring this out:
465	// https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
466	//
467	// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
468	// PALIGNR instructions, which is not what we want, so we need to
469	// do some extra shuffling.
470	let v = _mm256_permute2x128_si256(vector2, self, `0x21`);
471	_mm256_alignr_epi8(self, v, `15`)
472	}
473
474	#[inline(always)]
475	unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
476	// Credit goes to jneem for figuring this out:
477	// https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
478	//
479	// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
480	// PALIGNR instructions, which is not what we want, so we need to
481	// do some extra shuffling.
482	let v = _mm256_permute2x128_si256(vector2, self, `0x21`);
483	_mm256_alignr_epi8(self, v, `14`)
484	}
485
486	#[inline(always)]
487	unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
488	// Credit goes to jneem for figuring this out:
489	// https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
490	//
491	// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
492	// PALIGNR instructions, which is not what we want, so we need to
493	// do some extra shuffling.
494	let v = _mm256_permute2x128_si256(vector2, self, `0x21`);
495	_mm256_alignr_epi8(self, v, `13`)
496	}
497
498	#[inline(always)]
499	unsafe fn shuffle_bytes(self, indices: Self) -> Self {
500	_mm256_shuffle_epi8(self, indices)
501	}
502
503	#[inline(always)]
504	unsafe fn for_each_64bit_lane<T>(
505	self,
506	mut f: impl FnMut(usize, u64) -> Option<T>,
507	) -> Option<T> {
508	// NOTE: At one point in the past, I used transmute to this to
509	// get a [u64; 4], but it turned out to lead to worse codegen IIRC.
510	// I've tried it more recently, and it looks like that's no longer
511	// the case. But since there's no difference, we stick with the
512	// slightly more complicated but transmute-free version.
513	let lane = _mm256_extract_epi64(self, `0`).to_bits();
514	if let Some(t) = f(`0`, lane) {
515	return Some(t);
516	}
517	let lane = _mm256_extract_epi64(self, `1`).to_bits();
518	if let Some(t) = f(`1`, lane) {
519	return Some(t);
520	}
521	let lane = _mm256_extract_epi64(self, `2`).to_bits();
522	if let Some(t) = f(`2`, lane) {
523	return Some(t);
524	}
525	let lane = _mm256_extract_epi64(self, `3`).to_bits();
526	if let Some(t) = f(`3`, lane) {
527	return Some(t);
528	}
529	None
530	}
531	}
532
533	impl FatVector for __m256i {
534	type Half = __m128i;
535
536	#[inline(always)]
537	unsafe fn load_half_unaligned(data: *const u8) -> Self {
538	let half = Self::Half::load_unaligned(data);
539	_mm256_broadcastsi128_si256(half)
540	}
541
542	#[inline(always)]
543	unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
544	_mm256_alignr_epi8(self, vector2, `15`)
545	}
546
547	#[inline(always)]
548	unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
549	_mm256_alignr_epi8(self, vector2, `14`)
550	}
551
552	#[inline(always)]
553	unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
554	_mm256_alignr_epi8(self, vector2, `13`)
555	}
556
557	#[inline(always)]
558	unsafe fn swap_halves(self) -> Self {
559	_mm256_permute4x64_epi64(self, `0x4E`)
560	}
561
562	#[inline(always)]
563	unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
564	_mm256_unpacklo_epi8(self, vector2)
565	}
566
567	#[inline(always)]
568	unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
569	_mm256_unpackhi_epi8(self, vector2)
570	}
571
572	#[inline(always)]
573	unsafe fn for_each_low_64bit_lane<T>(
574	self,
575	vector2: Self,
576	mut f: impl FnMut(usize, u64) -> Option<T>,
577	) -> Option<T> {
578	let lane = _mm256_extract_epi64(self, `0`).to_bits();
579	if let Some(t) = f(`0`, lane) {
580	return Some(t);
581	}
582	let lane = _mm256_extract_epi64(self, `1`).to_bits();
583	if let Some(t) = f(`1`, lane) {
584	return Some(t);
585	}
586	let lane = _mm256_extract_epi64(vector2, `0`).to_bits();
587	if let Some(t) = f(`2`, lane) {
588	return Some(t);
589	}
590	let lane = _mm256_extract_epi64(vector2, `1`).to_bits();
591	if let Some(t) = f(`3`, lane) {
592	return Some(t);
593	}
594	None
595	}
596	}
597	}
598
#[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    target_endian = "little"
))]
mod aarch64_neon {
    use core::arch::aarch64::*;

    use super::Vector;

    /// `Vector` for 128-bit aarch64 vectors of sixteen `u8` lanes, built on
    /// NEON intrinsics.
    impl Vector for uint8x16_t {
        const BITS: usize = 128;
        const BYTES: usize = 16;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> Self {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> Self {
            vld1q_u8(data)
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            // Could also use vmaxvq_u8.
            // ... I tried that and couldn't observe any meaningful difference
            // in benchmarks.
            //
            // The pairwise max of the vector with itself folds all 16 lanes
            // into the low 8 bytes, so checking the low 64-bit lane suffices.
            let folded = vpmaxq_u8(self, self);
            let as_u64 = vreinterpretq_u64_u8(folded);
            vgetq_lane_u64(as_u64, 0) == 0
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> Self {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> Self {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> Self {
            vorrq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            debug_assert!(BITS <= 7);
            vshrq_n_u8(self, BITS)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            // Take the last byte of `vector2` followed by the first 15 bytes
            // of `self`.
            vextq_u8(vector2, self, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            vqtbl1q_u8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            let wide = vreinterpretq_u64_u8(self);
            // Stop at the first lane for which the callback yields a value.
            if let found @ Some(_) = f(0, vgetq_lane_u64(wide, 0)) {
                return found;
            }
            f(1, vgetq_lane_u64(wide, 1))
        }
    }
}
691
#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod tests_x86_64_ssse3 {
    use core::arch::x86_64::*;

    use super::*;

    /// Returns true when the running CPU supports SSSE3, which is what every
    /// test in this module requires.
    fn is_runnable() -> bool {
        std::is_x86_feature_detected!("ssse3")
    }

    /// Loads the given 16 bytes into a vector, lane `i` holding `lanes[i]`.
    #[target_feature(enable = "ssse3")]
    unsafe fn load(lanes: [u8; 16]) -> __m128i {
        __m128i::load_unaligned(&lanes as *const u8)
    }

    /// Copies the 16 lanes of the vector back out into an array.
    #[target_feature(enable = "ssse3")]
    unsafe fn unload(v: __m128i) -> [u8; 16] {
        // Use an unaligned store rather than `_mm_extract_epi8`: the latter
        // is an SSE 4.1 intrinsic, but `is_runnable` only guarantees SSSE3.
        // The store needs nothing beyond SSE2 and yields the same lane order.
        let mut lanes = [0u8; 16];
        _mm_storeu_si128(lanes.as_mut_ptr().cast::<__m128i>(), v);
        lanes
    }

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = __m128i::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(!v.is_zero());
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(v.is_zero());
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
            let v2 =
                load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.and(v2)),
                [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.or(v2)),
                [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 =
                load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut lanes = [0u64; 2];
            // Returning `None` from the callback keeps the iteration going,
            // so both lanes get recorded.
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }
}
940
941	#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
942	mod tests_x86_64_avx2 {
943	use core::arch::x86_64::*;
944
945	use crate::util::int::{I32, U32};
946
947	use super::*;
948
949	fn is_runnable() -> bool {
950	std::is_x86_feature_detected!("avx2")
951	}
952
953	#[target_feature(enable = "avx2")]
954	unsafe fn load(lanes: [u8; `32`]) -> __m256i {
955	__m256i::load_unaligned(&lanes as *const u8)
956	}
957
958	#[target_feature(enable = "avx2")]
959	unsafe fn load_half(lanes: [u8; `16`]) -> __m256i {
960	__m256i::load_half_unaligned(&lanes as *const u8)
961	}
962
963	#[target_feature(enable = "avx2")]
964	unsafe fn unload(v: __m256i) -> [u8; `32`] {
965	[
966	_mm256_extract_epi8(v, `0`).to_bits().low_u8(),
967	_mm256_extract_epi8(v, `1`).to_bits().low_u8(),
968	_mm256_extract_epi8(v, `2`).to_bits().low_u8(),
969	_mm256_extract_epi8(v, `3`).to_bits().low_u8(),
970	_mm256_extract_epi8(v, `4`).to_bits().low_u8(),
971	_mm256_extract_epi8(v, `5`).to_bits().low_u8(),
972	_mm256_extract_epi8(v, `6`).to_bits().low_u8(),
973	_mm256_extract_epi8(v, `7`).to_bits().low_u8(),
974	_mm256_extract_epi8(v, `8`).to_bits().low_u8(),
975	_mm256_extract_epi8(v, `9`).to_bits().low_u8(),
976	_mm256_extract_epi8(v, `10`).to_bits().low_u8(),
977	_mm256_extract_epi8(v, `11`).to_bits().low_u8(),
978	_mm256_extract_epi8(v, `12`).to_bits().low_u8(),
979	_mm256_extract_epi8(v, `13`).to_bits().low_u8(),
980	_mm256_extract_epi8(v, `14`).to_bits().low_u8(),
981	_mm256_extract_epi8(v, `15`).to_bits().low_u8(),
982	_mm256_extract_epi8(v, `16`).to_bits().low_u8(),
983	_mm256_extract_epi8(v, `17`).to_bits().low_u8(),
984	_mm256_extract_epi8(v, `18`).to_bits().low_u8(),
985	_mm256_extract_epi8(v, `19`).to_bits().low_u8(),
986	_mm256_extract_epi8(v, `20`).to_bits().low_u8(),
987	_mm256_extract_epi8(v, `21`).to_bits().low_u8(),
988	_mm256_extract_epi8(v, `22`).to_bits().low_u8(),
989	_mm256_extract_epi8(v, `23`).to_bits().low_u8(),
990	_mm256_extract_epi8(v, `24`).to_bits().low_u8(),
991	_mm256_extract_epi8(v, `25`).to_bits().low_u8(),
992	_mm256_extract_epi8(v, `26`).to_bits().low_u8(),
993	_mm256_extract_epi8(v, `27`).to_bits().low_u8(),
994	_mm256_extract_epi8(v, `28`).to_bits().low_u8(),
995	_mm256_extract_epi8(v, `29`).to_bits().low_u8(),
996	_mm256_extract_epi8(v, `30`).to_bits().low_u8(),
997	_mm256_extract_epi8(v, `31`).to_bits().low_u8(),
998	]
999	}
1000
/// `splat` must repeat the given byte into all 32 lanes.
#[test]
fn vector_splat() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let expected = [0xAFu8; 32];
        let actual = unload(__m256i::splat(0xAF));
        assert_eq!(actual, expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1021
/// `is_zero` must be false as soon as a single lane is non-zero and true
/// when all 32 lanes are zero.
#[test]
fn vector_is_zero() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        // One set byte (lane 5) is enough to make the vector non-zero.
        let mut bytes = [0u8; 32];
        bytes[5] = 1;
        let v = load(bytes);
        assert!(!v.is_zero());

        let v = load([0u8; 32]);
        assert!(v.is_zero());
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1042
/// `cmpeq` must produce `0xFF` in every lane where both operands hold the
/// same byte and `0` everywhere else. Only the last lane matches here.
#[test]
fn vector_cmpeq() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
        ]);
        let rhs = load([
            32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
        ]);
        let mut expected = [0u8; 32];
        expected[31] = 0xFF;
        assert_eq!(unload(lhs.cmpeq(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1068
/// `and` must compute the bitwise AND of matching lanes:
/// `0b1001 & 0b1010 == 0b1000` in lane 5, zero elsewhere.
#[test]
fn vector_and() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let mut lhs = [0u8; 32];
        lhs[5] = 0b1001;
        let mut rhs = [0u8; 32];
        rhs[5] = 0b1010;
        let mut expected = [0u8; 32];
        expected[5] = 0b1000;

        let (lhs, rhs) = (load(lhs), load(rhs));
        assert_eq!(unload(lhs.and(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1094
/// `or` must compute the bitwise OR of matching lanes:
/// `0b1001 | 0b1010 == 0b1011` in lane 5, zero elsewhere.
#[test]
fn vector_or() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let mut lhs = [0u8; 32];
        lhs[5] = 0b1001;
        let mut rhs = [0u8; 32];
        rhs[5] = 0b1010;
        let mut expected = [0u8; 32];
        expected[5] = 0b1011;

        let (lhs, rhs) = (load(lhs), load(rhs));
        assert_eq!(unload(lhs.or(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1120
/// `shift_8bit_lane_right::<2>` must shift every 8-bit lane right by two
/// bits independently: `0b1011 -> 0b0010` and `0b0101 -> 0b0001`.
#[test]
fn vector_shift_8bit_lane_right() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let mut input = [0u8; 32];
        input[4] = 0b1011;
        input[5] = 0b0101;
        let mut expected = [0u8; 32];
        expected[4] = 0b0010;
        expected[5] = 0b0001;

        let shifted = load(input).shift_8bit_lane_right::<2>();
        assert_eq!(unload(shifted), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1142
/// `shift_in_one_byte` must move the bytes of `self` up by one position
/// across the whole 32-byte vector and fill lane 0 with the last byte of
/// the argument.
#[test]
fn vector_shift_in_one_byte() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let rhs = load([
            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
        ]);
        let expected: [u8; 32] = [
            64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        ];
        assert_eq!(unload(lhs.shift_in_one_byte(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1170
/// `shift_in_two_bytes` must move the bytes of `self` up by two positions
/// across the whole 32-byte vector and fill lanes 0-1 with the last two
/// bytes of the argument.
#[test]
fn vector_shift_in_two_bytes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let rhs = load([
            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
        ]);
        let expected: [u8; 32] = [
            63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
        ];
        assert_eq!(unload(lhs.shift_in_two_bytes(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1198
/// `shift_in_three_bytes` must move the bytes of `self` up by three
/// positions across the whole 32-byte vector and fill lanes 0-2 with the
/// last three bytes of the argument.
#[test]
fn vector_shift_in_three_bytes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let rhs = load([
            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
        ]);
        let expected: [u8; 32] = [
            62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
            14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
        ];
        assert_eq!(unload(lhs.shift_in_three_bytes(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1226
/// `shuffle_bytes` must fill every result lane with the byte of `self`
/// found at the position named by the matching lane of the index vector
/// (for the indices exercised here).
#[test]
fn vector_shuffle_bytes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let table = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let indices = load([
            0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
            16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
        ]);
        let expected: [u8; 32] = [
            1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
            17, 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29, 29,
        ];
        assert_eq!(unload(table.shuffle_bytes(indices)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1253
/// `for_each_64bit_lane` must hand the callback all four 64-bit lanes of a
/// 32-byte vector together with their indices. The first byte of a lane
/// ends up as the least significant byte of the reported `u64`. The
/// callback always returns `None`, and every lane is still visited.
#[test]
fn vector_for_each_64bit_lane() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let bytes: [u8; 32] = [
            0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
            0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
            0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20,
        ];
        let expected: [u64; 4] = [
            0x0807060504030201,
            0x100F0E0D0C0B0A09,
            0x1817161514131211,
            0x201F1E1D1C1B1A19,
        ];
        let mut seen = [0u64; 4];
        load(bytes).for_each_64bit_lane(|index, value| {
            seen[index] = value;
            None::<()>
        });
        assert_eq!(seen, expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1284
/// `half_shift_in_one_byte` on two half-loaded vectors: each 16-byte half
/// of the result holds the last byte of the argument followed by the first
/// 15 bytes of `self`.
#[test]
fn fat_vector_half_shift_in_one_byte() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load_half([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let rhs = load_half([
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 32] = [
            32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        ];
        assert_eq!(unload(lhs.half_shift_in_one_byte(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1308
/// `half_shift_in_two_bytes` on two half-loaded vectors: each 16-byte half
/// of the result holds the last two bytes of the argument followed by the
/// first 14 bytes of `self`.
#[test]
fn fat_vector_half_shift_in_two_bytes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load_half([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let rhs = load_half([
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 32] = [
            31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
            31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        ];
        assert_eq!(unload(lhs.half_shift_in_two_bytes(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1332
/// `half_shift_in_three_bytes` on two half-loaded vectors: each 16-byte
/// half of the result holds the last three bytes of the argument followed
/// by the first 13 bytes of `self`.
#[test]
fn fat_vector_half_shift_in_three_bytes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load_half([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let rhs = load_half([
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 32] = [
            30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
            30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        ];
        assert_eq!(unload(lhs.half_shift_in_three_bytes(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1356
/// `swap_halves` must exchange the low and high 16-byte halves of the
/// vector while keeping the byte order inside each half.
#[test]
fn fat_vector_swap_halves() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let input = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 32] = [
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ];
        assert_eq!(unload(input.swap_halves()), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1379
/// `interleave_low_8bit_lanes` works per 16-byte half: it alternates the
/// first eight bytes of that half of `self` with the first eight bytes of
/// the same half of the argument.
#[test]
fn fat_vector_interleave_low_8bit_lanes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let rhs = load([
            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
        ]);
        let expected: [u8; 32] = [
            1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
            17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56,
        ];
        assert_eq!(unload(lhs.interleave_low_8bit_lanes(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1407
/// `interleave_high_8bit_lanes` works per 16-byte half: it alternates the
/// last eight bytes of that half of `self` with the last eight bytes of
/// the same half of the argument.
#[test]
fn fat_vector_interleave_high_8bit_lanes() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let rhs = load([
            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
        ]);
        let expected: [u8; 32] = [
            9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48,
            25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 32, 64,
        ];
        assert_eq!(unload(lhs.interleave_high_8bit_lanes(rhs)), expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1435
/// `for_each_low_64bit_lane` must report four lanes: indices 0 and 1 are
/// the two 64-bit lanes of the low 16 bytes of `self`, indices 2 and 3 the
/// two 64-bit lanes of the low 16 bytes of the argument. The high halves
/// of both vectors are never reported.
#[test]
fn fat_vector_for_each_low_64bit_lane() {
    #[target_feature(enable = "avx2")]
    unsafe fn check() {
        let lhs = load([
            0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
            0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
            0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20,
        ]);
        let rhs = load([
            0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
            0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30,
            0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
            0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40,
        ]);
        let expected: [u64; 4] = [
            0x0807060504030201,
            0x100F0E0D0C0B0A09,
            0x2827262524232221,
            0x302F2E2D2C2B2A29,
        ];
        let mut seen = [0u64; 4];
        lhs.for_each_low_64bit_lane(rhs, |index, value| {
            seen[index] = value;
            None::<()>
        });
        assert_eq!(seen, expected);
    }
    if is_runnable() {
        // SAFETY: `is_runnable` confirmed at runtime that this CPU supports
        // AVX2, the only feature `check` enables.
        unsafe { check() }
    }
}
1472	}
1473
1474	#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
1475	mod tests_aarch64_neon {
1476	use core::arch::aarch64::*;
1477
1478	use super::*;
1479
1480	#[target_feature(enable = "neon")]
1481	unsafe fn load(lanes: [u8; `16`]) -> uint8x16_t {
1482	uint8x16_t::load_unaligned(&lanes as *const u8)
1483	}
1484
1485	#[target_feature(enable = "neon")]
1486	unsafe fn unload(v: uint8x16_t) -> [u8; `16`] {
1487	[
1488	vgetq_lane_u8(v, `0`),
1489	vgetq_lane_u8(v, `1`),
1490	vgetq_lane_u8(v, `2`),
1491	vgetq_lane_u8(v, `3`),
1492	vgetq_lane_u8(v, `4`),
1493	vgetq_lane_u8(v, `5`),
1494	vgetq_lane_u8(v, `6`),
1495	vgetq_lane_u8(v, `7`),
1496	vgetq_lane_u8(v, `8`),
1497	vgetq_lane_u8(v, `9`),
1498	vgetq_lane_u8(v, `10`),
1499	vgetq_lane_u8(v, `11`),
1500	vgetq_lane_u8(v, `12`),
1501	vgetq_lane_u8(v, `13`),
1502	vgetq_lane_u8(v, `14`),
1503	vgetq_lane_u8(v, `15`),
1504	]
1505	}
1506
1507	// Example functions. These don't test the Vector traits, but rather,
1508	// specific NEON instructions. They are basically little experiments I
1509	// wrote to figure out what an instruction does since their descriptions
1510	// are so dense. I decided to keep the experiments around as example tests
// in case they're useful.
1512
1513	#[test]
1514	fn example_vmaxvq_u8_non_zero() {
1515	#[target_feature(enable = "neon")]
1516	unsafe fn example() {
1517	let v = load([`0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`]);
1518	assert_eq!(vmaxvq_u8(v), `1`);
1519	}
1520	unsafe { example() }
1521	}
1522
1523	#[test]
1524	fn example_vmaxvq_u8_zero() {
1525	#[target_feature(enable = "neon")]
1526	unsafe fn example() {
1527	let v = load([`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`]);
1528	assert_eq!(vmaxvq_u8(v), `0`);
1529	}
1530	unsafe { example() }
1531	}
1532
1533	#[test]
1534	fn example_vpmaxq_u8_non_zero() {
1535	#[target_feature(enable = "neon")]
1536	unsafe fn example() {
1537	let v = load([`0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`]);
1538	let r = vpmaxq_u8(v, v);
1539	assert_eq!(
1540	unload(r),
1541	[`0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`]
1542	);
1543	}
1544	unsafe { example() }
1545	}
1546
1547	#[test]
1548	fn example_vpmaxq_u8_self() {
1549	#[target_feature(enable = "neon")]
1550	unsafe fn example() {
1551	let v =
1552	load([`1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`, `16`]);
1553	let r = vpmaxq_u8(v, v);
1554	assert_eq!(
1555	unload(r),
1556	[`2`, `4`, `6`, `8`, `10`, `12`, `14`, `16`, `2`, `4`, `6`, `8`, `10`, `12`, `14`, `16`]
1557	);
1558	}
1559	unsafe { example() }
1560	}
1561
1562	#[test]
1563	fn example_vpmaxq_u8_other() {
1564	#[target_feature(enable = "neon")]
1565	unsafe fn example() {
1566	let v1 =
1567	load([`1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`, `16`]);
1568	let v2 = load([
1569	`17`, `18`, `19`, `20`, `21`, `22`, `23`, `24`, `25`, `26`, `27`, `28`, `29`, `30`, `31`, `32`,
1570	]);
1571	let r = vpmaxq_u8(v1, v2);
1572	assert_eq!(
1573	unload(r),
1574	[`2`, `4`, `6`, `8`, `10`, `12`, `14`, `16`, `18`, `20`, `22`, `24`, `26`, `28`, `30`, `32`]
1575	);
1576	}
1577	unsafe { example() }
1578	}
1579
1580	// Now we test the actual methods on the Vector trait.
1581
1582	#[test]
1583	fn vector_splat() {
1584	#[target_feature(enable = "neon")]
1585	unsafe fn test() {
1586	let v = uint8x16_t::splat(`0xAF`);
1587	assert_eq!(
1588	unload(v),
1589	[
1590	`0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`,
1591	`0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`, `0xAF`
1592	]
1593	);
1594	}
1595	unsafe { test() }
1596	}
1597
/// `is_zero` must be false as soon as a single lane is non-zero and true
/// when all 16 lanes are zero.
#[test]
fn vector_is_zero() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        // One set byte (lane 5) is enough to make the vector non-zero.
        let mut bytes = [0u8; 16];
        bytes[5] = 1;
        let v = load(bytes);
        assert!(!v.is_zero());

        let v = load([0u8; 16]);
        assert!(v.is_zero());
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1609
/// `cmpeq` must produce `0xFF` in every lane where both operands hold the
/// same byte and `0` everywhere else. Only the last lane matches here.
#[test]
fn vector_cmpeq() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1,
        ]);
        let rhs = load([
            16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
        ]);
        let mut expected = [0u8; 16];
        expected[15] = 0xFF;
        assert_eq!(unload(lhs.cmpeq(rhs)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1625
/// `and` must compute the bitwise AND of matching lanes:
/// `0b1001 & 0b1010 == 0b1000` in lane 5, zero elsewhere.
#[test]
fn vector_and() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let mut lhs = [0u8; 16];
        lhs[5] = 0b1001;
        let mut rhs = [0u8; 16];
        rhs[5] = 0b1010;
        let mut expected = [0u8; 16];
        expected[5] = 0b1000;

        let (lhs, rhs) = (load(lhs), load(rhs));
        assert_eq!(unload(lhs.and(rhs)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1641
/// `or` must compute the bitwise OR of matching lanes:
/// `0b1001 | 0b1010 == 0b1011` in lane 5, zero elsewhere.
#[test]
fn vector_or() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let mut lhs = [0u8; 16];
        lhs[5] = 0b1001;
        let mut rhs = [0u8; 16];
        rhs[5] = 0b1010;
        let mut expected = [0u8; 16];
        expected[5] = 0b1011;

        let (lhs, rhs) = (load(lhs), load(rhs));
        assert_eq!(unload(lhs.or(rhs)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1657
/// `shift_8bit_lane_right::<2>` must shift every 8-bit lane right by two
/// bits independently: `0b1011 -> 0b0010` and `0b0101 -> 0b0001`.
#[test]
fn vector_shift_8bit_lane_right() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let mut input = [0u8; 16];
        input[4] = 0b1011;
        input[5] = 0b0101;
        let mut expected = [0u8; 16];
        expected[4] = 0b0010;
        expected[5] = 0b0001;

        let shifted = load(input).shift_8bit_lane_right::<2>();
        assert_eq!(unload(shifted), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1672
/// `shift_in_one_byte` must move the bytes of `self` up by one position
/// and fill lane 0 with the last byte of the argument.
#[test]
fn vector_shift_in_one_byte() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let rhs = load([
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 16] = [
            32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        ];
        assert_eq!(unload(lhs.shift_in_one_byte(rhs)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1689
/// `shift_in_two_bytes` must move the bytes of `self` up by two positions
/// and fill lanes 0-1 with the last two bytes of the argument.
#[test]
fn vector_shift_in_two_bytes() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let rhs = load([
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 16] = [
            31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        ];
        assert_eq!(unload(lhs.shift_in_two_bytes(rhs)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1706
/// `shift_in_three_bytes` must move the bytes of `self` up by three
/// positions and fill lanes 0-2 with the last three bytes of the argument.
#[test]
fn vector_shift_in_three_bytes() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let lhs = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let rhs = load([
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        ]);
        let expected: [u8; 16] = [
            30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        ];
        assert_eq!(unload(lhs.shift_in_three_bytes(rhs)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1723
/// `shuffle_bytes` must fill every result lane with the byte of `self`
/// found at the position named by the matching lane of the index vector.
#[test]
fn vector_shuffle_bytes() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let table = load([
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        ]);
        let indices = load([
            0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
        ]);
        let expected: [u8; 16] = [
            1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
        ];
        assert_eq!(unload(table.shuffle_bytes(indices)), expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1739
/// `for_each_64bit_lane` must hand the callback both 64-bit lanes of a
/// 16-byte vector together with their indices. The first byte of a lane
/// ends up as the least significant byte of the reported `u64`. The
/// callback always returns `None`, and both lanes are still visited.
#[test]
fn vector_for_each_64bit_lane() {
    #[target_feature(enable = "neon")]
    unsafe fn check() {
        let bytes: [u8; 16] = [
            0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
            0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
        ];
        let expected: [u64; 2] = [0x0807060504030201, 0x100F0E0D0C0B0A09];
        let mut seen = [0u64; 2];
        load(bytes).for_each_64bit_lane(|index, value| {
            seen[index] = value;
            None::<()>
        });
        assert_eq!(seen, expected);
    }
    // SAFETY: this module is only compiled when the `neon` target feature
    // is enabled.
    unsafe { check() }
}
1757	}
1758