vector.rs - Codebrowser

1	// NOTE: The descriptions for each of the vector methods on the traits below
2	// are pretty inscrutable. For this reason, there are tests for every method
3	// for every trait impl below. If you're confused about what an op does,
4	// consult its test. (They probably should be doc tests, but I couldn't figure
5	// out how to write them in a non-annoying way.)
6
7	use core::{
8	fmt::Debug,
9	panic::{RefUnwindSafe, UnwindSafe},
10	};
11
/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to low level vector operations needed for
/// the specific algorithms used in this crate. In general, it was invented
/// mostly to be generic over x86's `__m128i` and `__m256i` types. At time of
/// writing, it also supports wasm and aarch64 128-bit vector types as well.
///
/// # Safety
///
/// All methods are not safe since they are intended to be implemented using
/// vendor intrinsics, which are also not safe. Callers must ensure that
/// the appropriate target features are enabled in the calling function,
/// and that the current CPU supports them. All implementations should
/// avoid marking the routines with `#[target_feature]` and instead mark
/// them as `#[inline(always)]` to ensure they get appropriately inlined.
/// (`inline(always)` cannot be used with `target_feature`.)
pub(crate) trait Vector:
    Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
{
    /// The number of bits in the vector.
    const BITS: usize;
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;

    /// Create a vector with 8-bit lanes with the given byte repeated into each
    /// lane.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// Returns true if and only if this vector has zero in all of its lanes.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn is_zero(self) -> bool;

    /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
    /// vector and the one given, then lane `i` in the resulting vector is set
    /// to `0xFF`. Otherwise, it is set to `0x00`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn cmpeq(self, vector2: Self) -> Self;

    /// Perform a bitwise 'and' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn and(self, vector2: Self) -> Self;

    /// Perform a bitwise 'or' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn or(self, vector2: Self) -> Self;

    /// Shift each 8-bit lane in this vector to the right by the number of
    /// bits indicated by the `BITS` type parameter.
    ///
    /// For example, shifting lanes holding `0b1011` and `0b0101` right by
    /// two bits yields `0b0010` and `0b0001`, respectively.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;

    /// Shift this vector to the left by one byte and shift the most
    /// significant byte of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 1` bytes.
    ///
    /// For example, with 16-byte vectors where `self` holds the bytes
    /// `1..=16` and `vector2` holds `17..=32`, the result holds
    /// `32, 1, 2, ..., 15`.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last byte
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;

    /// Shift this vector to the left by two bytes and shift the two most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 2` bytes.
    ///
    /// For example, with 16-byte vectors where `self` holds the bytes
    /// `1..=16` and `vector2` holds `17..=32`, the result holds
    /// `31, 32, 1, 2, ..., 14`.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last two bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Shift this vector to the left by three bytes and shift the three most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 3` bytes.
    ///
    /// For example, with 16-byte vectors where `self` holds the bytes
    /// `1..=16` and `vector2` holds `17..=32`, the result holds
    /// `30, 31, 32, 1, 2, ..., 13`.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last three bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Shuffles the bytes in this vector according to the indices in each of
    /// the corresponding lanes in `indices`.
    ///
    /// If `i` is the index of corresponding lanes, `A` is this vector, `B` is
    /// indices and `C` is the resulting vector, then `C[i] = A[B[i]]`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shuffle_bytes(self, indices: Self) -> Self;

    /// Call the provided function for each 64-bit lane in this vector. The
    /// given function is provided the lane index and lane value as a `u64`.
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
    /// # Notes
    ///
    /// Conceptually it would be nice if we could have a
    /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
    /// tricky given Rust's [current support for const generics][support].
    /// And even if we could, it would be tricky to write generic code over
    /// it. (Not impossible. We could introduce another layer that requires
    /// `AsRef<[u64]>` or something.)
    ///
    /// [support]: https://github.com/rust-lang/rust/issues/60551
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn for_each_64bit_lane<T>(
        self,
        f: impl FnMut(usize, u64) -> Option<T>,
    ) -> Option<T>;
}
207
208	/// This trait extends the `Vector` trait with additional operations to support
209	/// Fat Teddy.
210	///
211	/// Fat Teddy uses 16 buckets instead of 8, but reads half as many bytes (as
212	/// the vector size) instead of the full size of a vector per iteration. For
213	/// example, when using a 256-bit vector, Slim Teddy reads 32 bytes at a timr
214	/// but Fat Teddy reads 16 bytes at a time.
215	///
216	/// Fat Teddy is useful when searching for a large number of literals.
217	/// The extra number of buckets spreads the literals out more and reduces
218	/// verification time.
219	///
220	/// Currently we only implement this for AVX on x86_64. It would be nice to
221	/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
222	/// only reading 8 bytes at a time. It's not clear how well it would work, but
223	/// there are some tricky things to figure out in terms of implementation. The
224	/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
225	/// the trickiest of the bunch. For AVX2, these are implemented by taking
226	/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
227	/// half instead of the full 256-bit vector. (Where as `_mm_alignr_epi8`
228	/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't
229	/// do a careful survey of NEON to see if it could easily support these
230	/// operations.
231	pub(crate) trait FatVector: Vector {
232	type Half: Vector;
233
234	/// Read a half-vector-size number of bytes from the given pointer, and
235	/// broadcast it across both halfs of a full vector. The pointer does not
236	/// need to be aligned.
237	///
238	/// # Safety
239	///
240	/// Callers must ensure that this is okay to call in the current target for
241	/// the current CPU.
242	///
243	/// Callers must guarantee that at least `Self::HALF::BYTES` bytes are
244	/// readable from `data`.
245	unsafe fn load_half_unaligned(data: *const u8) -> Self;
246
247	/// Like `Vector::shift_in_one_byte`, except this is done for each half
248	/// of the vector instead.
249	///
250	/// # Safety
251	///
252	/// Callers must ensure that this is okay to call in the current target for
253	/// the current CPU.
254	unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;
255
256	/// Like `Vector::shift_in_two_bytes`, except this is done for each half
257	/// of the vector instead.
258	///
259	/// # Safety
260	///
261	/// Callers must ensure that this is okay to call in the current target for
262	/// the current CPU.
263	unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;
264
265	/// Like `Vector::shift_in_two_bytes`, except this is done for each half
266	/// of the vector instead.
267	///
268	/// # Safety
269	///
270	/// Callers must ensure that this is okay to call in the current target for
271	/// the current CPU.
272	unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;
273
274	/// Swap the 128-bit lanes in this vector.
275	///
276	/// # Safety
277	///
278	/// Callers must ensure that this is okay to call in the current target for
279	/// the current CPU.
280	unsafe fn swap_halves(self) -> Self;
281
282	/// Unpack and interleave the 8-bit lanes from the low 128 bits of each
283	/// vector and return the result.
284	///
285	/// # Safety
286	///
287	/// Callers must ensure that this is okay to call in the current target for
288	/// the current CPU.
289	unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;
290
291	/// Unpack and interleave the 8-bit lanes from the high 128 bits of each
292	/// vector and return the result.
293	///
294	/// # Safety
295	///
296	/// Callers must ensure that this is okay to call in the current target for
297	/// the current CPU.
298	unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;
299
300	/// Call the provided function for each 64-bit lane in the lower half
301	/// of this vector and then in the other vector. The given function is
302	/// provided the lane index and lane value as a `u64`. (The high 128-bits
303	/// of each vector are ignored.)
304	///
305	/// If `f` returns `Some`, then iteration over the lanes is stopped and the
306	/// value is returned. Otherwise, this returns `None`.
307	///
308	/// # Safety
309	///
310	/// Callers must ensure that this is okay to call in the current target for
311	/// the current CPU.
312	unsafe fn for_each_low_64bit_lane<T>(
313	self,
314	vector2: Self,
315	f: impl FnMut(usize, u64) -> Option<T>,
316	) -> Option<T>;
317	}
318
319	#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
320	mod x86_64_ssse3 {
321	use core::arch::x86_64::*;
322
323	use crate::util::int::{I32, I8};
324
325	use super::Vector;
326
327	impl Vector for __m128i {
328	const BITS: usize = `128`;
329	const BYTES: usize = `16`;
330
331	#[inline(always)]
332	unsafe fn splat(byte: u8) -> __m128i {
333	_mm_set1_epi8(i8::from_bits(byte))
334	}
335
336	#[inline(always)]
337	unsafe fn load_unaligned(data: *const u8) -> __m128i {
338	_mm_loadu_si128(data.cast::<__m128i>())
339	}
340
341	#[inline(always)]
342	unsafe fn is_zero(self) -> bool {
343	let cmp = self.cmpeq(Self::splat(`0`));
344	_mm_movemask_epi8(cmp).to_bits() == `0xFFFF`
345	}
346
347	#[inline(always)]
348	unsafe fn cmpeq(self, vector2: Self) -> __m128i {
349	_mm_cmpeq_epi8(self, vector2)
350	}
351
352	#[inline(always)]
353	unsafe fn and(self, vector2: Self) -> __m128i {
354	_mm_and_si128(self, vector2)
355	}
356
357	#[inline(always)]
358	unsafe fn or(self, vector2: Self) -> __m128i {
359	_mm_or_si128(self, vector2)
360	}
361
362	#[inline(always)]
363	unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
364	// Apparently there is no _mm_srli_epi8, so we emulate it by
365	// shifting 16-bit integers and masking out the high nybble of each
366	// 8-bit lane (since that nybble will contain bits from the low
367	// nybble of the previous lane).
368	let lomask = Self::splat(`0xF`);
369	_mm_srli_epi16(self, BITS).and(lomask)
370	}
371
372	#[inline(always)]
373	unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
374	_mm_alignr_epi8(self, vector2, `15`)
375	}
376
377	#[inline(always)]
378	unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
379	_mm_alignr_epi8(self, vector2, `14`)
380	}
381
382	#[inline(always)]
383	unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
384	_mm_alignr_epi8(self, vector2, `13`)
385	}
386
387	#[inline(always)]
388	unsafe fn shuffle_bytes(self, indices: Self) -> Self {
389	_mm_shuffle_epi8(self, indices)
390	}
391
392	#[inline(always)]
393	unsafe fn for_each_64bit_lane<T>(
394	self,
395	mut f: impl FnMut(usize, u64) -> Option<T>,
396	) -> Option<T> {
397	// We could just use _mm_extract_epi64 here, but that requires
398	// SSE 4.1. It isn't necessarily a problem to just require SSE 4.1,
399	// but everything else works with SSSE3 so we stick to that subset.
400	let lanes: [u64; `2`] = core::mem::transmute(self);
401	if let Some(t) = f(`0`, lanes[`0`]) {
402	return Some(t);
403	}
404	if let Some(t) = f(`1`, lanes[`1`]) {
405	return Some(t);
406	}
407	None
408	}
409	}
410	}
411
412	#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
413	mod x86_64_avx2 {
414	use core::arch::x86_64::*;
415
416	use crate::util::int::{I32, I64, I8};
417
418	use super::{FatVector, Vector};
419
420	impl Vector for __m256i {
421	const BITS: usize = `256`;
422	const BYTES: usize = `32`;
423
424	#[inline(always)]
425	unsafe fn splat(byte: u8) -> __m256i {
426	_mm256_set1_epi8(i8::from_bits(byte))
427	}
428
429	#[inline(always)]
430	unsafe fn load_unaligned(data: *const u8) -> __m256i {
431	_mm256_loadu_si256(data.cast::<__m256i>())
432	}
433
434	#[inline(always)]
435	unsafe fn is_zero(self) -> bool {
436	let cmp = self.cmpeq(Self::splat(`0`));
437	_mm256_movemask_epi8(cmp).to_bits() == `0xFFFFFFFF`
438	}
439
440	#[inline(always)]
441	unsafe fn cmpeq(self, vector2: Self) -> __m256i {
442	_mm256_cmpeq_epi8(self, vector2)
443	}
444
445	#[inline(always)]
446	unsafe fn and(self, vector2: Self) -> __m256i {
447	_mm256_and_si256(self, vector2)
448	}
449
450	#[inline(always)]
451	unsafe fn or(self, vector2: Self) -> __m256i {
452	_mm256_or_si256(self, vector2)
453	}
454
455	#[inline(always)]
456	unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
457	let lomask = Self::splat(`0xF`);
458	_mm256_srli_epi16(self, BITS).and(lomask)
459	}
460
461	#[inline(always)]
462	unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
463	// Credit goes to jneem for figuring this out:
464	// https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
465	//
466	// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
467	// PALIGNR instructions, which is not what we want, so we need to
468	// do some extra shuffling.
469	let v = _mm256_permute2x128_si256(vector2, self, `0x21`);
470	_mm256_alignr_epi8(self, v, `15`)
471	}
472
473	#[inline(always)]
474	unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
475	// Credit goes to jneem for figuring this out:
476	// https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
477	//
478	// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
479	// PALIGNR instructions, which is not what we want, so we need to
480	// do some extra shuffling.
481	let v = _mm256_permute2x128_si256(vector2, self, `0x21`);
482	_mm256_alignr_epi8(self, v, `14`)
483	}
484
485	#[inline(always)]
486	unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
487	// Credit goes to jneem for figuring this out:
488	// https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
489	//
490	// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
491	// PALIGNR instructions, which is not what we want, so we need to
492	// do some extra shuffling.
493	let v = _mm256_permute2x128_si256(vector2, self, `0x21`);
494	_mm256_alignr_epi8(self, v, `13`)
495	}
496
497	#[inline(always)]
498	unsafe fn shuffle_bytes(self, indices: Self) -> Self {
499	_mm256_shuffle_epi8(self, indices)
500	}
501
502	#[inline(always)]
503	unsafe fn for_each_64bit_lane<T>(
504	self,
505	mut f: impl FnMut(usize, u64) -> Option<T>,
506	) -> Option<T> {
507	// NOTE: At one point in the past, I used transmute to this to
508	// get a [u64; 4], but it turned out to lead to worse codegen IIRC.
509	// I've tried it more recently, and it looks like that's no longer
510	// the case. But since there's no difference, we stick with the
511	// slightly more complicated but transmute-free version.
512	let lane = _mm256_extract_epi64(self, `0`).to_bits();
513	if let Some(t) = f(`0`, lane) {
514	return Some(t);
515	}
516	let lane = _mm256_extract_epi64(self, `1`).to_bits();
517	if let Some(t) = f(`1`, lane) {
518	return Some(t);
519	}
520	let lane = _mm256_extract_epi64(self, `2`).to_bits();
521	if let Some(t) = f(`2`, lane) {
522	return Some(t);
523	}
524	let lane = _mm256_extract_epi64(self, `3`).to_bits();
525	if let Some(t) = f(`3`, lane) {
526	return Some(t);
527	}
528	None
529	}
530	}
531
532	impl FatVector for __m256i {
533	type Half = __m128i;
534
535	#[inline(always)]
536	unsafe fn load_half_unaligned(data: *const u8) -> Self {
537	let half = Self::Half::load_unaligned(data);
538	_mm256_broadcastsi128_si256(half)
539	}
540
541	#[inline(always)]
542	unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
543	_mm256_alignr_epi8(self, vector2, `15`)
544	}
545
546	#[inline(always)]
547	unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
548	_mm256_alignr_epi8(self, vector2, `14`)
549	}
550
551	#[inline(always)]
552	unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
553	_mm256_alignr_epi8(self, vector2, `13`)
554	}
555
556	#[inline(always)]
557	unsafe fn swap_halves(self) -> Self {
558	_mm256_permute4x64_epi64(self, `0x4E`)
559	}
560
561	#[inline(always)]
562	unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
563	_mm256_unpacklo_epi8(self, vector2)
564	}
565
566	#[inline(always)]
567	unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
568	_mm256_unpackhi_epi8(self, vector2)
569	}
570
571	#[inline(always)]
572	unsafe fn for_each_low_64bit_lane<T>(
573	self,
574	vector2: Self,
575	mut f: impl FnMut(usize, u64) -> Option<T>,
576	) -> Option<T> {
577	let lane = _mm256_extract_epi64(self, `0`).to_bits();
578	if let Some(t) = f(`0`, lane) {
579	return Some(t);
580	}
581	let lane = _mm256_extract_epi64(self, `1`).to_bits();
582	if let Some(t) = f(`1`, lane) {
583	return Some(t);
584	}
585	let lane = _mm256_extract_epi64(vector2, `0`).to_bits();
586	if let Some(t) = f(`2`, lane) {
587	return Some(t);
588	}
589	let lane = _mm256_extract_epi64(vector2, `1`).to_bits();
590	if let Some(t) = f(`3`, lane) {
591	return Some(t);
592	}
593	None
594	}
595	}
596	}
597
#[cfg(target_arch = "aarch64")]
mod aarch64_neon {
    use core::arch::aarch64::*;

    use super::Vector;

    /// `Vector` implemented for 128-bit NEON vectors of sixteen `u8` lanes.
    impl Vector for uint8x16_t {
        const BITS: usize = 128;
        const BYTES: usize = 16;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> Self {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> Self {
            vld1q_u8(data)
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            // Could also use vmaxvq_u8.
            // ... I tried that and couldn't observe any meaningful difference
            // in benchmarks.
            let pairwise_max = vpmaxq_u8(self, self);
            let as_u64 = vreinterpretq_u64_u8(pairwise_max);
            0 == vgetq_lane_u64(as_u64, 0)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> Self {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> Self {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> Self {
            vorrq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            debug_assert!(BITS <= 7);
            vshrq_n_u8(self, BITS)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            vqtbl1q_u8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            // View the sixteen bytes as two 64-bit lanes and visit them in
            // order, stopping at the first `Some`.
            let wide = vreinterpretq_u64_u8(self);
            let found = f(0, vgetq_lane_u64(wide, 0));
            if found.is_some() {
                return found;
            }
            f(1, vgetq_lane_u64(wide, 1))
        }
    }
}
686
#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod tests_x86_64_ssse3 {
    use core::arch::x86_64::*;

    use super::*;

    /// Returns true when the current CPU supports SSSE3, which is what every
    /// test in this module enables.
    fn is_runnable() -> bool {
        std::is_x86_feature_detected!("ssse3")
    }

    /// Builds a vector from the 16 given bytes.
    #[target_feature(enable = "ssse3")]
    unsafe fn load(lanes: [u8; 16]) -> __m128i {
        __m128i::load_unaligned(lanes.as_ptr())
    }

    /// Copies the 16 bytes of the given vector out into an array.
    ///
    /// This uses an unaligned store, which only needs SSE2, rather than
    /// `_mm_extract_epi8`, which needs SSE 4.1. The tests in this module
    /// only check for (and enable) SSSE3.
    #[target_feature(enable = "ssse3")]
    unsafe fn unload(v: __m128i) -> [u8; 16] {
        let mut lanes = [0u8; 16];
        _mm_storeu_si128(lanes.as_mut_ptr().cast::<__m128i>(), v);
        lanes
    }

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = __m128i::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(!v.is_zero());
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(v.is_zero());
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
            let v2 =
                load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.and(v2)),
                [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.or(v2)),
                [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 =
                load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut lanes = [0u64; 2];
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }
}
935
// Tests for the `Vector` and `FatVector` operations on `__m256i`.
//
// The module is compiled for any x86_64 target that has SSE2, and AVX2
// support is checked at runtime by `is_runnable`. When the CPU lacks AVX2,
// each test returns early and therefore passes without exercising anything.
#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod tests_x86_64_avx2 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, U32};

    use super::*;

    /// Reports whether the CPU running the tests supports AVX2.
    fn is_runnable() -> bool {
        std::is_x86_feature_detected!("avx2")
    }

    /// Reads the 32 bytes of `lanes` into a vector via `load_unaligned`.
    #[target_feature(enable = "avx2")]
    unsafe fn load(lanes: [u8; 32]) -> __m256i {
        __m256i::load_unaligned(lanes.as_ptr())
    }

    /// Reads the 16 bytes of `lanes` into a vector via
    /// `load_half_unaligned`.
    ///
    /// Judging by the expectations in the `fat_vector_half_*` tests below,
    /// the same 16 bytes show up in both 128-bit halves of the result.
    #[target_feature(enable = "avx2")]
    unsafe fn load_half(lanes: [u8; 16]) -> __m256i {
        __m256i::load_half_unaligned(lanes.as_ptr())
    }

    /// Copies the 32 byte lanes of `v` out into an array, lane 0 first.
    #[target_feature(enable = "avx2")]
    unsafe fn unload(v: __m256i) -> [u8; 32] {
        // The lane index given to `_mm256_extract_epi8` must be a constant,
        // so a loop is not an option. This macro just keeps the 32
        // extractions compact.
        macro_rules! byte {
            ($idx:literal) => {
                _mm256_extract_epi8(v, $idx).to_bits().low_u8()
            };
        }
        [
            byte!(0), byte!(1), byte!(2), byte!(3),
            byte!(4), byte!(5), byte!(6), byte!(7),
            byte!(8), byte!(9), byte!(10), byte!(11),
            byte!(12), byte!(13), byte!(14), byte!(15),
            byte!(16), byte!(17), byte!(18), byte!(19),
            byte!(20), byte!(21), byte!(22), byte!(23),
            byte!(24), byte!(25), byte!(26), byte!(27),
            byte!(28), byte!(29), byte!(30), byte!(31),
        ]
    }

    // `splat` puts the given byte into every one of the 32 lanes.
    #[test]
    fn vector_splat() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let splatted = __m256i::splat(0xAF);
            assert_eq!(unload(splatted), [0xAF; 32]);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `is_zero` is false as soon as a single lane is non-zero, and true
    // when every lane is zero.
    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let mut bytes = [0u8; 32];
            bytes[5] = 1;
            let v = load(bytes);
            assert!(!v.is_zero());

            let v = load([0; 32]);
            assert!(v.is_zero());
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `cmpeq` yields 0xFF in lanes where both inputs agree and 0 elsewhere.
    // The two inputs below only agree in the final lane.
    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let ascending = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
            ]);
            let descending = load([
                32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
                16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
            ]);
            let mut expected = [0u8; 32];
            expected[31] = 0xFF;
            assert_eq!(unload(ascending.cmpeq(descending)), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `and` is a lane-wise bitwise AND: 0b1001 & 0b1010 == 0b1000.
    #[test]
    fn vector_and() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let mut lhs = [0u8; 32];
            lhs[5] = 0b1001;
            let mut rhs = [0u8; 32];
            rhs[5] = 0b1010;
            let mut expected = [0u8; 32];
            expected[5] = 0b1000;

            let combined = load(lhs).and(load(rhs));
            assert_eq!(unload(combined), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `or` is a lane-wise bitwise OR: 0b1001 | 0b1010 == 0b1011.
    #[test]
    fn vector_or() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let mut lhs = [0u8; 32];
            lhs[5] = 0b1001;
            let mut rhs = [0u8; 32];
            rhs[5] = 0b1010;
            let mut expected = [0u8; 32];
            expected[5] = 0b1011;

            let combined = load(lhs).or(load(rhs));
            assert_eq!(unload(combined), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `shift_8bit_lane_right::<2>` shifts each byte right by two bits
    // independently: 0b1011 becomes 0b0010 and 0b0101 becomes 0b0001.
    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let mut bytes = [0u8; 32];
            bytes[4] = 0b1011;
            bytes[5] = 0b0101;
            let mut expected = [0u8; 32];
            expected[4] = 0b0010;
            expected[5] = 0b0001;

            let shifted = load(bytes).shift_8bit_lane_right::<2>();
            assert_eq!(unload(shifted), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `v1.shift_in_one_byte(v2)` moves the bytes of `v1` up by one lane and
    // fills lane 0 with the last byte of `v2`. The last byte of `v1` is
    // dropped.
    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let current = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let previous = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
            ]);
            let expected: [u8; 32] = [
                64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            ];
            assert_eq!(unload(current.shift_in_one_byte(previous)), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // Like `shift_in_one_byte`, but the last two bytes of `v2` fill lanes 0
    // and 1.
    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let current = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let previous = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
            ]);
            let expected: [u8; 32] = [
                63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
            ];
            assert_eq!(unload(current.shift_in_two_bytes(previous)), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // Like `shift_in_one_byte`, but the last three bytes of `v2` fill lanes
    // 0 through 2.
    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let current = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let previous = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
            ]);
            let expected: [u8; 32] = [
                62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
            ];
            assert_eq!(
                unload(current.shift_in_three_bytes(previous)),
                expected
            );
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `v1.shuffle_bytes(v2)` treats the bytes of `v2` as indices: with the
    // inputs below, output lane `i` holds the byte of `v1` found at
    // position `v2[i]`.
    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let table = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let indices = load([
                0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
                16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
            ]);
            let expected: [u8; 32] = [
                1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
                17, 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29, 29,
            ];
            assert_eq!(unload(table.shuffle_bytes(indices)), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `for_each_64bit_lane` hands the callback a lane index together with
    // that lane's 8 bytes packed into a `u64`. The first byte of each lane
    // ends up in the least significant position.
    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            const EXPECTED: [u64; 4] = [
                0x0807_0605_0403_0201,
                0x100F_0E0D_0C0B_0A09,
                0x1817_1615_1413_1211,
                0x201F_1E1D_1C1B_1A19,
            ];

            let input = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
                0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20,
            ]);
            let mut seen = [0u64; 4];
            // Returning `None` keeps the iteration going through all lanes.
            input.for_each_64bit_lane(|index, value| {
                seen[index] = value;
                None::<()>
            });
            assert_eq!(seen, EXPECTED);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // With both operands built by `load_half`, `half_shift_in_one_byte`
    // produces, in each 128-bit half, the 16 bytes of `v1` moved up by one
    // lane with the last byte of `v2` in front.
    #[test]
    fn fat_vector_half_shift_in_one_byte() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let current = load_half([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let previous = load_half([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 32] = [
                32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            ];
            assert_eq!(
                unload(current.half_shift_in_one_byte(previous)),
                expected
            );
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // Same as `half_shift_in_one_byte`, but two bytes of `v2` are shifted
    // in at the front of each half.
    #[test]
    fn fat_vector_half_shift_in_two_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let current = load_half([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let previous = load_half([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 32] = [
                31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
            ];
            assert_eq!(
                unload(current.half_shift_in_two_bytes(previous)),
                expected
            );
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // Same as `half_shift_in_one_byte`, but three bytes of `v2` are shifted
    // in at the front of each half.
    #[test]
    fn fat_vector_half_shift_in_three_bytes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let current = load_half([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let previous = load_half([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 32] = [
                30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
            ];
            assert_eq!(
                unload(current.half_shift_in_three_bytes(previous)),
                expected
            );
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `swap_halves` exchanges the low and high 128-bit halves.
    #[test]
    fn fat_vector_swap_halves() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let input = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 32] = [
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ];
            assert_eq!(unload(input.swap_halves()), expected);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `interleave_low_8bit_lanes` works on each 128-bit half separately: it
    // alternates the low 8 bytes of that half of `v1` with the low 8 bytes
    // of the same half of `v2`.
    #[test]
    fn fat_vector_interleave_low_8bit_lanes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let first = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let second = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
            ]);
            let expected: [u8; 32] = [
                1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
                17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56,
            ];
            assert_eq!(
                unload(first.interleave_low_8bit_lanes(second)),
                expected
            );
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `interleave_high_8bit_lanes` is the counterpart of the low variant:
    // within each 128-bit half it alternates the high 8 bytes of `v1` with
    // the high 8 bytes of `v2`.
    #[test]
    fn fat_vector_interleave_high_8bit_lanes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let first = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let second = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
            ]);
            let expected: [u8; 32] = [
                9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48,
                25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 32, 64,
            ];
            assert_eq!(
                unload(first.interleave_high_8bit_lanes(second)),
                expected
            );
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }

    // `v1.for_each_low_64bit_lane(v2, f)` visits four lanes in total: the
    // two 64-bit lanes from the low 128 bits of `v1` (indices 0 and 1),
    // then the two from the low 128 bits of `v2` (indices 2 and 3).
    #[test]
    fn fat_vector_for_each_low_64bit_lane() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            const EXPECTED: [u64; 4] = [
                0x0807_0605_0403_0201,
                0x100F_0E0D_0C0B_0A09,
                0x2827_2625_2423_2221,
                0x302F_2E2D_2C2B_2A29,
            ];

            let first = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
                0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20,
            ]);
            let second = load([
                0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
                0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30,
                0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
                0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40,
            ]);
            let mut seen = [0u64; 4];
            // Returning `None` keeps the iteration going through all lanes.
            first.for_each_low_64bit_lane(second, |index, value| {
                seen[index] = value;
                None::<()>
            });
            assert_eq!(seen, EXPECTED);
        }
        if is_runnable() {
            // SAFETY: `is_runnable` just confirmed AVX2 is available.
            unsafe { test() }
        }
    }
}
1468
// Tests for the `Vector` operations on `uint8x16_t`.
//
// The module only exists when NEON is enabled at compile time, so unlike
// the x86_64 tests there is no runtime feature check before calling into
// the `#[target_feature(enable = "neon")]` helpers.
#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
mod tests_aarch64_neon {
    use core::arch::aarch64::*;

    use super::*;

    /// Reads the 16 bytes of `lanes` into a vector via `load_unaligned`.
    #[target_feature(enable = "neon")]
    unsafe fn load(lanes: [u8; 16]) -> uint8x16_t {
        uint8x16_t::load_unaligned(lanes.as_ptr())
    }

    /// Copies the 16 byte lanes of `v` out into an array, lane 0 first.
    #[target_feature(enable = "neon")]
    unsafe fn unload(v: uint8x16_t) -> [u8; 16] {
        // The lane index given to `vgetq_lane_u8` must be a constant, so a
        // loop is not an option. This macro just keeps the 16 extractions
        // compact.
        macro_rules! byte {
            ($idx:literal) => {
                vgetq_lane_u8(v, $idx)
            };
        }
        [
            byte!(0), byte!(1), byte!(2), byte!(3),
            byte!(4), byte!(5), byte!(6), byte!(7),
            byte!(8), byte!(9), byte!(10), byte!(11),
            byte!(12), byte!(13), byte!(14), byte!(15),
        ]
    }

    // Example functions. These don't test the Vector traits, but rather,
    // specific NEON instructions. They are basically little experiments I
    // wrote to figure out what an instruction does since their descriptions
    // are so dense. I decided to keep the experiments around as example
    // tests in case they're useful.

    // `vmaxvq_u8` reduces the whole vector to its largest byte.
    #[test]
    fn example_vmaxvq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let mut bytes = [0u8; 16];
            bytes[5] = 1;
            let v = load(bytes);
            assert_eq!(vmaxvq_u8(v), 1);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { example() }
    }

    // For an all-zero vector the largest byte is 0.
    #[test]
    fn example_vmaxvq_u8_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0; 16]);
            assert_eq!(vmaxvq_u8(v), 0);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { example() }
    }

    // `vpmaxq_u8(a, b)` takes the maximum of each adjacent pair of bytes.
    // The 8 results from `a` land in lanes 0-7 and the 8 results from `b`
    // in lanes 8-15. Here the non-zero byte sits in the third pair, so it
    // shows up at lane 2 and again at lane 10.
    #[test]
    fn example_vpmaxq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let mut bytes = [0u8; 16];
            bytes[5] = 1;
            let v = load(bytes);
            let pairwise = vpmaxq_u8(v, v);
            let expected: [u8; 16] = [
                0, 0, 1, 0, 0, 0, 0, 0,
                0, 0, 1, 0, 0, 0, 0, 0,
            ];
            assert_eq!(unload(pairwise), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { example() }
    }

    // With an ascending input, each pair's maximum is its second element,
    // and passing the same vector twice repeats the 8 results.
    #[test]
    fn example_vpmaxq_u8_self() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let pairwise = vpmaxq_u8(v, v);
            let expected: [u8; 16] = [
                2, 4, 6, 8, 10, 12, 14, 16,
                2, 4, 6, 8, 10, 12, 14, 16,
            ];
            assert_eq!(unload(pairwise), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { example() }
    }

    // With two different operands, the low 8 lanes come from pairs of the
    // first operand and the high 8 lanes from pairs of the second.
    #[test]
    fn example_vpmaxq_u8_other() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let first = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let second = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let pairwise = vpmaxq_u8(first, second);
            let expected: [u8; 16] = [
                2, 4, 6, 8, 10, 12, 14, 16,
                18, 20, 22, 24, 26, 28, 30, 32,
            ];
            assert_eq!(unload(pairwise), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { example() }
    }

    // Now we test the actual methods on the Vector trait.

    // `splat` puts the given byte into every one of the 16 lanes.
    #[test]
    fn vector_splat() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let splatted = uint8x16_t::splat(0xAF);
            assert_eq!(unload(splatted), [0xAF; 16]);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `is_zero` is false as soon as a single lane is non-zero, and true
    // when every lane is zero.
    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let mut bytes = [0u8; 16];
            bytes[5] = 1;
            let v = load(bytes);
            assert!(!v.is_zero());

            let v = load([0; 16]);
            assert!(v.is_zero());
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `cmpeq` yields 0xFF in lanes where both inputs agree and 0 elsewhere.
    // The two inputs below only agree in the final lane.
    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let ascending = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1,
            ]);
            let descending = load([
                16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
            ]);
            let mut expected = [0u8; 16];
            expected[15] = 0xFF;
            assert_eq!(unload(ascending.cmpeq(descending)), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `and` is a lane-wise bitwise AND: 0b1001 & 0b1010 == 0b1000.
    #[test]
    fn vector_and() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let mut lhs = [0u8; 16];
            lhs[5] = 0b1001;
            let mut rhs = [0u8; 16];
            rhs[5] = 0b1010;
            let mut expected = [0u8; 16];
            expected[5] = 0b1000;

            let combined = load(lhs).and(load(rhs));
            assert_eq!(unload(combined), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `or` is a lane-wise bitwise OR: 0b1001 | 0b1010 == 0b1011.
    #[test]
    fn vector_or() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let mut lhs = [0u8; 16];
            lhs[5] = 0b1001;
            let mut rhs = [0u8; 16];
            rhs[5] = 0b1010;
            let mut expected = [0u8; 16];
            expected[5] = 0b1011;

            let combined = load(lhs).or(load(rhs));
            assert_eq!(unload(combined), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `shift_8bit_lane_right::<2>` shifts each byte right by two bits
    // independently: 0b1011 becomes 0b0010 and 0b0101 becomes 0b0001.
    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let mut bytes = [0u8; 16];
            bytes[4] = 0b1011;
            bytes[5] = 0b0101;
            let mut expected = [0u8; 16];
            expected[4] = 0b0010;
            expected[5] = 0b0001;

            let shifted = load(bytes).shift_8bit_lane_right::<2>();
            assert_eq!(unload(shifted), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `v1.shift_in_one_byte(v2)` moves the bytes of `v1` up by one lane and
    // fills lane 0 with the last byte of `v2`. The last byte of `v1` is
    // dropped.
    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let current = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let previous = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 16] =
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
            assert_eq!(unload(current.shift_in_one_byte(previous)), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // Like `shift_in_one_byte`, but the last two bytes of `v2` fill lanes 0
    // and 1.
    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let current = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let previous = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 16] =
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
            assert_eq!(unload(current.shift_in_two_bytes(previous)), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // Like `shift_in_one_byte`, but the last three bytes of `v2` fill lanes
    // 0 through 2.
    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let current = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let previous = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let expected: [u8; 16] =
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
            assert_eq!(
                unload(current.shift_in_three_bytes(previous)),
                expected
            );
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `v1.shuffle_bytes(v2)` treats the bytes of `v2` as indices: output
    // lane `i` holds the byte of `v1` found at position `v2[i]`.
    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let table = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            ]);
            let indices = load([
                0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
            ]);
            let expected: [u8; 16] =
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13];
            assert_eq!(unload(table.shuffle_bytes(indices)), expected);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }

    // `for_each_64bit_lane` hands the callback a lane index together with
    // that lane's 8 bytes packed into a `u64`. The first byte of each lane
    // ends up in the least significant position.
    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            const EXPECTED: [u64; 2] =
                [0x0807_0605_0403_0201, 0x100F_0E0D_0C0B_0A09];

            let input = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut seen = [0u64; 2];
            // Returning `None` keeps the iteration going through all lanes.
            input.for_each_64bit_lane(|index, value| {
                seen[index] = value;
                None::<()>
            });
            assert_eq!(seen, EXPECTED);
        }
        // SAFETY: this module is only compiled when NEON is enabled.
        unsafe { test() }
    }
}
1753