ascii.rs source code [crates/encoding_rs/src/ascii.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	// It's assumed that in due course Rust will have explicit SIMD but will not
11	// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
12	// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
13	// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
14	// mess. Under the circumstances, it seems to make sense to optimize the ALU
15	// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
16	// numbers of the actual ARMv7 CPU I have access to, because (thermal?)
17	// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
18	// ARMv7 code) produced reproducible performance numbers, that's the ARM
19	// computer that this code ended up being optimized for in the ALU case.
20	// Less popular CPU architectures simply get the approach that was chosen based
21	// on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
22	// different approaches based on benchmarking on Raspberry Pi 3.
23
24	#[cfg(all(
25	feature = "simd-accel",
26	any(
27	target_feature = "sse2",
28	all(target_endian = "little", target_arch = "aarch64"),
29	all(target_endian = "little", target_feature = "neon")
30	)
31	))]
32	use crate::simd_funcs::*;
33
34	cfg_if! {
35	if #[cfg(feature = "simd-accel")] {
36	#[allow(unused_imports)]
37	use ::core::intrinsics::unlikely;
38	#[allow(unused_imports)]
39	use ::core::intrinsics::likely;
40	} else {
41	#[allow(dead_code)]
42	#[inline(always)]
43	fn unlikely(b: bool) -> bool {
44	b
45	}
46	#[allow(dead_code)]
47	#[inline(always)]
48	fn likely(b: bool) -> bool {
49	b
50	}
51	}
52	}
53
// Safety invariants for masks: `data & mask == 0` for valid ASCII (byte
// lanes) or Basic Latin UTF-16 (16-bit lanes).

/// Has the high bit of every byte of a `usize` set; a word whose bytes are
/// all <= 127 ANDs to zero with this mask.
// The literal is 64 bits wide; `as` truncates, so this works on 32-bit, too.
#[allow(dead_code)]
pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;

/// Has the upper nine bits of every 16-bit lane of a `usize` set; a word
/// whose 16-bit lanes are all <= 127 ANDs to zero with this mask.
// The literal is 64 bits wide; `as` truncates, so this works on 32-bit, too.
#[allow(dead_code)]
pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
63
#[allow(unused_macros)]
macro_rules! ascii_naive {
    ($name:ident, $src_unit:ty, $dst_unit:ty) => {
        /// Copies code units one at a time from `src` to `dst`, converting
        /// each with `as`, until `len` units have been copied or a unit
        /// greater than 127 is found.
        ///
        /// Returns `None` when all `len` units were copied. Otherwise returns
        /// `Some((unit, index))`, where `unit` is the first unit > 127 and
        /// `index` is its position in `src`; units before `index` have been
        /// written to `dst`.
        ///
        /// Safety: src and dst must have len_unit elements and be aligned
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            // Yes, manually omitting the bound check here matters
            // a lot for perf.
            for i in 0..len {
                // Safety: len invariant used here
                let code_unit = *(src.add(i));
                // Safety: Upholds safety-usable invariant here
                if code_unit > 127 {
                    return Some((code_unit, i));
                }
                // Safety: len invariant used here
                *(dst.add(i)) = code_unit as $dst_unit;
            }
            None
        }
    };
}
92
#[allow(unused_macros)]
macro_rules! ascii_alu {
    ($name:ident,
     // Safety invariant: src/dst MUST be u8
     $src_unit:ty,
     $dst_unit:ty,
     // Safety invariant: stride_fn must consume and produce two usizes, and
     // return the index of the first non-ASCII byte when it fails
     $stride_fn:ident) => {
        /// Copies ASCII from `src` to `dst`. When both pointers are equally
        /// misaligned relative to `usize` and the input is long enough, the
        /// bulk of the work is delegated to the stride function, which works
        /// on `ALU_STRIDE_SIZE` units at a time; everything else is handled
        /// one unit at a time.
        ///
        /// Returns `None` when all `len` units were ASCII. Otherwise returns
        /// `Some((unit, index))` for the first unit > 127 and its offset in
        /// `src`.
        ///
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch
                // until we are aligned to usize
                let mut until_alignment = {
                    // Both units have the same size, so src and dst can only
                    // reach usize alignment together if they start out
                    // equally misaligned.
                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
                    if src_alignment != dst_alignment {
                        // Safety: bails early and ends up in the naïve branch where
                        // usize-alignment doesn't matter
                        break;
                    }
                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    //
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            // Safety: Upholds safety-usable invariant here
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        // Safety: num_ascii is known to be a byte index of a non-ascii byte
                        // due to stride_fn's invariant
                        if let Some(num_ascii) = $stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            offset += num_ascii;
                            // Safety: Upholds safety-usable invariant here by indexing into
                            // non-ascii byte
                            return Some((*(src.add(offset)), offset));
                        }
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as
                        // long as `offset <= len - stride`, which means there are `stride`
                        // bytes to still be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }

            // Safety: This is the naïve code, same as ascii_naive, and has no requirements
            // other than src/dst being valid for the right lens
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: Upholds safety-usable invariant here
                    return Some((code_unit, offset));
                }
                // Safety: len invariant used here
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
212
#[allow(unused_macros)]
macro_rules! basic_latin_alu {
    ($name:ident,
     // Safety invariant: use u8 for src/dest for ascii, and u16 for basic_latin
     $src_unit:ty,
     $dst_unit:ty,
     // Safety invariant: stride function must munch
     // ALU_STRIDE_SIZE * size(src_unit) bytes off of src and
     // write ALU_STRIDE_SIZE * size(dst_unit) bytes to dst
     $stride_fn:ident) => {
        /// Converts between ASCII and Basic Latin (UTF-16 code units <= 127),
        /// in whichever direction the unit types dictate. When aligning the
        /// narrower side to `usize` also aligns the wider side and the input
        /// is long enough, the stride function handles `ALU_STRIDE_SIZE`
        /// units at a time for as long as it returns `true`; everything else
        /// is handled one unit at a time.
        ///
        /// Returns `None` when all `len` units were <= 127. Otherwise returns
        /// `Some((unit, index))` for the first unit > 127 and its offset in
        /// `src`.
        ///
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a code unit that is > 127.
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of units we need to munch from
                // src/dest until we are aligned to usize.
                // We ensure basic-latin has the same alignment as ascii, starting with ascii
                // since it is smaller.
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // ascii_to_basic_latin
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        // wrapping_add: only the resulting address is inspected, so the
                        // pointer is never required to be in bounds here.
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // basic_latin_to_ascii
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    //
                    // Safety: This is the naïve code once again, for `until_alignment` units
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            // Safety: Upholds safety-usable invariant here
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of units copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if !$stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            // The stride did not succeed; the unit-at-a-time loop below
                            // re-examines it starting at `offset`.
                            break;
                        }
                        // Safety: offset continues to be the number of units copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as
                        // long as `offset <= len - stride`, which means there are `stride`
                        // units to still be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Safety: This is the naïve code once again, for leftover units
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: Upholds safety-usable invariant here
                    return Some((code_unit, offset));
                }
                // Safety: len invariant used here
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
340
#[allow(unused_macros)]
macro_rules! latin1_alu {
    // Safety invariant: stride function must munch
    // ALU_STRIDE_SIZE * size(src_unit) bytes off of src and
    // write ALU_STRIDE_SIZE * size(dst_unit) bytes to dst
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
        /// Converts `len` units from `src` to `dst` with an `as` cast per
        /// unit and no validity check. When aligning the narrower side to
        /// `usize` also aligns the wider side and the input is long enough,
        /// the stride function handles `ALU_STRIDE_SIZE` units at a time;
        /// everything else is handled one unit at a time.
        ///
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of units we need to munch from
                // src/dest until we are aligned to usize.
                // We ensure the UTF-16 side has the same alignment as the Latin-1 side,
                // starting with Latin-1 since it is smaller.
                let mut until_alignment = {
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // unpack
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        // wrapping_add: only the resulting address is inspected, so the
                        // pointer is never required to be in bounds here.
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // pack
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Safety: This is the naïve code once again, for `until_alignment` units
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of units copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        $stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        );
                        // Safety: offset continues to be the number of units copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as
                        // long as `offset <= len - stride`, which means there are `stride`
                        // units to still be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Safety: This is the naïve code once again, for leftover units
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
426
#[allow(unused_macros)]
macro_rules! ascii_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Converts ASCII from `src` to `dst`, `SIMD_STRIDE_SIZE` units at a
        /// time, using the stride function that matches the alignment of the
        /// two pointers. Once a stride function returns `false`, or fewer than
        /// `SIMD_STRIDE_SIZE` units remain, the rest is handled one unit at a
        /// time.
        ///
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the
        /// first element in the Some being guaranteed to be non-ASCII (> 127), and the second
        /// being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least
            // `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // XXX Should we first process one stride unconditionally as unaligned to
                // avoid the cost of the branchiness below if the first stride fails anyway?
                // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
                // on Haswell, it would make sense to just use unaligned and not bother
                // checking. Need to benchmark older architectures before deciding.
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                // Safety: checking whether src is aligned
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    // Safety: Checking whether dst is aligned
                    if dst_masked == 0 {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and
                            // have the appropriate alignments
                            if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we
                            // always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and
                            // have the appropriate alignments
                            if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we
                            // always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else if dst_masked == 0 {
                    loop {
                        // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and
                        // have the appropriate alignments
                        if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
                            break;
                        }
                        offset += SIMD_STRIDE_SIZE;
                        // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we
                        // always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                } else {
                    loop {
                        // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and
                        // have the appropriate alignments
                        if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                            break;
                        }
                        offset += SIMD_STRIDE_SIZE;
                        // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we
                        // always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Unit-at-a-time tail; also re-examines a stride that was rejected above.
            while offset < len {
                // Safety: uses len invariant here and below
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: upholds safety-usable invariant
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
532
#[allow(unused_macros)]
macro_rules! ascii_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident,
        // Safety: This function must require aligned src/dest that are valid for
        // reading/writing 2 * SIMD_STRIDE_SIZE src_unit/dst_unit
        $double_stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for
        // reading/writing 2 * SIMD_STRIDE_SIZE src_unit/dst_unit
        $double_stride_src_aligned:ident
    ) => {
        /// Converts ASCII from `src` to `dst`. One unaligned stride is tried
        /// first; if it succeeds and enough input remains, `src` is brought to
        /// SIMD alignment and the input is consumed two strides at a time.
        /// Whatever is left (including a stride that a stride function
        /// rejected) is handled one unit at a time.
        ///
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the
        /// first element in the Some being guaranteed to be non-ASCII (> 127), and the second
        /// being the offset where it is found
        // Note: this attribute used to sit at the end of the doc line above,
        // which turned it into comment text and silently disabled it.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // This loop is only broken out of as a goto forward without
            // actually looping
            'outer: loop {
                // Safety: if this check succeeds we're valid for reading/writing at least
                // `SIMD_STRIDE_SIZE` elements.
                if SIMD_STRIDE_SIZE <= len {
                    // First, process one unaligned
                    // Safety: this is safe to call since we're valid for this read/write
                    if !$stride_neither_aligned(src, dst) {
                        break 'outer;
                    }
                    offset = SIMD_STRIDE_SIZE;

                    // We have now seen 16 ASCII bytes. Let's guess that
                    // there will be enough more to justify more expense
                    // in the case of non-ASCII.
                    // Use aligned reads for the sake of old microarchitectures.
                    //
                    // Safety: this correctly calculates the number of src_units that need to be
                    // read before the remaining list is aligned.
                    // This is less than SIMD_ALIGNMENT, which is also SIMD_STRIDE_SIZE (as
                    // documented)
                    let until_alignment = ((SIMD_ALIGNMENT
                        - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
                        & SIMD_ALIGNMENT_MASK)
                        / unit_size;
                    // Safety: This addition won't overflow, because even in the 32-bit PAE case
                    // the address space holds enough code that the slice length can't be that
                    // close to address space size.
                    // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                    //
                    // Safety: if this check succeeds we're valid for reading/writing at least
                    // `2 * SIMD_STRIDE_SIZE` elements plus `until_alignment`.
                    // The extra SIMD_STRIDE_SIZE in the condition is because `offset` is already
                    // `SIMD_STRIDE_SIZE`.
                    if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                        if until_alignment != 0 {
                            // Safety: this is safe to call since we're valid for this read/write
                            // (and more), and don't care about alignment.
                            // This will copy over bytes that get decoded twice since it's not
                            // incrementing `offset` by SIMD_STRIDE_SIZE. This is fine.
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break 'outer;
                            }
                            offset += until_alignment;
                        }
                        // Safety: At this point we're valid for reading/writing
                        // 2 * SIMD_STRIDE_SIZE elements
                        // Safety: Now `offset` is aligned for `src`
                        let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                        // Safety: This is whether dst is aligned
                        let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
                        if dst_masked == 0 {
                            loop {
                                // Safety: both are aligned, we can call the aligned function.
                                // We're valid for reading/writing double stride from the initial
                                // condition and the loop break condition below
                                if let Some(advance) =
                                    $double_stride_both_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    // Safety: uses safety-usable invariant on
                                    // ascii_to_ascii_simd_double_stride to return
                                    // guaranteed non-ascii
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which
                                // means we always have at least `2 * SIMD_STRIDE_SIZE` elements
                                // to munch next time.
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            // Safety: We're valid for reading/writing one more, and can still
                            // assume alignment
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        } else {
                            loop {
                                // Safety: only src is aligned here. We're valid for
                                // reading/writing double stride from the initial condition
                                // and the loop break condition below
                                if let Some(advance) =
                                    $double_stride_src_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    // Safety: uses safety-usable invariant on
                                    // ascii_to_ascii_simd_double_stride to return
                                    // guaranteed non-ascii
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which
                                // means we always have at least `2 * SIMD_STRIDE_SIZE` elements
                                // to munch next time.
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            // Safety: We're valid for reading/writing one more, and can still
                            // assume alignment
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    } else {
                        // At most two iterations, so unroll
                        if offset + SIMD_STRIDE_SIZE <= len {
                            // Safety: The check above ensures we're allowed to read/write this,
                            // and we don't use alignment
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break 'outer;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset + SIMD_STRIDE_SIZE <= len {
                                // Safety: The check above ensures we're allowed to read/write
                                // this, and we don't use alignment
                                if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    }
                }
                break 'outer;
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
693
#[allow(unused_macros)]
macro_rules! latin1_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for
        // reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Converts `len` units from `src` to `dst` without any validity
        /// check, `SIMD_STRIDE_SIZE` units at a time using the stride function
        /// that matches the alignment of the two pointers, then finishes the
        /// remainder one unit at a time with an `as` cast.
        ///
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least
            // `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // Whether dst is aligned
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                // Whether src is aligned
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            // Safety: Both were aligned, we can use the aligned function
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in
                            // the next iteration we're valid for reading/writing at least
                            // SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: src was aligned, dst was not
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in
                            // the next iteration we're valid for reading/writing at least
                            // SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else if dst_masked == 0 {
                    loop {
                        // Safety: dst was aligned, src was not
                        $stride_dst_aligned(src.add(offset), dst.add(offset));
                        offset += SIMD_STRIDE_SIZE;
                        // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in
                        // the next iteration we're valid for reading/writing at least
                        // SIMD_STRIDE_SIZE elements.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                } else {
                    loop {
                        // Safety: Neither were aligned
                        $stride_neither_aligned(src.add(offset), dst.add(offset));
                        offset += SIMD_STRIDE_SIZE;
                        // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in
                        // the next iteration we're valid for reading/writing at least
                        // SIMD_STRIDE_SIZE elements.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
779
/// Generates an unsafe Latin1 conversion function (`$src_unit` -> `$dst_unit`
/// via `as`) that first copies scalar units until `src` is SIMD-aligned, then
/// runs two SIMD strides per loop iteration, then at most one more stride, and
/// finally a scalar tail.
///
/// Only `$stride_both_aligned` and `$stride_src_aligned` are called by the
/// generated code; the other two stride parameters are accepted so that the
/// invocation shape matches `latin1_simd_check_align`.
#[allow(unused_macros)]
macro_rules! latin1_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                // Safety: this correctly calculates the number of src_units that need to be read before the remaining list is aligned.
                // This is by definition less than SIMD_STRIDE_SIZE.
                let mut until_alignment = ((SIMD_STRIDE_SIZE
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                while until_alignment != 0 {
                    // Safety: This is a straightforward copy, since until_alignment is < SIMD_STRIDE_SIZE <= len, this is in-bounds
                    *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
                    offset += 1;
                    until_alignment -= 1;
                }
                // Safety: here offset will be the initial `until_alignment`, i.e. enough to align `src`.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // Safety: if this check succeeds we're valid for reading/writing at least `2 * SIMD_STRIDE_SIZE` elements.
                if offset + SIMD_STRIDE_SIZE * 2 <= len {
                    // Cannot underflow: the check above implies `len >= 2 * SIMD_STRIDE_SIZE`.
                    let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
                    // Safety: at this point src is known to be aligned at offset, dst is not.
                    if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
                        loop {
                            // Safety: We checked alignment of dst above, we can use the alignment functions.
                            // We're allowed to read/write `2 * SIMD_STRIDE_SIZE` elements, which we do.
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE`; when it is false we always have
                            // at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: we ensured alignment of src already.
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE`; when it is false we always have
                            // at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    }
                }
                // Safety: `offset <= len - SIMD_STRIDE_SIZE` means a full stride is still in bounds, so we are
                // valid to munch SIMD_STRIDE_SIZE more elements, which we do. `<=` (rather than `<`) lets a
                // remainder of exactly one stride be handled here instead of by the scalar loop below.
                if offset <= len_minus_stride {
                    $stride_src_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                }
            }
            while offset < len {
                // Safety: uses len invariant here and below
                let code_unit = *(src.add(offset));
                // On x86_64, this loop autovectorizes but in the pack
                // case there are instructions whose purpose is to make sure
                // each u16 in the vector is truncated before packing. However,
                // since we don't care about saturating behavior of SSE2 packing
                // when the input isn't Latin1, those instructions are useless.
                // Unfortunately, using the `assume` intrinsic to lie to the
                // optimizer doesn't make LLVM omit the truncation that we
                // don't need. Possibly this loop could be manually optimized
                // to do the sort of thing that LLVM does but without the
                // ANDing the read vectors of u16 with a constant that discards
                // the high half of each u16. As far as I can tell, the
                // optimization assumes that doing a SIMD read past the end of
                // the array is OK.
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
874
/// Generates an unsafe ASCII-validating conversion function that processes
/// whole strides with `$stride_neither_aligned` (no alignment handling) and
/// finishes with a scalar loop that also pinpoints the first non-ASCII unit.
#[allow(unused_macros)]
macro_rules! ascii_simd_unalign {
    // Safety: stride_neither_aligned must be a function that requires src/dest be valid for unaligned reads/writes for SIMD_STRIDE_SIZE elements of type src_unit/dest_unit
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Safety: src and dst must be valid for reads/writes of len elements of type src_unit/dst_unit
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: We know we're valid for `stride` reads/writes, so we can call this function. We don't need alignment.
                    if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                        // The stride function reported failure; the scalar loop below
                        // re-scans from `offset` to locate the offending unit.
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    // This is `offset > len - stride`; when it is false we always have at least `stride` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                // Safety: Uses len invariant here and below
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
919
/// Generates an unsafe Latin1 conversion function (`$src_unit` -> `$dst_unit`
/// via `as`) that processes whole strides with `$stride_neither_aligned`
/// (no alignment handling) and copies the remaining units one at a time.
#[allow(unused_macros)]
macro_rules! latin1_simd_unalign {
    // Safety: stride_neither_aligned must be a function that requires src/dest be valid for unaligned reads/writes for SIMD_STRIDE_SIZE elements of type src_unit/dest_unit
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Safety: src and dst must be valid for unaligned reads/writes of len elements of type src_unit/dst_unit
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: We know we're valid for `stride` reads/writes, so we can call this function. We don't need alignment.
                    $stride_neither_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                    // This is `offset > len - stride`; when it is false we always have at least `stride` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                // Safety: Uses len invariant here
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
950
/// Generates a single-stride ASCII-to-ASCII copy: loads one vector with
/// `$load`, and stores it with `$store` only if `simd_is_ascii` accepts it.
#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either 16x8 or u8x16.
        ///
        /// Returns `true` if the stride was all ASCII and was written to `dst`;
        /// returns `false` without writing anything otherwise.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            $store(dst, simd);
            true
        }
    };
}
969
/// Generates a two-stride ASCII-to-ASCII copy that reads `src` with
/// `load16_aligned` and writes with `$store`, reporting the index of the first
/// non-ASCII byte if there is one.
#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_double_stride {
    // Safety: store must be valid for 32 bytes of write, which may be unaligned (candidates: `store(8|16)_(aligned|unaligned)`)
    ($name:ident, $store:ident) => {
        /// Safety: src must be valid for 32 bytes of aligned u8x16 read
        /// dst must be valid for 32 bytes of write according to
        /// the $store fn, which may allow for unaligned writes or require
        /// alignment to either 16x8 or u8x16.
        ///
        /// Safety-usable invariant: Returns Some(index) if the element at `index` is invalid ASCII
        ///
        /// The first vector is always stored. The second vector is stored
        /// unless the first one already contained a non-ASCII byte.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            $store(dst, first);
            if unlikely(!simd_is_ascii(first | second)) {
                // Safety: mask_ascii produces a mask of all the high bits.
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    // Safety: on little endian systems this will be the number of ascii bytes
                    // before the first non-ascii, i.e. valid for indexing src
                    // TODO SAFETY: What about big-endian systems?
                    return Some(mask_first.trailing_zeros() as usize);
                }
                $store(dst.add(SIMD_STRIDE_SIZE), second);
                let mask_second = mask_ascii(second);
                // Safety: on little endian systems this will be the number of ascii bytes
                // before the first non-ascii, i.e. valid for indexing src
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            $store(dst.add(SIMD_STRIDE_SIZE), second);
            None
        }
    };
}
1005
/// Generates a single-stride ASCII-to-UTF-16 conversion: loads 16 bytes with
/// `$load` and, if `simd_is_ascii` accepts them, widens them with
/// `simd_unpack` and writes the two halves with `$store`.
#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16/32 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either 16x8 or u8x16.
        ///
        /// Returns `true` if the stride was all ASCII and was written to `dst`;
        /// returns `false` without writing anything otherwise.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            // `dst` is a u16 pointer, so the second half starts 8 units in.
            $store(dst.add(8), second);
            true
        }
    };
}
1026
/// Generates a two-stride ASCII-to-UTF-16 conversion that reads `src` with
/// `load16_aligned`, widens with `simd_unpack`, writes with `$store`, and
/// reports the index of the first non-ASCII byte if there is one.
#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_double_stride {
    // Safety: store must be valid for 16 bytes of write, which may be unaligned
    ($name:ident, $store:ident) => {
        /// Safety: src must be valid for `2 * SIMD_STRIDE_SIZE` bytes of aligned reads,
        /// aligned to either 16x8 or u8x16.
        /// dst must be valid for writes of `2 * SIMD_STRIDE_SIZE` u16 code units,
        /// aligned or unaligned as required by the $store fn.
        ///
        /// Returns Some(index) if the byte at `index` is not ASCII, None otherwise.
        /// The widened first vector is always stored. The second vector is
        /// stored unless the first one already contained a non-ASCII byte.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            let (a, b) = simd_unpack(first);
            $store(dst, a);
            // Safety: divide by 2 since it's a u16 pointer
            $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                let (c, d) = simd_unpack(second);
                $store(dst.add(SIMD_STRIDE_SIZE), c);
                $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            let (c, d) = simd_unpack(second);
            $store(dst.add(SIMD_STRIDE_SIZE), c);
            $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
            None
        }
    };
}
1060
/// Generates a single-stride Latin1-to-UTF-16 widening: loads 16 bytes with
/// `$load`, splits them with `simd_unpack` and writes both halves with
/// `$store`. No validation is performed.
#[allow(unused_macros)]
macro_rules! unpack_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either 16x8 or u8x16.
        /// Note that `$store` is called twice (at `dst` and at `dst.add(8)`).
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) {
            let simd = $load(src);
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            // `dst` is a u16 pointer, so the second half starts 8 units in.
            $store(dst.add(8), second);
        }
    };
}
1077
/// Generates a single-stride UTF-16-to-ASCII narrowing: loads two vectors of
/// u16 with `$load` and, if `simd_is_basic_latin` accepts their bitwise OR,
/// packs them with `simd_pack` and writes the result with `$store`.
#[allow(unused_macros)]
macro_rules! basic_latin_to_ascii_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 32/16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either 16x8 or u8x16.
        ///
        /// Returns `true` if the stride passed the check and was written to
        /// `dst`; returns `false` without writing anything otherwise.
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
            let first = $load(src);
            // `src` is a u16 pointer, so the second vector starts 8 units in.
            let second = $load(src.add(8));
            if simd_is_basic_latin(first | second) {
                $store(dst, simd_pack(first, second));
                true
            } else {
                false
            }
        }
    };
}
1098
/// Generates a single-stride UTF-16-to-Latin1 narrowing: loads two vectors of
/// u16 with `$load`, packs them with `simd_pack` and writes the result with
/// `$store`. No validation is performed.
#[allow(unused_macros)]
macro_rules! pack_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 32/16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either 16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) {
            let first = $load(src);
            // `src` is a u16 pointer, so the second vector starts 8 units in.
            let second = $load(src.add(8));
            $store(dst, simd_pack(first, second));
        }
    };
}
1114
1115	cfg_if! {
1116	if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
1117	// SIMD with the same instructions for aligned and unaligned loads and stores
1118
1119	pub const SIMD_STRIDE_SIZE: usize = `16`;
1120
1121	pub const MAX_STRIDE_SIZE: usize = `16`;
1122
1123	// pub const ALIGNMENT: usize = 8;
1124
1125	pub const ALU_STRIDE_SIZE: usize = `16`;
1126
1127	pub const ALU_ALIGNMENT: usize = `8`;
1128
1129	pub const ALU_ALIGNMENT_MASK: usize = `7`;
1130
1131	// Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently produce
1132	// neither_unaligned variants using only unaligned inputs.
1133	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
1134
1135	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
1136	unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
1137
1138	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
1139	pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
1140
1141	// Safety for conversion macros: We use the unalign macro with unalign functions above. All stride functions were produced
1142	// by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
1143	ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
1144	ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
1145	ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
1146	latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
1147	latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
1148	} else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
1149	// SIMD with different instructions for aligned and unaligned loads and stores.
1150	//
1151	// Newer microarchitectures are not supposed to have a performance difference between
1152	// aligned and unaligned SSE2 loads and stores when the address is actually aligned,
1153	// but the benchmark results I see don't agree.
1154
1155	pub const SIMD_STRIDE_SIZE: usize = `16`;
1156
1157	pub const MAX_STRIDE_SIZE: usize = `16`;
1158
1159	pub const SIMD_ALIGNMENT_MASK: usize = `15`;
1160
1161	// Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently name
1162	// aligned/unaligned functions according to src/dst being aligned/unaligned
1163
1164	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
1165	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
1166	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
1167	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
1168
1169	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
1170	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
1171	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
1172	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
1173
1174	unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
1175	unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
1176	unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
1177	unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
1178
1179	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
1180	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
1181	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
1182	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
1183
1184	pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
1185	pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
1186	pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
1187	pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
1188
1189	// Safety for conversion macros: We use the correct pattern of both/src/dst/neither here. All stride functions were produced
1190	// by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
1191
1192	ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
1193	ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
1194	ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
1195	latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
1196	latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
1197	} else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
1198	// SIMD with different instructions for aligned and unaligned loads and stores.
1199	//
1200	// Newer microarchitectures are not supposed to have a performance difference between
1201	// aligned and unaligned SSE2 loads and stores when the address is actually aligned,
1202	// but the benchmark results I see don't agree.
1203
1204	pub const SIMD_STRIDE_SIZE: usize = `16`;
1205
1206	/// Safety-usable invariant: This should be identical to SIMD_STRIDE_SIZE (used by ascii_simd_check_align_unrolled)
1207	pub const SIMD_ALIGNMENT: usize = `16`;
1208
1209	pub const MAX_STRIDE_SIZE: usize = `16`;
1210
1211	pub const SIMD_ALIGNMENT_MASK: usize = `15`;
1212
1213	// Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently name
1214	// aligned/unaligned functions according to src/dst being aligned/unaligned
1215
1216	ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
1217	ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);
1218
1219	ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
1220	ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);
1221
1222	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
1223	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
1224	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
1225
1226	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
1227	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
1228	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
1229
1230	unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
1231	unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
1232
1233	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
1234	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
1235	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
1236	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
1237
1238	pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
1239	pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
1240
1241	// Safety for conversion macros: We use the correct pattern of both/src/dst/neither/double_both/double_src here. All stride functions were produced
1242	// by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
1243
1244	ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
1245	ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);
1246
1247	ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
1248	latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
1249	latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
1250	} else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
// Aligned ALU word, little-endian, 64-bit

/// Safety invariant: this is the amount of bytes consumed by
/// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
/// This is also the number of bytes produced by pack_alu.
/// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
pub const ALU_STRIDE_SIZE: usize = 16;

pub const MAX_STRIDE_SIZE: usize = 16;

// Safety invariant: this is the pointer width in bytes
pub const ALU_ALIGNMENT: usize = 8;

// Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
pub const ALU_ALIGNMENT_MASK: usize = 7;
1266
/// Zero-extends the sixteen bytes held in `word` and `second_word` to 16 bits
/// each and writes the result to `dst` as four `usize`s (little-endian,
/// 64-bit layout).
///
/// The low four bytes of `word` become the first output word and the high four
/// bytes the second; `second_word` fills the third and fourth the same way.
/// Within an output word, input byte `k` lands in the low byte of 16-bit lane
/// `k`, counting lanes from the least significant bits.
///
/// Safety: dst must point to valid space for writing four `usize`s
/// (the writes are plain dereferences, so `dst` must also be `usize`-aligned).
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    let first = ((0x0000_0000_FF00_0000usize & word) << 24)
        | ((0x0000_0000_00FF_0000usize & word) << 16)
        | ((0x0000_0000_0000_FF00usize & word) << 8)
        | (0x0000_0000_0000_00FFusize & word);
    let second = ((0xFF00_0000_0000_0000usize & word) >> 8)
        | ((0x00FF_0000_0000_0000usize & word) >> 16)
        | ((0x0000_FF00_0000_0000usize & word) >> 24)
        | ((0x0000_00FF_0000_0000usize & word) >> 32);
    let third = ((0x0000_0000_FF00_0000usize & second_word) << 24)
        | ((0x0000_0000_00FF_0000usize & second_word) << 16)
        | ((0x0000_0000_0000_FF00usize & second_word) << 8)
        | (0x0000_0000_0000_00FFusize & second_word);
    let fourth = ((0xFF00_0000_0000_0000usize & second_word) >> 8)
        | ((0x00FF_0000_0000_0000usize & second_word) >> 16)
        | ((0x0000_FF00_0000_0000usize & second_word) >> 24)
        | ((0x0000_00FF_0000_0000usize & second_word) >> 32);
    // Safety: fn invariant used here
    *dst = first;
    *(dst.add(1)) = second;
    *(dst.add(2)) = third;
    *(dst.add(3)) = fourth;
}
1292
/// Narrows sixteen 16-bit lanes, passed as four `usize`s, to their low bytes
/// and writes the sixteen bytes to `dst` as two `usize`s (little-endian,
/// 64-bit layout).
///
/// `first` supplies the low four bytes of the first output word and `second`
/// the high four; `third` and `fourth` fill the second output word the same
/// way. The high byte of every 16-bit lane is discarded.
///
/// Safety: dst must point to valid space for writing two `usize`s
/// (the writes are plain dereferences, so `dst` must also be `usize`-aligned).
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    let word = ((0x00FF_0000_0000_0000usize & second) << 8)
        | ((0x0000_00FF_0000_0000usize & second) << 16)
        | ((0x0000_0000_00FF_0000usize & second) << 24)
        | ((0x0000_0000_0000_00FFusize & second) << 32)
        | ((0x00FF_0000_0000_0000usize & first) >> 24)
        | ((0x0000_00FF_0000_0000usize & first) >> 16)
        | ((0x0000_0000_00FF_0000usize & first) >> 8)
        | (0x0000_0000_0000_00FFusize & first);
    let second_word = ((0x00FF_0000_0000_0000usize & fourth) << 8)
        | ((0x0000_00FF_0000_0000usize & fourth) << 16)
        | ((0x0000_0000_00FF_0000usize & fourth) << 24)
        | ((0x0000_0000_0000_00FFusize & fourth) << 32)
        | ((0x00FF_0000_0000_0000usize & third) >> 24)
        | ((0x0000_00FF_0000_0000usize & third) >> 16)
        | ((0x0000_0000_00FF_0000usize & third) >> 8)
        | (0x0000_0000_0000_00FFusize & third);
    // Safety: fn invariant used here
    *dst = word;
    *(dst.add(1)) = second_word;
}
1316	} else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
// Aligned ALU word, little-endian, 32-bit

/// Safety invariant: this is the amount of bytes consumed by
/// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
/// This is also the number of bytes produced by pack_alu.
/// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
pub const ALU_STRIDE_SIZE: usize = 8;

pub const MAX_STRIDE_SIZE: usize = 8;

// Safety invariant: this is the pointer width in bytes
pub const ALU_ALIGNMENT: usize = 4;

// Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
pub const ALU_ALIGNMENT_MASK: usize = 3;
1332
/// Zero-extends the eight bytes held in the low 32 bits of `word` and
/// `second_word` to 16 bits each and writes the result to `dst` as four
/// `usize`s (little-endian, 32-bit layout).
///
/// The low two bytes of `word` become the first output word and the next two
/// bytes the second; `second_word` fills the third and fourth the same way.
/// Within an output word, the lower input byte lands in bits 0-7 and the
/// higher one in bits 16-23.
///
/// Safety: dst must point to valid space for writing four `usize`s
/// (the writes are plain dereferences, so `dst` must also be `usize`-aligned).
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    let first = ((0x0000_FF00usize & word) << 8) | (0x0000_00FFusize & word);
    let second = ((0xFF00_0000usize & word) >> 8) | ((0x00FF_0000usize & word) >> 16);
    let third = ((0x0000_FF00usize & second_word) << 8) | (0x0000_00FFusize & second_word);
    let fourth =
        ((0xFF00_0000usize & second_word) >> 8) | ((0x00FF_0000usize & second_word) >> 16);
    // Safety: fn invariant used here
    *dst = first;
    *(dst.add(1)) = second;
    *(dst.add(2)) = third;
    *(dst.add(3)) = fourth;
}
1350
/// Narrows eight 16-bit lanes, passed as four `usize`s, to their low bytes and
/// writes the eight bytes to `dst` as two `usize`s (little-endian, 32-bit
/// layout).
///
/// `first` supplies the low two bytes of the first output word and `second`
/// the high two; `third` and `fourth` fill the second output word the same
/// way. The high byte of every 16-bit lane is discarded.
///
/// Safety: dst must point to valid space for writing two `usize`s
/// (the writes are plain dereferences, so `dst` must also be `usize`-aligned).
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    let word = ((0x00FF_0000usize & second) << 8)
        | ((0x0000_00FFusize & second) << 16)
        | ((0x00FF_0000usize & first) >> 8)
        | (0x0000_00FFusize & first);
    let second_word = ((0x00FF_0000usize & fourth) << 8)
        | ((0x0000_00FFusize & fourth) << 16)
        | ((0x00FF_0000usize & third) >> 8)
        | (0x0000_00FFusize & third);
    // Safety: fn invariant used here
    *dst = word;
    *(dst.add(1)) = second_word;
}
1366	} else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
// Aligned ALU word, big-endian, 64-bit

/// Safety invariant: this is the amount of bytes consumed by
/// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
/// This is also the number of bytes produced by pack_alu.
/// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
pub const ALU_STRIDE_SIZE: usize = 16;

pub const MAX_STRIDE_SIZE: usize = 16;

// Safety invariant: this is the pointer width in bytes
pub const ALU_ALIGNMENT: usize = 8;

// Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
pub const ALU_ALIGNMENT_MASK: usize = 7;
1382
/// Zero-extends the sixteen bytes held in `word` and `second_word` to 16 bits
/// each and writes the result to `dst` as four `usize`s (big-endian, 64-bit
/// layout).
///
/// The high four bytes of `word` become the first output word and the low four
/// bytes the second; `second_word` fills the third and fourth the same way.
/// Each input byte lands in the low byte of its 16-bit lane, and the order of
/// the bytes from most to least significant is preserved.
///
/// Safety: dst must point to valid space for writing four `usize`s
/// (the writes are plain dereferences, so `dst` must also be `usize`-aligned).
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    let first = ((0xFF00_0000_0000_0000usize & word) >> 8)
        | ((0x00FF_0000_0000_0000usize & word) >> 16)
        | ((0x0000_FF00_0000_0000usize & word) >> 24)
        | ((0x0000_00FF_0000_0000usize & word) >> 32);
    let second = ((0x0000_0000_FF00_0000usize & word) << 24)
        | ((0x0000_0000_00FF_0000usize & word) << 16)
        | ((0x0000_0000_0000_FF00usize & word) << 8)
        | (0x0000_0000_0000_00FFusize & word);
    let third = ((0xFF00_0000_0000_0000usize & second_word) >> 8)
        | ((0x00FF_0000_0000_0000usize & second_word) >> 16)
        | ((0x0000_FF00_0000_0000usize & second_word) >> 24)
        | ((0x0000_00FF_0000_0000usize & second_word) >> 32);
    let fourth = ((0x0000_0000_FF00_0000usize & second_word) << 24)
        | ((0x0000_0000_00FF_0000usize & second_word) << 16)
        | ((0x0000_0000_0000_FF00usize & second_word) << 8)
        | (0x0000_0000_0000_00FFusize & second_word);
    // Safety: fn invariant used here
    *dst = first;
    *(dst.add(1)) = second;
    *(dst.add(2)) = third;
    *(dst.add(3)) = fourth;
}
1408
/// Narrows sixteen 16-bit lanes, passed as four `usize`s, to their low bytes
/// and writes the sixteen bytes to `dst` as two `usize`s (big-endian, 64-bit
/// layout).
///
/// `first` supplies the high four bytes of the first output word and `second`
/// the low four; `third` and `fourth` fill the second output word the same
/// way. The high byte of every 16-bit lane is discarded.
///
/// Safety: dst must point to valid space for writing two `usize`s
/// (the writes are plain dereferences, so `dst` must also be `usize`-aligned).
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    let word = ((0x00FF_0000_0000_0000usize & first) << 8)
        | ((0x0000_00FF_0000_0000usize & first) << 16)
        | ((0x0000_0000_00FF_0000usize & first) << 24)
        | ((0x0000_0000_0000_00FFusize & first) << 32)
        | ((0x00FF_0000_0000_0000usize & second) >> 24)
        | ((0x0000_00FF_0000_0000usize & second) >> 16)
        | ((0x0000_0000_00FF_0000usize & second) >> 8)
        | (0x0000_0000_0000_00FFusize & second);
    let second_word = ((0x00FF_0000_0000_0000usize & third) << 8)
        | ((0x0000_00FF_0000_0000usize & third) << 16)
        | ((0x0000_0000_00FF_0000usize & third) << 24)
        | ((0x0000_0000_0000_00FFusize & third) << 32)
        | ((0x00FF_0000_0000_0000usize & fourth) >> 24)
        | ((0x0000_00FF_0000_0000usize & fourth) >> 16)
        | ((0x0000_0000_00FF_0000usize & fourth) >> 8)
        | (0x0000_0000_0000_00FFusize & fourth);
    // Safety: fn invariant used here
    *dst = word;
    *(dst.add(1)) = second_word;
}
1432	} else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
// Aligned ALU word, big-endian, 32-bit

// Safety invariant: this is the pointer width in bytes
pub const ALU_ALIGNMENT: usize = 4;

// Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

/// Safety invariant: this is the amount of bytes consumed by
/// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
/// This is also the number of bytes produced by pack_alu.
/// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;

// Same value as ALU_STRIDE_SIZE in this configuration.
pub const MAX_STRIDE_SIZE: usize = 8;
1448
/// Widens eight bytes, held in two words, into eight 16-bit lanes with zeroed
/// high bytes (big-endian, 32-bit `usize`).
///
/// Byte order is preserved: the two most significant bytes of `word` go to the
/// word stored at `dst`, its two least significant bytes to `dst.add(1)`, and
/// `second_word` fills `dst.add(2)` and `dst.add(3)` the same way.
///
/// Safety: dst must point to valid space for writing four `usize`s
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    // Spreads the two low bytes of `pair` into two 16-bit lanes.
    #[inline(always)]
    fn widen(pair: usize) -> usize {
        ((pair & 0xFF00) << 8) | (pair & 0x00FF)
    }

    let first = widen(word >> 16);
    let second = widen(word);
    let third = widen(second_word >> 16);
    let fourth = widen(second_word);
    // Safety: fn invariant used here
    dst.write(first);
    dst.add(1).write(second);
    dst.add(2).write(third);
    dst.add(3).write(fourth);
}
1466
/// Packs eight 16-bit lanes, spread over four words, into eight bytes
/// (big-endian, 32-bit `usize`).
///
/// Only the low byte of every 16-bit lane is kept; the high byte is dropped
/// without being inspected. `first` and `second` form the word stored at `dst`
/// (lanes of `first` in the more significant half), `third` and `fourth` form
/// the word stored at `dst.add(1)`.
///
/// Safety: dst must point to valid space for writing two `usize`s
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    // Gathers the low bytes of the two 16-bit lanes of `lanes` into the low
    // 16 bits of the result, most significant lane first.
    #[inline(always)]
    fn narrow(lanes: usize) -> usize {
        ((lanes >> 8) & 0xFF00) | (lanes & 0x00FF)
    }

    let word = (narrow(first) << 16) | narrow(second);
    let second_word = (narrow(third) << 16) | narrow(fourth);
    // Safety: fn invariant used here
    dst.write(word);
    dst.add(1).write(second_word);
}
1482	} else {
// Fallback branch of this cfg_if!: taken when none of the preceding cfg conditions
// matched. The three conversion functions are generated by `ascii_naive!` here.
// NOTE(review): the arguments look like (function name, source unit type,
// destination unit type) - confirm against the macro definition.
1483	ascii_naive!(ascii_to_ascii, u8, u8);
1484	ascii_naive!(ascii_to_basic_latin, u8, u16);
1485	ascii_naive!(basic_latin_to_ascii, u16, u8);
1486	}
1487	}
1488
/// Counts the zero bits that precede the first set bit of `word`, starting
/// from the end of the word that holds the "first byte" of UTF-8 data packed
/// into a `usize` with the target endianness.
///
/// Safety-usable invariant: on little-endian targets this is the number of
/// trailing zero bits, on every other target the number of leading zero bits.
#[allow(dead_code)]
#[inline(always)]
fn count_zeros(word: usize) -> u32 {
    // `cfg!` is resolved at compile time, so only one arm survives.
    if cfg!(target_endian = "little") {
        word.trailing_zeros()
    } else {
        word.leading_zeros()
    }
}
1506
1507	cfg_if! {
1508	if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
1509	/// Safety-usable invariant: Will return the value and position of the first non-ASCII byte in the slice in a Some if found.
1510	/// In other words, the first element of the Some is always `> 127`
1511	#[inline(always)]
1512	pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1513	let src = slice.as_ptr();
1514	let len = slice.len();
1515	let mut offset = `0usize`;
1516	// Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
1517	if SIMD_STRIDE_SIZE <= len {
1518	let len_minus_stride = len - SIMD_STRIDE_SIZE;
1519	loop {
1520	// Safety: src at offset is valid for a `SIMD_STRIDE_SIZE` read
1521	let simd = unsafe { load16_unaligned(src.add(offset)) };
1522	if !simd_is_ascii(simd) {
1523	break;
1524	}
1525	offset += SIMD_STRIDE_SIZE;
1526	// This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
1527	if offset > len_minus_stride {
1528	break;
1529	}
1530	}
1531	}
1532	while offset < len {
1533	let code_unit = slice[offset];
1534	if code_unit > `127` {
1535	// Safety: Safety-usable invariant upheld here
1536	return Some((code_unit, offset));
1537	}
1538	offset += `1`;
1539	}
1540	None
1541	}
1542	} else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
/// ASCII validation for builds with `simd-accel` and SSE2 (see the `cfg` on this branch).
///
/// Strategy, in order: check one unaligned 16-byte vector; if enough input remains, step to a
/// `SIMD_ALIGNMENT` boundary and check two aligned vectors per iteration (plus at most one
/// trailing aligned vector); otherwise check at most two more unaligned vectors. Any bytes
/// still left are checked one at a time. Returns `None` if no byte above 127 is found.
///
1543	/// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
1544	/// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
1545	#[inline(always)]
1546	pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1547	let src = slice.as_ptr();
1548	let len = slice.len();
1549	let mut offset = `0usize`;
1550	// Safety: if this check succeeds we're valid for reading at least `stride` elements.
1551	if SIMD_STRIDE_SIZE <= len {
1552	// First, process one unaligned vector
1553	// Safety: src is valid for a `SIMD_STRIDE_SIZE` read
1554	let simd = unsafe { load16_unaligned(src) };
1555	let mask = mask_ascii(simd);
1556	if mask != `0` {
1557	offset = mask.trailing_zeros() as usize;
// Safety: `mask` is non-zero, so `offset` should index one of the bytes just loaded
// (assumes `mask_ascii` yields one bit per byte lane - confirm against simd_funcs).
1558	let non_ascii = unsafe { *src.add(offset) };
1559	return Some((non_ascii, offset));
1560	}
1561	offset = SIMD_STRIDE_SIZE;
1562	// Safety: Now that offset has changed we don't yet know how much it is valid for
1563
1564	// We have now seen 16 ASCII bytes. Let's guess that
1565	// there will be enough more to justify more expense
1566	// in the case of non-ASCII.
1567	// Use aligned reads for the sake of old microarchitectures.
1568	// Safety: this correctly calculates the number of src_units that need to be read before the remaining list is aligned.
1569	// This is by definition less than SIMD_ALIGNMENT, which is defined to be equal to SIMD_STRIDE_SIZE.
1570	let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
1571	// This addition won't overflow, because even in the 32-bit PAE case the
1572	// address space holds enough code that the slice length can't be that
1573	// close to address space size.
1574	// offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
1575	//
1576	// Safety: if this check succeeds we're valid for reading at least `2 SIMD_STRIDE_SIZE` elements plus `until_alignment`.*
1577	// The extra SIMD_STRIDE_SIZE in the condition is because `offset` is already `SIMD_STRIDE_SIZE`.
1578	if until_alignment + (SIMD_STRIDE_SIZE * `3`) <= len {
1579	if until_alignment != `0` {
1580	// Safety: this is safe to call since we're valid for this read (and more), and don't care about alignment
1581	// Some bytes will be examined twice, since `offset` advances by `until_alignment` rather than SIMD_STRIDE_SIZE below. This is fine.
1582	let simd = unsafe { load16_unaligned(src.add(offset)) };
1583	let mask = mask_ascii(simd);
1584	if mask != `0` {
1585	offset += mask.trailing_zeros() as usize;
// Safety: `mask` is non-zero, so `offset` should stay within the vector just loaded
// (same assumption about `mask_ascii` as for the first vector).
1586	let non_ascii = unsafe { *src.add(offset) };
1587	return Some((non_ascii, offset));
1588	}
1589	offset += until_alignment;
1590	}
1591	// Safety: At this point we're valid for reading 2SIMD_STRIDE_SIZE elements*
1592	// Safety: Now `offset` is aligned for `src`
1593	let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * `2`);
1594	loop {
1595	// Safety: We were valid for this read, and were aligned.
1596	let first = unsafe { load16_aligned(src.add(offset)) };
1597	let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
1598	if !simd_is_ascii(first \| second) {
1599	// Safety: mask_ascii produces a mask of all the high bits.
1600	let mask_first = mask_ascii(first);
1601	if mask_first != `0` {
1602	// Safety: on little endian systems this will be the number of ascii bytes
1603	// before the first non-ascii, i.e. valid for indexing src
1604	// Big-endian is not a concern here: this branch is only compiled with `target_feature = "sse2"`, i.e. on x86/x86_64, which are little-endian.
1605	offset += mask_first.trailing_zeros() as usize;
1606	} else {
1607	let mask_second = mask_ascii(second);
1608	// Safety: on little endian systems this will be the number of ascii bytes
1609	// before the first non-ascii, i.e. valid for indexing src
1610	offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
1611	}
1612	// Safety: We know this is non-ASCII, and can uphold the safety-usable invariant here
1613	let non_ascii = unsafe { *src.add(offset) };
1614
1615	return Some((non_ascii, offset));
1616	}
1617	offset += SIMD_STRIDE_SIZE * `2`;
1618	// Safety: This is `offset > len - 2 SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.*
1619	if offset > len_minus_stride_times_two {
1620	break;
1621	}
1622	}
1623	// Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
1624	if offset + SIMD_STRIDE_SIZE <= len {
1625	// Safety: We were valid for this read, and were aligned.
1626	let simd = unsafe { load16_aligned(src.add(offset)) };
1627	// Safety: mask_ascii produces a mask of all the high bits.
1628	let mask = mask_ascii(simd);
1629	if mask != `0` {
1630	// Safety: on little endian systems this will be the number of ascii bytes
1631	// before the first non-ascii, i.e. valid for indexing src
1632	offset += mask.trailing_zeros() as usize;
1633	let non_ascii = unsafe { *src.add(offset) };
1634	// Safety: We know this is non-ASCII, and can uphold the safety-usable invariant here
1635	return Some((non_ascii, offset));
1636	}
1637	offset += SIMD_STRIDE_SIZE;
1638	}
1639	} else {
1640	// Safety: this is the unaligned branch
1641	// At most two iterations, so unroll
1642	// Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
1643	if offset + SIMD_STRIDE_SIZE <= len {
1644	// Safety: We're valid for this read but must use an unaligned read
1645	let simd = unsafe { load16_unaligned(src.add(offset)) };
1646	let mask = mask_ascii(simd);
1647	if mask != `0` {
1648	offset += mask.trailing_zeros() as usize;
1649	let non_ascii = unsafe { *src.add(offset) };
1650	// Safety-usable invariant upheld here (same as above)
1651	return Some((non_ascii, offset));
1652	}
1653	offset += SIMD_STRIDE_SIZE;
1654	// Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
1655	if offset + SIMD_STRIDE_SIZE <= len {
1656	// Safety: We're valid for this read but must use an unaligned read
1657	let simd = unsafe { load16_unaligned(src.add(offset)) };
1658	let mask = mask_ascii(simd);
1659	if mask != `0` {
1660	offset += mask.trailing_zeros() as usize;
1661	let non_ascii = unsafe { *src.add(offset) };
1662	// Safety-usable invariant upheld here (same as above)
1663	return Some((non_ascii, offset));
1664	}
1665	offset += SIMD_STRIDE_SIZE;
1666	}
1667	}
1668	}
1669	}
// Scalar tail: checks every byte not covered by the vector paths above
// (the whole slice when it is shorter than SIMD_STRIDE_SIZE).
1670	while offset < len {
1671	// Safety: relies straightforwardly on the `len` invariant
1672	let code_unit = unsafe { *(src.add(offset)) };
1673	if code_unit > `127` {
1674	// Safety-usable invariant upheld here
1675	return Some((code_unit, offset));
1676	}
1677	offset += `1`;
1678	}
1679	None
1680	}
1681	} else {
1682	// Safety-usable invariant: returns byte index of first non-ascii byte
1683	#[inline(always)]
1684	fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
1685	let word_masked = word & ASCII_MASK;
1686	let second_masked = second_word & ASCII_MASK;
1687	if (word_masked \| second_masked) == `0` {
1688	// Both are ascii, invariant upheld
1689	return None;
1690	}
1691	if word_masked != `0` {
1692	let zeros = count_zeros(word_masked);
1693	// `zeros` now contains 0 to 7 (for the seven bits of masked ASCII in little endian,
1694	// or up to 7 bits of non-ASCII in big endian if the first byte is non-ASCII)
1695	// plus 8 times the number of ASCII in text order before the
1696	// non-ASCII byte in the little-endian case or 8 times the number of ASCII in
1697	// text order before the non-ASCII byte in the big-endian case.
1698	let num_ascii = (zeros >> `3`) as usize;
1699	// Safety-usable invariant upheld here
1700	return Some(num_ascii);
1701	}
1702	let zeros = count_zeros(second_masked);
1703	// `zeros` now contains 0 to 7 (for the seven bits of masked ASCII in little endian,
1704	// or up to 7 bits of non-ASCII in big endian if the first byte is non-ASCII)
1705	// plus 8 times the number of ASCII in text order before the
1706	// non-ASCII byte in the little-endian case or 8 times the number of ASCII in
1707	// text order before the non-ASCII byte in the big-endian case.
1708	let num_ascii = (zeros >> `3`) as usize;
1709	// Safety-usable invariant upheld here
1710	Some(ALU_ALIGNMENT + num_ascii)
1711	}
1712
1713	/// Safety: `src` must be valid for the reads of two `usize`s
1714	///
1715	/// Safety-usable invariant: will return byte index of first non-ascii byte
1716	#[inline(always)]
1717	unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
1718	let word = *src;
1719	let second_word = *(src.add(`1`));
1720	find_non_ascii(word, second_word)
1721	}
1722
1723	/// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
1724	/// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
1725	#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
1726	#[inline(always)]
1727	pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1728	let src = slice.as_ptr();
1729	let len = slice.len();
1730	let mut offset = `0usize`;
1731	let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
1732	// Safety: If this check fails we're valid to read `until_alignment + ALU_STRIDE_SIZE` elements
1733	if until_alignment + ALU_STRIDE_SIZE <= len {
1734	while until_alignment != `0` {
1735	let code_unit = slice[offset];
1736	if code_unit > `127` {
1737	// Safety-usable invairant upheld here
1738	return Some((code_unit, offset));
1739	}
1740	offset += `1`;
1741	until_alignment -= `1`;
1742	}
1743	// Safety: At this point we have read until_alignment elements and
1744	// are valid for `ALU_STRIDE_SIZE` more.
1745	let len_minus_stride = len - ALU_STRIDE_SIZE;
1746	loop {
1747	// Safety: we were valid for this read
1748	let ptr = unsafe { src.add(offset) as *const usize };
1749	if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
1750	offset += num_ascii;
1751	// Safety-usable invairant upheld here using the invariant from validate_ascii_stride()
1752	return Some((unsafe { *(src.add(offset)) }, offset));
1753	}
1754	offset += ALU_STRIDE_SIZE;
1755	// Safety: This is `offset > ALU_STRIDE_SIZE` which means we always have at least `2 ALU_STRIDE_SIZE` elements to munch next time.*
1756	if offset > len_minus_stride {
1757	break;
1758	}
1759	}
1760	}
1761	while offset < len {
1762	let code_unit = slice[offset];
1763	if code_unit > `127` {
1764	// Safety-usable invairant upheld here
1765	return Some((code_unit, offset));
1766	}
1767	offset += `1`;
1768	}
1769	None
1770	}
1771
1772	}
1773	}
1774
1775	cfg_if! {
1776	if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {
1777
1778	} else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
// Even with NEON enabled, we use the ALU path for ASCII validation, because testing
// on Exynos 5 indicated that using NEON isn't worthwhile where there are only
// vector reads without vector writes.

// Alignment, in bytes, that the ALU validation loop establishes before reading words.
pub const ALU_ALIGNMENT: usize = 4;

// Mask selecting the address bits that must be zero for ALU_ALIGNMENT.
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

// Bytes consumed per ALU stride: two words of ALU_ALIGNMENT bytes each.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;
1788	} else {
1789	// Safety: src points to two valid `usize`s, dst points to four valid `usize`s
1790	#[inline(always)]
1791	unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1792	// Safety: src safety invariant used here
1793	let word = *src;
1794	let second_word = *(src.add(`1`));
1795	// Safety: dst safety invariant passed down
1796	unpack_alu(word, second_word, dst);
1797	}
1798
1799	// Safety: src points to four valid `usize`s, dst points to two valid `usize`s
1800	#[inline(always)]
1801	unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1802	// Safety: src safety invariant used here
1803	let first = *src;
1804	let second = *(src.add(`1`));
1805	let third = *(src.add(`2`));
1806	let fourth = *(src.add(`3`));
1807	// Safety: dst safety invariant passed down
1808	pack_alu(first, second, third, fourth, dst);
1809	}
1810
1811	// Safety: src points to two valid `usize`s, dst points to four valid `usize`s
1812	#[inline(always)]
1813	unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1814	// Safety: src safety invariant used here
1815	let word = *src;
1816	let second_word = *(src.add(`1`));
1817	// Check if the words contains non-ASCII
1818	if (word & ASCII_MASK) \| (second_word & ASCII_MASK) != `0` {
1819	return `false`;
1820	}
1821	// Safety: dst safety invariant passed down
1822	unpack_alu(word, second_word, dst);
1823	`true`
1824	}
1825
1826	// Safety: src points four valid `usize`s, dst points to two valid `usize`s
1827	#[inline(always)]
1828	unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1829	// Safety: src safety invariant used here
1830	let first = *src;
1831	let second = *(src.add(`1`));
1832	let third = *(src.add(`2`));
1833	let fourth = *(src.add(`3`));
1834	if (first & BASIC_LATIN_MASK) \| (second & BASIC_LATIN_MASK) \| (third & BASIC_LATIN_MASK) \| (fourth & BASIC_LATIN_MASK) != `0` {
1835	return `false`;
1836	}
1837	// Safety: dst safety invariant passed down
1838	pack_alu(first, second, third, fourth, dst);
1839	`true`
1840	}
1841
1842	// Safety: src, dst both point to two valid `usize`s each
1843	// Safety-usable invariant: Will return byte index of first non-ascii byte.
1844	#[inline(always)]
1845	unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
1846	// Safety: src safety invariant used here
1847	let word = *src;
1848	let second_word = *(src.add(`1`));
1849	// Safety: src safety invariant used here
1850	*dst = word;
1851	*(dst.add(`1`)) = second_word;
1852	// Relies on safety-usable invariant here
1853	find_non_ascii(word, second_word)
1854	}
1855
// Generate the conversion functions for this configuration from the stride
// helpers defined above in this branch.
// NOTE(review): the arguments look like (generated function name, source unit
// type, destination unit type, stride helper) - confirm against the macro definitions.
1856	basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
1857	basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
1858	latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
1859	latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
1860	// Safety invariant upheld: ascii_to_ascii_stride will return byte index of first non-ascii if found
1861	ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
1862	}
1863	}
1864
1865	pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
1866	match validate_ascii(slice:bytes) {
1867	None => bytes.len(),
1868	Some((_, num_valid: usize)) => num_valid,
1869	}
1870	}
1871
/// Returns the length of the longest prefix of `bytes` that contains neither a
/// non-ASCII byte (`>= 0x80`) nor one of the control bytes ESC (`0x1B`),
/// SO (`0x0E`) or SI (`0x0F`).
///
/// Returns `bytes.len()` when no such byte occurs, including for empty input.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&b| b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F)
        .unwrap_or(bytes.len())
}
1881
1882	// Any copyright to the test code below this comment is dedicated to the
1883	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
1884
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    // Generates a test for one conversion function. For every position in a
    // 32-unit buffer, the source holds `j + 0x40` at index `j` except for a
    // single 0xAA at the position under test. The function must report that
    // unit and its index, and must have written the expected units before it.
    macro_rules! test_ascii {
        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
            #[test]
            fn $test_name() {
                const UNITS: usize = 32;
                let mut src: Vec<$src_unit> = Vec::with_capacity(UNITS);
                let mut dst: Vec<$dst_unit> = Vec::with_capacity(UNITS);
                for bad_index in 0..UNITS {
                    src.clear();
                    src.extend((0..UNITS).map(|j| {
                        let unit = if j == bad_index { 0xAA } else { j + 0x40 };
                        unit as $src_unit
                    }));
                    dst.clear();
                    dst.resize(UNITS, 0);
                    let outcome =
                        unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), UNITS) };
                    let (non_ascii, num_ascii) = match outcome {
                        Some(found) => found,
                        None => unreachable!("Should always find non-ASCII"),
                    };
                    assert_eq!(non_ascii, 0xAA);
                    assert_eq!(num_ascii, bad_index);
                    // Only the units before the non-ASCII one are checked.
                    for (j, &written) in dst.iter().enumerate().take(bad_index) {
                        assert_eq!(written, (j + 0x40) as $dst_unit);
                    }
                }
            }
        };
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
}
1923