ascii.rs source code [crates/encoding_rs-0.8.32/src/ascii.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	// It's assumed that in due course Rust will have explicit SIMD but will not
11	// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
12	// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
13	// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
14	// mess. Under the circumstances, it seems to make sense to optimize the ALU
15	// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
16	// numbers of the actual ARMv7 CPU I have access to, because (thermal?)
17	// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
18	// ARMv7 code) produced reproducible performance numbers, that's the ARM
19	// computer that this code ended up being optimized for in the ALU case.
20	// Less popular CPU architectures simply get the approach that was chosen based
21	// on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
22	// different approaches based on benchmarking on Raspberry Pi 3.
23
24	#[cfg(all(
25	feature = "simd-accel",
26	any(
27	target_feature = "sse2",
28	all(target_endian = "little", target_arch = "aarch64"),
29	all(target_endian = "little", target_feature = "neon")
30	)
31	))]
32	use crate::simd_funcs::*;
33
34	cfg_if! {
35	if #[cfg(feature = "simd-accel")] {
36	#[allow(unused_imports)]
37	use ::core::intrinsics::unlikely;
38	#[allow(unused_imports)]
39	use ::core::intrinsics::likely;
40	} else {
41	#[allow(dead_code)]
42	#[inline(always)]
43	fn unlikely(b: bool) -> bool {
44	b
45	}
46	#[allow(dead_code)]
47	#[inline(always)]
48	fn likely(b: bool) -> bool {
49	b
50	}
51	}
52	}
53
/// A `usize` in which the most significant bit of every byte is set
/// (`0x80` repeated). A word ANDed with this mask is non-zero exactly when
/// at least one of its bytes is above 127.
// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;
57
/// A `usize` built from the 16-bit pattern `0xFF80` repeated. A word of
/// 16-bit units ANDed with this mask is non-zero exactly when at least one
/// of the units is above 127.
// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
61
// Generates `$name`, a unit-at-a-time copy from `$src_unit` to `$dst_unit`
// that stops at the first unit above 127.
#[allow(unused_macros)]
macro_rules! ascii_naive {
    ($name:ident, $src_unit:ty, $dst_unit:ty) => {
        /// Copies up to `len` code units from `src` to `dst`, converting
        /// each with `as`.
        ///
        /// Returns `None` when all `len` units were at most 127 and have
        /// been copied. Otherwise returns `Some((unit, index))` for the
        /// first unit above 127; the units before `index` have been written
        /// and nothing at or after `index` is written.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            // Yes, manually omitting the bound check here matters
            // a lot for perf.
            for i in 0..len {
                let code_unit = *(src.add(i));
                if code_unit > 127 {
                    return Some((code_unit, i));
                }
                *(dst.add(i)) = code_unit as $dst_unit;
            }
            None
        }
    };
}
84
// Generates `$name`, an ASCII copy that processes `ALU_STRIDE_SIZE` units per
// call to `$stride_fn` once both pointers have been moved to ALU alignment.
// `$stride_fn` receives the current source and destination positions cast to
// `*const usize` / `*mut usize` and returns `Some(n)` when a non-ASCII unit
// was found `n` units into the stride, or `None` when the stride was all
// ASCII.
#[allow(unused_macros)]
macro_rules! ascii_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => {
        /// Copies up to `len` code units from `src` to `dst`, stopping at
        /// the first unit above 127.
        ///
        /// Returns `None` when every unit was at most 127, otherwise
        /// `Some((unit, index))` for the first unit above 127. All units
        /// before `index` have been written to `dst`.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                let mut until_alignment = {
                    // This is the equal-unit-size (`ascii_to_ascii`) case:
                    // the word-sized path is only usable when source and
                    // destination are equally far from ALU alignment. The
                    // cases where the unit sizes differ are implemented by
                    // `basic_latin_alu`.
                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
                    if src_alignment != dst_alignment {
                        break;
                    }
                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Last offset at which a whole stride still fits.
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if let Some(num_ascii) = $stride_fn(
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            offset += num_ascii;
                            return Some((*(src.add(offset)), offset));
                        }
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Tail (or the whole input when the fast path was not usable).
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
175
// Generates `$name`, an ASCII/Basic Latin copy between units of different
// width that processes `ALU_STRIDE_SIZE` units per call to `$stride_fn` once
// the pointers are ALU-aligned. `$stride_fn` receives the current source and
// destination positions cast to `*const usize` / `*mut usize` and returns
// `true` when the stride was handled, `false` when the caller must fall back
// to the unit-at-a-time loop.
#[allow(unused_macros)]
macro_rules! basic_latin_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => {
        /// Copies up to `len` code units from `src` to `dst`, converting
        /// each with `as` and stopping at the first unit above 127.
        ///
        /// Returns `None` when every unit was at most 127, otherwise
        /// `Some((unit, index))` for the first unit above 127. All units
        /// before `index` have been written to `dst`.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment. (The equal-width case is implemented by
                    // `ascii_alu`.)
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // ascii_to_basic_latin
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // basic_latin_to_ascii
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Last offset at which a whole stride still fits.
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if !$stride_fn(
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            // The loop below re-examines this stride one
                            // unit at a time.
                            break;
                        }
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
277
// Generates `$name`, an unconditional unit-by-unit `as` conversion between
// units of different width that processes `ALU_STRIDE_SIZE` units per call to
// `$stride_fn` once the pointers are ALU-aligned. `$stride_fn` receives the
// current source and destination positions cast to `*const usize` /
// `*mut usize`; its return value is not used.
#[allow(unused_macros)]
macro_rules! latin1_alu {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
        /// Converts `len` code units from `src` to `dst` with `as`, without
        /// inspecting their values.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Number of units to process one at a time before the
                // narrower side is ALU-aligned; bail out to the plain loop
                // if the wider side would not be aligned at that point.
                let mut until_alignment = {
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // unpack
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // pack
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Last offset at which a whole stride still fits.
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        $stride_fn(
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        );
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
344
// Generates `$name`, an ASCII copy that processes `SIMD_STRIDE_SIZE` units
// per step, picking one of four stride functions according to whether `src`
// and `dst` are SIMD-aligned. Each stride function takes the current source
// and destination pointers and returns `true` when the stride was handled,
// `false` when the caller must fall back to the unit-at-a-time loop.
#[allow(unused_macros)]
macro_rules! ascii_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_dst_aligned:ident,
        $stride_neither_aligned:ident
    ) => {
        /// Copies up to `len` code units from `src` to `dst`, converting
        /// each with `as` and stopping at the first unit above 127.
        ///
        /// Returns `None` when every unit was at most 127, otherwise
        /// `Some((unit, index))` for the first unit above 127. All units
        /// before `index` have been written to `dst`.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Last offset at which a whole stride still fits.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // XXX Should we first process one stride unconditionally as unaligned to
                // avoid the cost of the branchiness below if the first stride fails anyway?
                // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
                // on Haswell, it would make sense to just use unaligned and not bother
                // checking. Need to benchmark older architectures before deciding.
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            // Tail, or the stride that was rejected, one unit at a time.
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
429
// Generates `$name`, an ASCII copy that first tries one unaligned stride,
// then moves the source to SIMD alignment and processes two strides per step.
//
// The single-stride functions take the current source and destination
// pointers and return `true` when the stride was handled, `false` when the
// caller must fall back to the unit-at-a-time loop. The double-stride
// functions return `None` when both strides were handled, or `Some(n)` when a
// non-ASCII unit was found `n` units from the pointer passed in.
#[allow(unused_macros)]
macro_rules! ascii_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_neither_aligned:ident,
        $double_stride_both_aligned:ident,
        $double_stride_src_aligned:ident
    ) => {
        /// Copies up to `len` code units from `src` to `dst`, converting
        /// each with `as` and stopping at the first unit above 127.
        ///
        /// Returns `None` when every unit was at most 127, otherwise
        /// `Some((unit, index))` for the first unit above 127. All units
        /// before `index` have been written to `dst`.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // This loop is only broken out of as a goto forward without
            // actually looping
            'outer: loop {
                if SIMD_STRIDE_SIZE <= len {
                    // First, process one unaligned
                    if !$stride_neither_aligned(src, dst) {
                        break 'outer;
                    }
                    offset = SIMD_STRIDE_SIZE;

                    // We have now seen 16 ASCII bytes. Let's guess that
                    // there will be enough more to justify more expense
                    // in the case of non-ASCII.
                    // Use aligned reads for the sake of old microarchitectures.
                    let until_alignment = ((SIMD_ALIGNMENT
                        - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
                        & SIMD_ALIGNMENT_MASK)
                        / unit_size;
                    // This addition won't overflow, because even in the 32-bit PAE case the
                    // address space holds enough code that the slice length can't be that
                    // close to address space size.
                    // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                    if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                        if until_alignment != 0 {
                            // Process one more unaligned stride but only
                            // advance to the aligned position, so the next
                            // stride overlaps this one.
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break 'outer;
                            }
                            offset += until_alignment;
                        }
                        // Last offset at which two whole strides still fit.
                        let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                        let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
                        if dst_masked == 0 {
                            loop {
                                if let Some(advance) =
                                    $double_stride_both_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        } else {
                            loop {
                                if let Some(advance) =
                                    $double_stride_src_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    } else {
                        // At most two iterations, so unroll
                        if offset + SIMD_STRIDE_SIZE <= len {
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break 'outer;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    }
                }
                break 'outer;
            }
            // Tail, or the stride that was rejected, one unit at a time.
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
552
// Generates `$name`, an unconditional unit-by-unit `as` conversion that
// processes `SIMD_STRIDE_SIZE` units per step, picking one of four stride
// functions according to whether `src` and `dst` are SIMD-aligned. The stride
// functions take the current source and destination pointers; their return
// value is not used.
#[allow(unused_macros)]
macro_rules! latin1_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_dst_aligned:ident,
        $stride_neither_aligned:ident
    ) => {
        /// Converts `len` code units from `src` to `dst` with `as`, without
        /// inspecting their values.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Last offset at which a whole stride still fits.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            $stride_dst_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            $stride_neither_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            // Remaining units that do not fill a whole stride.
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
616
// Generates `$name`, an unconditional unit-by-unit `as` conversion that moves
// the source to SIMD alignment with a scalar loop and then processes two
// strides per step. Only `$stride_both_aligned` and `$stride_src_aligned` are
// called; the other two stride parameters are accepted but unused. The stride
// functions take the current source and destination pointers; their return
// value is not used.
#[allow(unused_macros)]
macro_rules! latin1_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_dst_aligned:ident,
        $stride_neither_aligned:ident
    ) => {
        /// Converts `len` code units from `src` to `dst` with `as`, without
        /// inspecting their values.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Units to handle one at a time until `src` is SIMD-aligned.
                // This is always less than `SIMD_STRIDE_SIZE`, so it cannot
                // exceed `len` here.
                let mut until_alignment = ((SIMD_STRIDE_SIZE
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                while until_alignment != 0 {
                    // Both sides must be dereferenced: read the unit at
                    // `src + offset` and store it at `dst + offset`.
                    *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
                    offset += 1;
                    until_alignment -= 1;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                if offset + SIMD_STRIDE_SIZE * 2 <= len {
                    // Last offset at which two whole strides still fit.
                    let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
                    if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
                        loop {
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    } else {
                        loop {
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    }
                }
                // One more single stride if it still fits.
                if offset < len_minus_stride {
                    $stride_src_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                }
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                // On x86_64, this loop autovectorizes but in the pack
                // case there are instructions whose purpose is to make sure
                // each u16 in the vector is truncated before packing. However,
                // since we don't care about saturating behavior of SSE2 packing
                // when the input isn't Latin1, those instructions are useless.
                // Unfortunately, using the `assume` intrinsic to lie to the
                // optimizer doesn't make LLVM omit the truncation that we
                // don't need. Possibly this loop could be manually optimized
                // to do the sort of thing that LLVM does but without the
                // ANDing the read vectors of u16 with a constant that discards
                // the high half of each u16. As far as I can tell, the
                // optimization assumes that doing a SIMD read past the end of
                // the array is OK.
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
693
// Generates `$name`, an ASCII copy that processes `SIMD_STRIDE_SIZE` units
// per call to `$stride_neither_aligned` without checking pointer alignment.
// The stride function takes the current source and destination pointers and
// returns `true` when the stride was handled, `false` when the caller must
// fall back to the unit-at-a-time loop.
#[allow(unused_macros)]
macro_rules! ascii_simd_unalign {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Copies up to `len` code units from `src` to `dst`, converting
        /// each with `as` and stopping at the first unit above 127.
        ///
        /// Returns `None` when every unit was at most 127, otherwise
        /// `Some((unit, index))` for the first unit above 127. All units
        /// before `index` have been written to `dst`.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Last offset at which a whole stride still fits.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            // Tail, or the stride that was rejected, one unit at a time.
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
728
// Generates `$name`, an unconditional unit-by-unit `as` conversion that
// processes `SIMD_STRIDE_SIZE` units per call to `$stride_neither_aligned`
// without checking pointer alignment. The stride function takes the current
// source and destination pointers; its return value is not used.
#[allow(unused_macros)]
macro_rules! latin1_simd_unalign {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Converts `len` code units from `src` to `dst` with `as`, without
        /// inspecting their values.
        ///
        /// # Safety
        ///
        /// `src` must be valid for reads of `len` units and `dst` must be
        /// valid for writes of `len` units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Last offset at which a whole stride still fits.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    $stride_neither_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            // Remaining units that do not fill a whole stride.
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
753
// Generates `$name`, a single-vector ASCII-to-ASCII stride built from the
// given `$load` and `$store` functions.
#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        /// Loads one vector from `src` with `$load`. If `simd_is_ascii`
        /// accepts it, stores it to `dst` with `$store` and returns `true`;
        /// otherwise returns `false` without writing to `dst`.
        ///
        /// # Safety
        ///
        /// `src` and `dst` must satisfy the validity and alignment
        /// requirements of the chosen `$load` and `$store` functions.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            $store(dst, simd);
            true
        }
    };
}
768
// Generates `$name`, an ASCII-to-ASCII stride over two consecutive vectors.
// Both vectors are read with `load16_aligned`; `$store` selects how they are
// written.
#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_double_stride {
    ($name:ident, $store:ident) => {
        /// Processes `2 * SIMD_STRIDE_SIZE` bytes starting at `src`.
        ///
        /// Returns `None` when both vectors were ASCII and have been stored
        /// to `dst`. Otherwise returns `Some(n)`, where `n` is the position
        /// of the first non-ASCII byte as computed from `mask_ascii`.
        /// The first vector is always stored; the second is stored unless
        /// the non-ASCII byte is in the first vector.
        ///
        /// # Safety
        ///
        /// `src` must satisfy the requirements of `load16_aligned` for two
        /// consecutive vectors and `dst` those of `$store`.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            $store(dst, first);
            // One ASCII check covers both vectors on the expected path.
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                $store(dst.add(SIMD_STRIDE_SIZE), second);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            $store(dst.add(SIMD_STRIDE_SIZE), second);
            None
        }
    };
}
791
// Generates `$name`, a single-vector stride that widens ASCII bytes to
// 16-bit units, built from the given `$load` and `$store` functions.
#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        /// Loads one vector of bytes from `src` with `$load`. If
        /// `simd_is_ascii` accepts it, splits it with `simd_unpack`, stores
        /// the two halves at `dst` and `dst + 8` with `$store` and returns
        /// `true`; otherwise returns `false` without writing to `dst`.
        ///
        /// # Safety
        ///
        /// `src` and `dst` must satisfy the validity and alignment
        /// requirements of the chosen `$load` and `$store` functions.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
            true
        }
    };
}
808
// Generates `$name`, a stride over two consecutive byte vectors that widens
// ASCII to 16-bit units. Both vectors are read with `load16_aligned`;
// `$store` selects how the unpacked halves are written.
#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_double_stride {
    ($name:ident, $store:ident) => {
        /// Processes `2 * SIMD_STRIDE_SIZE` bytes starting at `src`.
        ///
        /// Returns `None` when both vectors were ASCII and have been
        /// unpacked to `dst`. Otherwise returns `Some(n)`, where `n` is the
        /// position of the first non-ASCII byte as computed from
        /// `mask_ascii`. The first vector is always unpacked and stored;
        /// the second is stored unless the non-ASCII byte is in the first
        /// vector.
        ///
        /// # Safety
        ///
        /// `src` must satisfy the requirements of `load16_aligned` for two
        /// consecutive vectors and `dst` those of `$store` for four
        /// consecutive stores.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            let (a, b) = simd_unpack(first);
            $store(dst, a);
            $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
            // One ASCII check covers both vectors on the expected path.
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                let (c, d) = simd_unpack(second);
                $store(dst.add(SIMD_STRIDE_SIZE), c);
                $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            let (c, d) = simd_unpack(second);
            $store(dst.add(SIMD_STRIDE_SIZE), c);
            $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
            None
        }
    };
}
837
// Generates `$name`, a single-vector stride that widens bytes to 16-bit
// units without checking their values, built from the given `$load` and
// `$store` functions.
#[allow(unused_macros)]
macro_rules! unpack_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        /// Loads one vector of bytes from `src` with `$load`, splits it
        /// with `simd_unpack` and stores the two halves at `dst` and
        /// `dst + 8` with `$store`.
        ///
        /// # Safety
        ///
        /// `src` and `dst` must satisfy the validity and alignment
        /// requirements of the chosen `$load` and `$store` functions.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) {
            let simd = $load(src);
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
        }
    };
}
850
// Generates `$name`, a stride that narrows two vectors of 16-bit units to
// one vector of bytes when all units are Basic Latin, built from the given
// `$load` and `$store` functions.
#[allow(unused_macros)]
macro_rules! basic_latin_to_ascii_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        /// Loads two vectors from `src` and `src + 8` with `$load`. If
        /// `simd_is_basic_latin` accepts their bitwise OR, stores
        /// `simd_pack(first, second)` to `dst` with `$store` and returns
        /// `true`; otherwise returns `false` without writing to `dst`.
        ///
        /// # Safety
        ///
        /// `src` and `dst` must satisfy the validity and alignment
        /// requirements of the chosen `$load` and `$store` functions.
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
            let first = $load(src);
            let second = $load(src.add(8));
            if simd_is_basic_latin(first | second) {
                $store(dst, simd_pack(first, second));
                true
            } else {
                false
            }
        }
    };
}
867
// Generates `$name`, a stride that narrows two vectors of 16-bit units to
// one vector of bytes without checking their values, built from the given
// `$load` and `$store` functions.
#[allow(unused_macros)]
macro_rules! pack_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        /// Loads two vectors from `src` and `src + 8` with `$load` and
        /// stores `simd_pack(first, second)` to `dst` with `$store`.
        ///
        /// # Safety
        ///
        /// `src` and `dst` must satisfy the validity and alignment
        /// requirements of the chosen `$load` and `$store` functions.
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) {
            let first = $load(src);
            let second = $load(src.add(8));
            $store(dst, simd_pack(first, second));
        }
    };
}
879
880	cfg_if! {
881	if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
882	// SIMD with the same instructions for aligned and unaligned loads and stores
883
884	pub const SIMD_STRIDE_SIZE: usize = `16`;
885
886	pub const MAX_STRIDE_SIZE: usize = `16`;
887
888	// pub const ALIGNMENT: usize = 8;
889
890	pub const ALU_STRIDE_SIZE: usize = `16`;
891
892	pub const ALU_ALIGNMENT: usize = `8`;
893
894	pub const ALU_ALIGNMENT_MASK: usize = `7`;
895
896	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
897
898	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
899	unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
900
901	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
902	pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
903
904	ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
905	ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
906	ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
907	latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
908	latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
909	} else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
910	// SIMD with different instructions for aligned and unaligned loads and stores.
911	//
912	// Newer microarchitectures are not supposed to have a performance difference between
913	// aligned and unaligned SSE2 loads and stores when the address is actually aligned,
914	// but the benchmark results I see don't agree.
915
916	pub const SIMD_STRIDE_SIZE: usize = `16`;
917
918	pub const MAX_STRIDE_SIZE: usize = `16`;
919
920	pub const SIMD_ALIGNMENT_MASK: usize = `15`;
921
922	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
923	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
924	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
925	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
926
927	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
928	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
929	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
930	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
931
932	unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
933	unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
934	unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
935	unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
936
937	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
938	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
939	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
940	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
941
942	pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
943	pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
944	pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
945	pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
946
947	ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
948	ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
949	ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
950	latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
951	latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
952	} else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
953	// SIMD with different instructions for aligned and unaligned loads and stores.
954	//
955	// Newer microarchitectures are not supposed to have a performance difference between
956	// aligned and unaligned SSE2 loads and stores when the address is actually aligned,
957	// but the benchmark results I see don't agree.
958
959	pub const SIMD_STRIDE_SIZE: usize = `16`;
960
961	pub const SIMD_ALIGNMENT: usize = `16`;
962
963	pub const MAX_STRIDE_SIZE: usize = `16`;
964
965	pub const SIMD_ALIGNMENT_MASK: usize = `15`;
966
967	ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
968	ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);
969
970	ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
971	ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);
972
973	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
974	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
975	ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
976
977	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
978	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
979	ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
980
981	unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
982	unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
983
984	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
985	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
986	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
987	basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
988
989	pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
990	pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
991
992	ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
993	ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);
994
995	ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
996	latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
997	latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
998	} else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
999	// Aligned ALU word, little-endian, 64-bit
1000
// Size and alignment, in bytes, of the 64-bit machine word the ALU paths read.
pub const ALU_ALIGNMENT: usize = 8;

// Address bits that are non-zero when a pointer is not word-aligned.
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

// Bytes of 8-bit input per stride: the stride helpers read two words at a time.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;

pub const MAX_STRIDE_SIZE: usize = 16;
1008
/// Widens the sixteen bytes held in `word` and `second_word` into sixteen
/// zero-extended 16-bit units and stores them as four words at `dst`
/// (little-endian, 64-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing four `usize`
/// values.
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    // Spreads the four low-order bytes of `w` over four 16-bit lanes.
    #[inline(always)]
    fn widen_low(w: usize) -> usize {
        (0x0000_0000_0000_00FFusize & w)
            | ((0x0000_0000_0000_FF00usize & w) << 8)
            | ((0x0000_0000_00FF_0000usize & w) << 16)
            | ((0x0000_0000_FF00_0000usize & w) << 24)
    }

    // Spreads the four high-order bytes of `w` over four 16-bit lanes.
    #[inline(always)]
    fn widen_high(w: usize) -> usize {
        ((0x0000_00FF_0000_0000usize & w) >> 32)
            | ((0x0000_FF00_0000_0000usize & w) >> 24)
            | ((0x00FF_0000_0000_0000usize & w) >> 16)
            | ((0xFF00_0000_0000_0000usize & w) >> 8)
    }

    // On little-endian the low-order bytes come first in memory order.
    dst.write(widen_low(word));
    dst.add(1).write(widen_high(word));
    dst.add(2).write(widen_low(second_word));
    dst.add(3).write(widen_high(second_word));
}
1032
/// Narrows sixteen 16-bit units, held in four words, to sixteen bytes by
/// keeping the low byte of every unit, and stores the result as two words at
/// `dst` (little-endian, 64-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing two `usize`
/// values.
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    // Gathers the low byte of each 16-bit lane into the low half of a word.
    #[inline(always)]
    fn narrow_into_low(w: usize) -> usize {
        (0x0000_0000_0000_00FFusize & w)
            | ((0x0000_0000_00FF_0000usize & w) >> 8)
            | ((0x0000_00FF_0000_0000usize & w) >> 16)
            | ((0x00FF_0000_0000_0000usize & w) >> 24)
    }

    // Gathers the low byte of each 16-bit lane into the high half of a word.
    #[inline(always)]
    fn narrow_into_high(w: usize) -> usize {
        ((0x0000_0000_0000_00FFusize & w) << 32)
            | ((0x0000_0000_00FF_0000usize & w) << 24)
            | ((0x0000_00FF_0000_0000usize & w) << 16)
            | ((0x00FF_0000_0000_0000usize & w) << 8)
    }

    // On little-endian the earlier input word supplies the low-order bytes.
    dst.write(narrow_into_high(second) | narrow_into_low(first));
    dst.add(1).write(narrow_into_high(fourth) | narrow_into_low(third));
}
1054	} else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
1055	// Aligned ALU word, little-endian, 32-bit
1056
// Size and alignment, in bytes, of the 32-bit machine word the ALU paths read.
pub const ALU_ALIGNMENT: usize = 4;

// Address bits that are non-zero when a pointer is not word-aligned.
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

// Bytes of 8-bit input per stride: the stride helpers read two words at a time.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;

pub const MAX_STRIDE_SIZE: usize = 8;
1064
/// Widens the eight bytes held in `word` and `second_word` into eight
/// zero-extended 16-bit units and stores them as four words at `dst`
/// (little-endian, 32-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing four `usize`
/// values.
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    // Spreads the two low-order bytes of `w` over two 16-bit lanes.
    #[inline(always)]
    fn widen_low(w: usize) -> usize {
        (0x0000_00FFusize & w) | ((0x0000_FF00usize & w) << 8)
    }

    // Spreads the two high-order bytes of `w` over two 16-bit lanes.
    #[inline(always)]
    fn widen_high(w: usize) -> usize {
        ((0x00FF_0000usize & w) >> 16) | ((0xFF00_0000usize & w) >> 8)
    }

    // On little-endian the low-order bytes come first in memory order.
    dst.write(widen_low(word));
    dst.add(1).write(widen_high(word));
    dst.add(2).write(widen_low(second_word));
    dst.add(3).write(widen_high(second_word));
}
1080
/// Narrows eight 16-bit units, held in four words, to eight bytes by keeping
/// the low byte of every unit, and stores the result as two words at `dst`
/// (little-endian, 32-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing two `usize`
/// values.
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    // Gathers the low byte of each 16-bit lane into the low half of a word.
    #[inline(always)]
    fn narrow_into_low(w: usize) -> usize {
        (0x0000_00FFusize & w) | ((0x00FF_0000usize & w) >> 8)
    }

    // Gathers the low byte of each 16-bit lane into the high half of a word.
    #[inline(always)]
    fn narrow_into_high(w: usize) -> usize {
        ((0x0000_00FFusize & w) << 16) | ((0x00FF_0000usize & w) << 8)
    }

    // On little-endian the earlier input word supplies the low-order bytes.
    dst.write(narrow_into_high(second) | narrow_into_low(first));
    dst.add(1).write(narrow_into_high(fourth) | narrow_into_low(third));
}
1094	} else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
1095	// Aligned ALU word, big-endian, 64-bit
1096
// Size and alignment, in bytes, of the 64-bit machine word the ALU paths read.
pub const ALU_ALIGNMENT: usize = 8;

// Address bits that are non-zero when a pointer is not word-aligned.
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

// Bytes of 8-bit input per stride: the stride helpers read two words at a time.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;

pub const MAX_STRIDE_SIZE: usize = 16;
1104
/// Widens the sixteen bytes held in `word` and `second_word` into sixteen
/// zero-extended 16-bit units and stores them as four words at `dst`
/// (big-endian, 64-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing four `usize`
/// values.
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    // Spreads the four high-order bytes of `w` over four 16-bit lanes.
    #[inline(always)]
    fn widen_high(w: usize) -> usize {
        ((0x0000_00FF_0000_0000usize & w) >> 32)
            | ((0x0000_FF00_0000_0000usize & w) >> 24)
            | ((0x00FF_0000_0000_0000usize & w) >> 16)
            | ((0xFF00_0000_0000_0000usize & w) >> 8)
    }

    // Spreads the four low-order bytes of `w` over four 16-bit lanes.
    #[inline(always)]
    fn widen_low(w: usize) -> usize {
        (0x0000_0000_0000_00FFusize & w)
            | ((0x0000_0000_0000_FF00usize & w) << 8)
            | ((0x0000_0000_00FF_0000usize & w) << 16)
            | ((0x0000_0000_FF00_0000usize & w) << 24)
    }

    // On big-endian the high-order bytes come first in memory order.
    dst.write(widen_high(word));
    dst.add(1).write(widen_low(word));
    dst.add(2).write(widen_high(second_word));
    dst.add(3).write(widen_low(second_word));
}
1128
/// Narrows sixteen 16-bit units, held in four words, to sixteen bytes by
/// keeping the low byte of every unit, and stores the result as two words at
/// `dst` (big-endian, 64-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing two `usize`
/// values.
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    // Gathers the low byte of each 16-bit lane into the high half of a word.
    #[inline(always)]
    fn narrow_into_high(w: usize) -> usize {
        ((0x0000_0000_0000_00FFusize & w) << 32)
            | ((0x0000_0000_00FF_0000usize & w) << 24)
            | ((0x0000_00FF_0000_0000usize & w) << 16)
            | ((0x00FF_0000_0000_0000usize & w) << 8)
    }

    // Gathers the low byte of each 16-bit lane into the low half of a word.
    #[inline(always)]
    fn narrow_into_low(w: usize) -> usize {
        (0x0000_0000_0000_00FFusize & w)
            | ((0x0000_0000_00FF_0000usize & w) >> 8)
            | ((0x0000_00FF_0000_0000usize & w) >> 16)
            | ((0x00FF_0000_0000_0000usize & w) >> 24)
    }

    // On big-endian the earlier input word supplies the high-order bytes.
    dst.write(narrow_into_high(first) | narrow_into_low(second));
    dst.add(1).write(narrow_into_high(third) | narrow_into_low(fourth));
}
1150	} else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
1151	// Aligned ALU word, big-endian, 32-bit
1152
// Size and alignment, in bytes, of the 32-bit machine word the ALU paths read.
pub const ALU_ALIGNMENT: usize = 4;

// Address bits that are non-zero when a pointer is not word-aligned.
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

// Bytes of 8-bit input per stride: the stride helpers read two words at a time.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;

pub const MAX_STRIDE_SIZE: usize = 8;
1160
/// Widens the eight bytes held in `word` and `second_word` into eight
/// zero-extended 16-bit units and stores them as four words at `dst`
/// (big-endian, 32-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing four `usize`
/// values.
#[inline(always)]
unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
    // Spreads the two high-order bytes of `w` over two 16-bit lanes.
    #[inline(always)]
    fn widen_high(w: usize) -> usize {
        ((0x00FF_0000usize & w) >> 16) | ((0xFF00_0000usize & w) >> 8)
    }

    // Spreads the two low-order bytes of `w` over two 16-bit lanes.
    #[inline(always)]
    fn widen_low(w: usize) -> usize {
        (0x0000_00FFusize & w) | ((0x0000_FF00usize & w) << 8)
    }

    // On big-endian the high-order bytes come first in memory order.
    dst.write(widen_high(word));
    dst.add(1).write(widen_low(word));
    dst.add(2).write(widen_high(second_word));
    dst.add(3).write(widen_low(second_word));
}
1176
/// Narrows eight 16-bit units, held in four words, to eight bytes by keeping
/// the low byte of every unit, and stores the result as two words at `dst`
/// (big-endian, 32-bit layout).
///
/// # Safety
///
/// `dst` must be aligned for `usize` and valid for writing two `usize`
/// values.
#[inline(always)]
unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
    // Gathers the low byte of each 16-bit lane into the high half of a word.
    #[inline(always)]
    fn narrow_into_high(w: usize) -> usize {
        ((0x0000_00FFusize & w) << 16) | ((0x00FF_0000usize & w) << 8)
    }

    // Gathers the low byte of each 16-bit lane into the low half of a word.
    #[inline(always)]
    fn narrow_into_low(w: usize) -> usize {
        (0x0000_00FFusize & w) | ((0x00FF_0000usize & w) >> 8)
    }

    // On big-endian the earlier input word supplies the high-order bytes.
    dst.write(narrow_into_high(first) | narrow_into_low(second));
    dst.add(1).write(narrow_into_high(third) | narrow_into_low(fourth));
}
1190	} else {
1191	ascii_naive!(ascii_to_ascii, u8, u8);
1192	ascii_naive!(ascii_to_basic_latin, u8, u16);
1193	ascii_naive!(basic_latin_to_ascii, u16, u8);
1194	}
1195	}
1196
1197	cfg_if! {
1198	if #[cfg(target_endian = "little")] {
/// Counts the zero bits below the lowest set bit of `word` (little-endian
/// variant); yields the full bit width of `usize` when `word` is zero.
#[allow(dead_code)]
#[inline(always)]
fn count_zeros(word: usize) -> u32 {
    usize::trailing_zeros(word)
}
1204	} else {
/// Counts the zero bits above the highest set bit of `word` (big-endian
/// variant); yields the full bit width of `usize` when `word` is zero.
#[allow(dead_code)]
#[inline(always)]
fn count_zeros(word: usize) -> u32 {
    usize::leading_zeros(word)
}
1210	}
1211	}
1212
1213	cfg_if! {
1214	if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
1215	#[inline(always)]
1216	pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1217	let src = slice.as_ptr();
1218	let len = slice.len();
1219	let mut offset = `0usize`;
1220	if SIMD_STRIDE_SIZE <= len {
1221	let len_minus_stride = len - SIMD_STRIDE_SIZE;
1222	loop {
1223	let simd = unsafe { load16_unaligned(src.add(offset)) };
1224	if !simd_is_ascii(simd) {
1225	break;
1226	}
1227	offset += SIMD_STRIDE_SIZE;
1228	if offset > len_minus_stride {
1229	break;
1230	}
1231	}
1232	}
1233	while offset < len {
1234	let code_unit = slice[offset];
1235	if code_unit > `127` {
1236	return Some((code_unit, offset));
1237	}
1238	offset += `1`;
1239	}
1240	None
1241	}
1242	} else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
1243	#[inline(always)]
1244	pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1245	let src = slice.as_ptr();
1246	let len = slice.len();
1247	let mut offset = `0usize`;
1248	if SIMD_STRIDE_SIZE <= len {
1249	// First, process one unaligned vector
1250	let simd = unsafe { load16_unaligned(src) };
1251	let mask = mask_ascii(simd);
1252	if mask != `0` {
1253	offset = mask.trailing_zeros() as usize;
1254	let non_ascii = unsafe { *src.add(offset) };
1255	return Some((non_ascii, offset));
1256	}
1257	offset = SIMD_STRIDE_SIZE;
1258
1259	// We have now seen 16 ASCII bytes. Let's guess that
1260	// there will be enough more to justify more expense
1261	// in the case of non-ASCII.
1262	// Use aligned reads for the sake of old microachitectures.
1263	let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
1264	// This addition won't overflow, because even in the 32-bit PAE case the
1265	// address space holds enough code that the slice length can't be that
1266	// close to address space size.
1267	// offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
1268	if until_alignment + (SIMD_STRIDE_SIZE * `3`) <= len {
1269	if until_alignment != `0` {
1270	let simd = unsafe { load16_unaligned(src.add(offset)) };
1271	let mask = mask_ascii(simd);
1272	if mask != `0` {
1273	offset += mask.trailing_zeros() as usize;
1274	let non_ascii = unsafe { *src.add(offset) };
1275	return Some((non_ascii, offset));
1276	}
1277	offset += until_alignment;
1278	}
1279	let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * `2`);
1280	loop {
1281	let first = unsafe { load16_aligned(src.add(offset)) };
1282	let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
1283	if !simd_is_ascii(first \| second) {
1284	let mask_first = mask_ascii(first);
1285	if mask_first != `0` {
1286	offset += mask_first.trailing_zeros() as usize;
1287	} else {
1288	let mask_second = mask_ascii(second);
1289	offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
1290	}
1291	let non_ascii = unsafe { *src.add(offset) };
1292	return Some((non_ascii, offset));
1293	}
1294	offset += SIMD_STRIDE_SIZE * `2`;
1295	if offset > len_minus_stride_times_two {
1296	break;
1297	}
1298	}
1299	if offset + SIMD_STRIDE_SIZE <= len {
1300	let simd = unsafe { load16_aligned(src.add(offset)) };
1301	let mask = mask_ascii(simd);
1302	if mask != `0` {
1303	offset += mask.trailing_zeros() as usize;
1304	let non_ascii = unsafe { *src.add(offset) };
1305	return Some((non_ascii, offset));
1306	}
1307	offset += SIMD_STRIDE_SIZE;
1308	}
1309	} else {
1310	// At most two iterations, so unroll
1311	if offset + SIMD_STRIDE_SIZE <= len {
1312	let simd = unsafe { load16_unaligned(src.add(offset)) };
1313	let mask = mask_ascii(simd);
1314	if mask != `0` {
1315	offset += mask.trailing_zeros() as usize;
1316	let non_ascii = unsafe { *src.add(offset) };
1317	return Some((non_ascii, offset));
1318	}
1319	offset += SIMD_STRIDE_SIZE;
1320	if offset + SIMD_STRIDE_SIZE <= len {
1321	let simd = unsafe { load16_unaligned(src.add(offset)) };
1322	let mask = mask_ascii(simd);
1323	if mask != `0` {
1324	offset += mask.trailing_zeros() as usize;
1325	let non_ascii = unsafe { *src.add(offset) };
1326	return Some((non_ascii, offset));
1327	}
1328	offset += SIMD_STRIDE_SIZE;
1329	}
1330	}
1331	}
1332	}
1333	while offset < len {
1334	let code_unit = unsafe { *(src.add(offset)) };
1335	if code_unit > `127` {
1336	return Some((code_unit, offset));
1337	}
1338	offset += `1`;
1339	}
1340	None
1341	}
1342	} else {
1343	#[inline(always)]
1344	fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
1345	let word_masked = word & ASCII_MASK;
1346	let second_masked = second_word & ASCII_MASK;
1347	if (word_masked \| second_masked) == `0` {
1348	return None;
1349	}
1350	if word_masked != `0` {
1351	let zeros = count_zeros(word_masked);
1352	// `zeros` now contains 7 (for the seven bits of non-ASCII)
1353	// plus 8 times the number of ASCII in text order before the
1354	// non-ASCII byte in the little-endian case or 8 times the number of ASCII in
1355	// text order before the non-ASCII byte in the big-endian case.
1356	let num_ascii = (zeros >> `3`) as usize;
1357	return Some(num_ascii);
1358	}
1359	let zeros = count_zeros(second_masked);
1360	// `zeros` now contains 7 (for the seven bits of non-ASCII)
1361	// plus 8 times the number of ASCII in text order before the
1362	// non-ASCII byte in the little-endian case or 8 times the number of ASCII in
1363	// text order before the non-ASCII byte in the big-endian case.
1364	let num_ascii = (zeros >> `3`) as usize;
1365	Some(ALU_ALIGNMENT + num_ascii)
1366	}
1367
1368	#[inline(always)]
1369	unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
1370	let word = *src;
1371	let second_word = *(src.add(`1`));
1372	find_non_ascii(word, second_word)
1373	}
1374
1375	#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
1376	#[inline(always)]
1377	pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1378	let src = slice.as_ptr();
1379	let len = slice.len();
1380	let mut offset = `0usize`;
1381	let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
1382	if until_alignment + ALU_STRIDE_SIZE <= len {
1383	while until_alignment != `0` {
1384	let code_unit = slice[offset];
1385	if code_unit > `127` {
1386	return Some((code_unit, offset));
1387	}
1388	offset += `1`;
1389	until_alignment -= `1`;
1390	}
1391	let len_minus_stride = len - ALU_STRIDE_SIZE;
1392	loop {
1393	let ptr = unsafe { src.add(offset) as *const usize };
1394	if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
1395	offset += num_ascii;
1396	return Some((unsafe { *(src.add(offset)) }, offset));
1397	}
1398	offset += ALU_STRIDE_SIZE;
1399	if offset > len_minus_stride {
1400	break;
1401	}
1402	}
1403	}
1404	while offset < len {
1405	let code_unit = slice[offset];
1406	if code_unit > `127` {
1407	return Some((code_unit, offset));
1408	}
1409	offset += `1`;
1410	}
1411	None
1412	}
1413
1414	}
1415	}
1416
1417	cfg_if! {
1418	if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {
1419
1420	} else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
1421	// Even with NEON enabled, we use the ALU path for ASCII validation, because testing
1422	// on Exynos 5 indicated that using NEON isn't worthwhile where there are only
1423	// vector reads without vector writes.
1424
// Size and alignment, in bytes, of the machine word read by the ALU-based
// ASCII validation.
pub const ALU_ALIGNMENT: usize = 4;

// Address bits that are non-zero when a pointer is not word-aligned.
pub const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;

// Bytes per stride: validation reads two words at a time.
pub const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT;
1430	} else {
1431	#[inline(always)]
1432	unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1433	let word = *src;
1434	let second_word = *(src.add(`1`));
1435	unpack_alu(word, second_word, dst);
1436	}
1437
1438	#[inline(always)]
1439	unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1440	let first = *src;
1441	let second = *(src.add(`1`));
1442	let third = *(src.add(`2`));
1443	let fourth = *(src.add(`3`));
1444	pack_alu(first, second, third, fourth, dst);
1445	}
1446
1447	#[inline(always)]
1448	unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1449	let word = *src;
1450	let second_word = *(src.add(`1`));
1451	// Check if the words contains non-ASCII
1452	if (word & ASCII_MASK) \| (second_word & ASCII_MASK) != `0` {
1453	return `false`;
1454	}
1455	unpack_alu(word, second_word, dst);
1456	`true`
1457	}
1458
1459	#[inline(always)]
1460	unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1461	let first = *src;
1462	let second = *(src.add(`1`));
1463	let third = *(src.add(`2`));
1464	let fourth = *(src.add(`3`));
1465	if (first & BASIC_LATIN_MASK) \| (second & BASIC_LATIN_MASK) \| (third & BASIC_LATIN_MASK) \| (fourth & BASIC_LATIN_MASK) != `0` {
1466	return `false`;
1467	}
1468	pack_alu(first, second, third, fourth, dst);
1469	`true`
1470	}
1471
1472	#[inline(always)]
1473	unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
1474	let word = *src;
1475	let second_word = *(src.add(`1`));
1476	*dst = word;
1477	*(dst.add(`1`)) = second_word;
1478	find_non_ascii(word, second_word)
1479	}
1480
1481	basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
1482	basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
1483	latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
1484	latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
1485	ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
1486	}
1487	}
1488
1489	pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
1490	match validate_ascii(slice:bytes) {
1491	None => bytes.len(),
1492	Some((_, num_valid: usize)) => num_valid,
1493	}
1494	}
1495
/// Returns the index of the first byte that is either at or above 0x80 or is
/// one of 0x1B, 0x0E and 0x0F, or `bytes.len()` when no such byte is present.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&b| b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F)
        .unwrap_or(bytes.len())
}
1505
1506	// Any copyright to the test code below this comment is dedicated to the
1507	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
1508
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    // Generates a test that plants a single 0xAA unit at each of the 32
    // positions of an otherwise ASCII buffer and checks that the conversion
    // function reports that unit and its index, and that the units before it
    // arrived in the destination.
    macro_rules! test_ascii {
        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
            #[test]
            fn $test_name() {
                for bad_index in 0..32usize {
                    let src: Vec<$src_unit> = (0..32usize)
                        .map(|j| if j == bad_index { 0xAA } else { j + 0x40 })
                        .map(|c| c as $src_unit)
                        .collect();
                    let mut dst: Vec<$dst_unit> = Vec::with_capacity(32);
                    dst.resize(32, 0);
                    let outcome = unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) };
                    let (non_ascii, num_ascii) = match outcome {
                        Some(pair) => pair,
                        None => unreachable!("Should always find non-ASCII"),
                    };
                    assert_eq!(non_ascii, 0xAA);
                    assert_eq!(num_ascii, bad_index);
                    for (j, &unit) in dst[..bad_index].iter().enumerate() {
                        assert_eq!(unit, (j + 0x40) as $dst_unit);
                    }
                }
            }
        };
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
}
1547