mem.rs source code [crates/encoding_rs-0.8.32/src/mem.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
//! Functions for converting between different in-RAM representations of text
//! and for quickly checking if the Unicode Bidirectional Algorithm can be
//! avoided.
//!
//! By using slices for output, the functions here seek to enable by-register
//! (ALU register or SIMD register as available) operations in order to
//! outperform iterator-based conversions available in the Rust standard
//! library.
//!
//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
//! in-memory encoding is sometimes used as a storage optimization of text
//! when UTF-16 indexing and length semantics are exposed.
//!
//! The FFI bindings for this module are in the
//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).

26
27	#[cfg(feature = "alloc")]
28	use alloc::borrow::Cow;
29	#[cfg(feature = "alloc")]
30	use alloc::string::String;
31	#[cfg(feature = "alloc")]
32	use alloc::vec::Vec;
33
34	use super::in_inclusive_range16;
35	use super::in_inclusive_range32;
36	use super::in_inclusive_range8;
37	use super::in_range16;
38	use super::in_range32;
39	use super::DecoderResult;
40	use crate::ascii::*;
41	use crate::utf_8::*;
42
43	macro_rules! non_fuzz_debug_assert {
44	($($arg:tt)) => (if !cfg!(fuzzing) { debug_assert!($($arg)); })
45	}
46
47	cfg_if! {
48	if #[cfg(feature = "simd-accel")] {
49	use ::core::intrinsics::likely;
50	use ::core::intrinsics::unlikely;
51	} else {
52	#[inline(always)]
53	fn likely(b: bool) -> bool {
54	b
55	}
56	#[inline(always)]
57	fn unlikely(b: bool) -> bool {
58	b
59	}
60	}
61	}
62
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is `#[repr(C)]` with explicit discriminants.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
78
// Mask selecting the high byte of every 16-bit lane in an ALU word; a non-zero
// result after masking means some UTF-16 code unit is above U+00FF.
// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
82
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff no
// code unit in `buffer` has any bit of `$mask` set (every unit is below
// `$bound`). The middle of the buffer is checked one aligned ALU word at a
// time (four words per iteration while possible); the unaligned head and the
// tail are checked unit by unit.
//
// The expansion relies on `ALU_ALIGNMENT`, `ALU_ALIGNMENT_MASK` and the
// constant named by `$mask` being in scope at the expansion site.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units before `src` reaches ALU alignment.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // SAFETY: `offset <= len_minus_unroll` holds here, so the
                            // four consecutive words read below lie inside `buffer`,
                            // and `src.add(offset)` is ALU-aligned because `offset`
                            // started at the alignment boundary and advances in
                            // whole words.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len_minus_stride` leaves one whole
                        // aligned word inside `buffer` at `offset`.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for word reads).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
155
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// every code unit in `buffer` is below `$bound`. The aligned middle of the
// buffer is loaded as `$simd_ty` vectors (four per iteration while possible),
// OR-ed together and tested with `$func`; the unaligned head and the tail are
// checked unit by unit. `$splat` is the all-zero starting accumulator.
//
// The expansion relies on `SIMD_STRIDE_SIZE`, `SIMD_ALIGNMENT` and
// `SIMD_ALIGNMENT_MASK` being in scope at the expansion site.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units before `src` reaches SIMD alignment.
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // SAFETY: `offset <= len_minus_unroll` holds here, so the
                            // four consecutive vectors read below lie inside
                            // `buffer`, and `src.add(offset)` is SIMD-aligned because
                            // `offset` started at the alignment boundary and
                            // advances in whole strides.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len_minus_stride` leaves one whole
                        // aligned vector inside `buffer` at `offset`.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for vector reads).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
233
234	cfg_if! {
235	if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
236	use crate::simd_funcs::*;
237	use packed_simd::u8x16;
238	use packed_simd::u16x8;
239
240	const SIMD_ALIGNMENT: usize = `16`;
241
242	const SIMD_ALIGNMENT_MASK: usize = `15`;
243
244	by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(`0`), u8x16, `0x80`, simd_is_ascii);
245	by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(`0`), u16x8, `0x80`, simd_is_basic_latin);
246	by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(`0`), u16x8, `0x100`, simd_is_latin1);
247
248	#[inline(always)]
249	fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
250	// This function is a mess, because it simultaneously tries to do
251	// only aligned SIMD (perhaps misguidedly) and needs to deal with
252	// the last code unit in a SIMD stride being part of a valid
253	// surrogate pair.
254	let unit_size = ::core::mem::size_of::<u16>();
255	let src = buffer.as_ptr();
256	let len = buffer.len();
257	let mut offset = `0usize`;
258	'outer: loop {
259	let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
260	SIMD_ALIGNMENT_MASK) / unit_size;
261	if until_alignment == `0` {
262	if offset + SIMD_STRIDE_SIZE / unit_size > len {
263	break;
264	}
265	} else {
266	let offset_plus_until_alignment = offset + until_alignment;
267	let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + `1`;
268	if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
269	break;
270	}
271	let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
272	if up_to < until_alignment {
273	return offset + up_to;
274	}
275	if last_valid_low {
276	offset = offset_plus_until_alignment_plus_one;
277	continue;
278	}
279	offset = offset_plus_until_alignment;
280	}
281	let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
282	loop {
283	let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
284	if contains_surrogates(unsafe { (src.add(offset) as const u16x8) }) {
285	if offset_plus_stride == len {
286	break 'outer;
287	}
288	let offset_plus_stride_plus_one = offset_plus_stride + `1`;
289	let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
290	if up_to < SIMD_STRIDE_SIZE / unit_size {
291	return offset + up_to;
292	}
293	if last_valid_low {
294	offset = offset_plus_stride_plus_one;
295	continue 'outer;
296	}
297	}
298	offset = offset_plus_stride;
299	if offset > len_minus_stride {
300	break 'outer;
301	}
302	}
303	}
304	let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
305	offset + up_to
306	}
307	} else {
308	by_unit_check_alu!(is_ascii_impl, u8, `0x80`, ASCII_MASK);
309	by_unit_check_alu!(is_basic_latin_impl, u16, `0x80`, BASIC_LATIN_MASK);
310	by_unit_check_alu!(is_utf16_latin1_impl, u16, `0x100`, LATIN1_MASK);
311
312	#[inline(always)]
313	fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
314	let (up_to, _) = utf16_valid_up_to_alu(buffer);
315	up_to
316	}
317	}
318	}
319
/// Scans `buffer` one code unit at a time and returns the length of the
/// longest valid UTF-16 prefix, i.e. the index of the first unpaired
/// surrogate, or `buffer.len()` if there is none.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    if len == 0 {
        return (0, false);
    }
    let mut offset = 0usize;
    loop {
        let unit = buffer[offset];
        let next = offset + 1;
        // Wrapping subtraction turns the surrogate range test into a single
        // unsigned comparison.
        let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
        if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
            // Not a surrogate
            offset = next;
            if offset == len {
                return (offset, false);
            }
            continue;
        }
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // high surrogate
            if next < len {
                let second = buffer[next];
                let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
                    // The next code unit is a low surrogate. Advance position.
                    offset = next + 1;
                    if offset == len {
                        return (offset, true);
                    }
                    continue;
                }
                // The next code unit is not a low surrogate. Don't advance
                // position and treat the high surrogate as unpaired.
                // fall through
            }
            // Unpaired, fall through
        }
        // Unpaired surrogate
        return (offset, false);
    }
}
365
366	cfg_if! {
367	if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
368	#[inline(always)]
369	fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
370	let mut offset = `0usize`;
371	let bytes = buffer.as_bytes();
372	let len = bytes.len();
373	if len >= SIMD_STRIDE_SIZE {
374	let src = bytes.as_ptr();
375	let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
376	SIMD_ALIGNMENT_MASK;
377	if until_alignment + SIMD_STRIDE_SIZE <= len {
378	while until_alignment != `0` {
379	if bytes[offset] > `0xC3` {
380	return Some(offset);
381	}
382	offset += `1`;
383	until_alignment -= `1`;
384	}
385	let len_minus_stride = len - SIMD_STRIDE_SIZE;
386	loop {
387	if !simd_is_str_latin1(unsafe { (src.add(offset) as const u8x16) }) {
388	// TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
389	while bytes[offset] & `0xC0` == `0x80` {
390	offset += `1`;
391	}
392	return Some(offset);
393	}
394	offset += SIMD_STRIDE_SIZE;
395	if offset > len_minus_stride {
396	break;
397	}
398	}
399	}
400	}
401	for i in offset..len {
402	if bytes[i] > `0xC3` {
403	return Some(i);
404	}
405	}
406	None
407	}
408	} else {
409	#[inline(always)]
410	fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
411	let mut bytes = buffer.as_bytes();
412	let mut total = `0`;
413	loop {
414	if let Some((byte, offset)) = validate_ascii(bytes) {
415	total += offset;
416	if byte > `0xC3` {
417	return Some(total);
418	}
419	bytes = &bytes[offset + `2`..];
420	total += `2`;
421	} else {
422	return None;
423	}
424	}
425	}
426	}
427	}
428
429	#[inline(always)]
430	fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
431	let mut bytes: &[u8] = buffer;
432	let mut total: usize = `0`;
433	loop {
434	if let Some((byte: u8, offset: usize)) = validate_ascii(slice:bytes) {
435	total += offset;
436	if in_inclusive_range8(i:byte, start:`0xC2`, end:`0xC3`) {
437	let next: usize = offset + `1`;
438	if next == bytes.len() {
439	return Some(total);
440	}
441	if bytes[next] & `0xC0` != `0x80` {
442	return Some(total);
443	}
444	bytes = &bytes[offset + `2`..];
445	total += `2`;
446	} else {
447	return Some(total);
448	}
449	} else {
450	return None;
451	}
452	}
453	}
454
455	cfg_if! {
456	if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
457	#[inline(always)]
458	fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
459	let mut offset = `0usize`;
460	let len = buffer.len();
461	if len >= SIMD_STRIDE_SIZE / `2` {
462	let src = buffer.as_ptr();
463	let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
464	SIMD_ALIGNMENT_MASK) / `2`;
465	if until_alignment + (SIMD_STRIDE_SIZE / `2`) <= len {
466	while until_alignment != `0` {
467	if is_utf16_code_unit_bidi(buffer[offset]) {
468	return `true`;
469	}
470	offset += `1`;
471	until_alignment -= `1`;
472	}
473	let len_minus_stride = len - (SIMD_STRIDE_SIZE / `2`);
474	loop {
475	if is_u16x8_bidi(unsafe { (src.add(offset) as const u16x8) }) {
476	return `true`;
477	}
478	offset += SIMD_STRIDE_SIZE / `2`;
479	if offset > len_minus_stride {
480	break;
481	}
482	}
483	}
484	}
485	for &u in &buffer[offset..] {
486	if is_utf16_code_unit_bidi(u) {
487	return `true`;
488	}
489	}
490	`false`
491	}
492	} else {
493	#[inline(always)]
494	fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
495	for &u in buffer {
496	if is_utf16_code_unit_bidi(u) {
497	return `true`;
498	}
499	}
500	`false`
501	}
502	}
503	}
504
505	cfg_if! {
506	if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
507	#[inline(always)]
508	fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
509	let mut offset = `0usize`;
510	let len = buffer.len();
511	if len >= SIMD_STRIDE_SIZE / `2` {
512	let src = buffer.as_ptr();
513	let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
514	SIMD_ALIGNMENT_MASK) / `2`;
515	if until_alignment + (SIMD_STRIDE_SIZE / `2`) <= len {
516	while until_alignment != `0` {
517	if buffer[offset] > `0xFF` {
518	// This transition isn't optimal, since the aligment is recomputing
519	// but not tweaking further today.
520	if is_utf16_bidi_impl(&buffer[offset..]) {
521	return Latin1Bidi::Bidi;
522	}
523	return Latin1Bidi::LeftToRight;
524	}
525	offset += `1`;
526	until_alignment -= `1`;
527	}
528	let len_minus_stride = len - (SIMD_STRIDE_SIZE / `2`);
529	loop {
530	let mut s = unsafe { (src.add(offset) as const u16x8) };
531	if !simd_is_latin1(s) {
532	loop {
533	if is_u16x8_bidi(s) {
534	return Latin1Bidi::Bidi;
535	}
536	offset += SIMD_STRIDE_SIZE / `2`;
537	if offset > len_minus_stride {
538	for &u in &buffer[offset..] {
539	if is_utf16_code_unit_bidi(u) {
540	return Latin1Bidi::Bidi;
541	}
542	}
543	return Latin1Bidi::LeftToRight;
544	}
545	s = unsafe { (src.add(offset) as const u16x8) };
546	}
547	}
548	offset += SIMD_STRIDE_SIZE / `2`;
549	if offset > len_minus_stride {
550	break;
551	}
552	}
553	}
554	}
555	let mut iter = (&buffer[offset..]).iter();
556	loop {
557	if let Some(&u) = iter.next() {
558	if u > `0xFF` {
559	let mut inner_u = u;
560	loop {
561	if is_utf16_code_unit_bidi(inner_u) {
562	return Latin1Bidi::Bidi;
563	}
564	if let Some(&code_unit) = iter.next() {
565	inner_u = code_unit;
566	} else {
567	return Latin1Bidi::LeftToRight;
568	}
569	}
570	}
571	} else {
572	return Latin1Bidi::Latin1;
573	}
574	}
575	}
576	} else {
577	#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
578	#[inline(always)]
579	fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
580	let mut offset = `0usize`;
581	let len = buffer.len();
582	if len >= ALU_ALIGNMENT / `2` {
583	let src = buffer.as_ptr();
584	let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
585	ALU_ALIGNMENT_MASK) / `2`;
586	if until_alignment + ALU_ALIGNMENT / `2` <= len {
587	while until_alignment != `0` {
588	if buffer[offset] > `0xFF` {
589	if is_utf16_bidi_impl(&buffer[offset..]) {
590	return Latin1Bidi::Bidi;
591	}
592	return Latin1Bidi::LeftToRight;
593	}
594	offset += `1`;
595	until_alignment -= `1`;
596	}
597	let len_minus_stride = len - ALU_ALIGNMENT / `2`;
598	loop {
599	if unsafe { (src.add(offset) as const usize) } & LATIN1_MASK != `0` {
600	if is_utf16_bidi_impl(&buffer[offset..]) {
601	return Latin1Bidi::Bidi;
602	}
603	return Latin1Bidi::LeftToRight;
604	}
605	offset += ALU_ALIGNMENT / `2`;
606	if offset > len_minus_stride {
607	break;
608	}
609	}
610	}
611	}
612	let mut iter = (&buffer[offset..]).iter();
613	loop {
614	if let Some(&u) = iter.next() {
615	if u > `0xFF` {
616	let mut inner_u = u;
617	loop {
618	if is_utf16_code_unit_bidi(inner_u) {
619	return Latin1Bidi::Bidi;
620	}
621	if let Some(&code_unit) = iter.next() {
622	inner_u = code_unit;
623	} else {
624	return Latin1Bidi::LeftToRight;
625	}
626	}
627	}
628	} else {
629	return Latin1Bidi::Latin1;
630	}
631	}
632	}
633	}
634	}
635
636	/// Checks whether the buffer is all-ASCII.
637	///
638	/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
639	/// is not guaranteed to fail fast.)
640	pub fn is_ascii(buffer: &[u8]) -> bool {
641	is_ascii_impl(buffer)
642	}
643
644	/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
645	/// only ASCII characters).
646	///
647	/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
648	/// is not guaranteed to fail fast.)
649	pub fn is_basic_latin(buffer: &[u16]) -> bool {
650	is_basic_latin_impl(buffer)
651	}
652
653	/// Checks whether the buffer is valid UTF-8 representing only code points
654	/// less than or equal to U+00FF.
655	///
656	/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
657	/// invalidity or code points above U+00FF are discovered.
658	pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
659	is_utf8_latin1_impl(buffer).is_none()
660	}
661
662	/// Checks whether the buffer represents only code points less than or equal
663	/// to U+00FF.
664	///
665	/// Fails fast. (I.e. returns before having read the whole buffer if code
666	/// points above U+00FF are discovered.
667	pub fn is_str_latin1(buffer: &str) -> bool {
668	is_str_latin1_impl(buffer).is_none()
669	}
670
671	/// Checks whether the buffer represents only code point less than or equal
672	/// to U+00FF.
673	///
674	/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
675	/// is not guaranteed to fail fast.)
676	pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
677	is_utf16_latin1_impl(buffer)
678	}
679
680	/// Checks whether a potentially-invalid UTF-8 buffer contains code points
681	/// that trigger right-to-left processing.
682	///
683	/// The check is done on a Unicode block basis without regard to assigned
684	/// vs. unassigned code points in the block. Hebrew presentation forms in
685	/// the Alphabetic Presentation Forms block are treated as if they formed
686	/// a block on their own (i.e. it treated as right-to-left). Additionally,
687	/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
688	/// for. Control characters that are technically bidi controls but do not
689	/// cause right-to-left behavior without the presence of right-to-left
690	/// characters or right-to-left controls are not checked for. As a special
691	/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
692	///
693	/// Returns `true` if the input is invalid UTF-8 or the input contains an
694	/// RTL character. Returns `false` if the input is valid UTF-8 and contains
695	/// no RTL characters.
696	#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
697	#[inline]
698	pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
699	// As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
700	// than UTF-8 validation followed by `is_str_bidi()` for German,
701	// Russian and Japanese. However, this is considerably slower for Thai.
702	// Chances are that the compiler makes some branch predictions that are
703	// unfortunate for Thai. Not spending the time to manually optimize
704	// further at this time, since it's unclear if this variant even has
705	// use cases. However, this is worth revisiting once Rust gets the
706	// ability to annotate relative priorities of match arms.
707
708	// U+058F: D6 8F
709	// U+0590: D6 90
710	// U+08FF: E0 A3 BF
711	// U+0900: E0 A4 80
712	//
713	// U+200F: E2 80 8F
714	// U+202B: E2 80 AB
715	// U+202E: E2 80 AE
716	// U+2067: E2 81 A7
717	//
718	// U+FB1C: EF AC 9C
719	// U+FB1D: EF AC 9D
720	// U+FDFF: EF B7 BF
721	// U+FE00: EF B8 80
722	//
723	// U+FE6F: EF B9 AF
724	// U+FE70: EF B9 B0
725	// U+FEFE: EF BB BE
726	// U+FEFF: EF BB BF
727	//
728	// U+107FF: F0 90 9F BF
729	// U+10800: F0 90 A0 80
730	// U+10FFF: F0 90 BF BF
731	// U+11000: F0 91 80 80
732	//
733	// U+1E7FF: F0 9E 9F BF
734	// U+1E800: F0 9E A0 80
735	// U+1EFFF: F0 9E BF BF
736	// U+1F000: F0 9F 80 80
737	let mut src = buffer;
738	'outer: loop {
739	if let Some((mut byte, mut read)) = validate_ascii(src) {
740	// Check for the longest sequence to avoid checking twice for the
741	// multi-byte sequences.
742	if read + `4` <= src.len() {
743	'inner: loop {
744	// At this point, `byte` is not included in `read`.
745	match byte {
746	`0`..=`0x7F` => {
747	// ASCII: go back to SIMD.
748	read += `1`;
749	src = &src[read..];
750	continue 'outer;
751	}
752	`0xC2`..=`0xD5` => {
753	// Two-byte
754	let second = unsafe { *(src.get_unchecked(read + `1`)) };
755	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
756	return `true`;
757	}
758	read += `2`;
759	}
760	`0xD6` => {
761	// Two-byte
762	let second = unsafe { *(src.get_unchecked(read + `1`)) };
763	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
764	return `true`;
765	}
766	// XXX consider folding the above and below checks
767	if second > `0x8F` {
768	return `true`;
769	}
770	read += `2`;
771	}
772	// two-byte starting with 0xD7 and above is bidi
773	`0xE1` \| `0xE3`..=`0xEC` \| `0xEE` => {
774	// Three-byte normal
775	let second = unsafe { *(src.get_unchecked(read + `1`)) };
776	let third = unsafe { *(src.get_unchecked(read + `2`)) };
777	if ((UTF8_DATA.table[usize::from(second)]
778	& unsafe {
779	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
780	})
781	\| (third >> `6`))
782	!= `2`
783	{
784	return `true`;
785	}
786	read += `3`;
787	}
788	`0xE2` => {
789	// Three-byte normal, potentially bidi
790	let second = unsafe { *(src.get_unchecked(read + `1`)) };
791	let third = unsafe { *(src.get_unchecked(read + `2`)) };
792	if ((UTF8_DATA.table[usize::from(second)]
793	& unsafe {
794	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
795	})
796	\| (third >> `6`))
797	!= `2`
798	{
799	return `true`;
800	}
801	if second == `0x80` {
802	if third == `0x8F` \|\| third == `0xAB` \|\| third == `0xAE` {
803	return `true`;
804	}
805	} else if second == `0x81` {
806	if third == `0xA7` {
807	return `true`;
808	}
809	}
810	read += `3`;
811	}
812	`0xEF` => {
813	// Three-byte normal, potentially bidi
814	let second = unsafe { *(src.get_unchecked(read + `1`)) };
815	let third = unsafe { *(src.get_unchecked(read + `2`)) };
816	if ((UTF8_DATA.table[usize::from(second)]
817	& unsafe {
818	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
819	})
820	\| (third >> `6`))
821	!= `2`
822	{
823	return `true`;
824	}
825	if in_inclusive_range8(second, `0xAC`, `0xB7`) {
826	if second == `0xAC` {
827	if third > `0x9C` {
828	return `true`;
829	}
830	} else {
831	return `true`;
832	}
833	} else if in_inclusive_range8(second, `0xB9`, `0xBB`) {
834	if second == `0xB9` {
835	if third > `0xAF` {
836	return `true`;
837	}
838	} else if second == `0xBB` {
839	if third != `0xBF` {
840	return `true`;
841	}
842	} else {
843	return `true`;
844	}
845	}
846	read += `3`;
847	}
848	`0xE0` => {
849	// Three-byte special lower bound, potentially bidi
850	let second = unsafe { *(src.get_unchecked(read + `1`)) };
851	let third = unsafe { *(src.get_unchecked(read + `2`)) };
852	if ((UTF8_DATA.table[usize::from(second)]
853	& unsafe {
854	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
855	})
856	\| (third >> `6`))
857	!= `2`
858	{
859	return `true`;
860	}
861	// XXX can this be folded into the above validity check
862	if second < `0xA4` {
863	return `true`;
864	}
865	read += `3`;
866	}
867	`0xED` => {
868	// Three-byte special upper bound
869	let second = unsafe { *(src.get_unchecked(read + `1`)) };
870	let third = unsafe { *(src.get_unchecked(read + `2`)) };
871	if ((UTF8_DATA.table[usize::from(second)]
872	& unsafe {
873	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
874	})
875	\| (third >> `6`))
876	!= `2`
877	{
878	return `true`;
879	}
880	read += `3`;
881	}
882	`0xF1`..=`0xF4` => {
883	// Four-byte normal
884	let second = unsafe { *(src.get_unchecked(read + `1`)) };
885	let third = unsafe { *(src.get_unchecked(read + `2`)) };
886	let fourth = unsafe { *(src.get_unchecked(read + `3`)) };
887	if (u16::from(
888	UTF8_DATA.table[usize::from(second)]
889	& unsafe {
890	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
891	},
892	) \| u16::from(third >> `6`)
893	\| (u16::from(fourth & `0xC0`) << `2`))
894	!= `0x202`
895	{
896	return `true`;
897	}
898	read += `4`;
899	}
900	`0xF0` => {
901	// Four-byte special lower bound, potentially bidi
902	let second = unsafe { *(src.get_unchecked(read + `1`)) };
903	let third = unsafe { *(src.get_unchecked(read + `2`)) };
904	let fourth = unsafe { *(src.get_unchecked(read + `3`)) };
905	if (u16::from(
906	UTF8_DATA.table[usize::from(second)]
907	& unsafe {
908	(UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`))
909	},
910	) \| u16::from(third >> `6`)
911	\| (u16::from(fourth & `0xC0`) << `2`))
912	!= `0x202`
913	{
914	return `true`;
915	}
916	if unlikely(second == `0x90` \|\| second == `0x9E`) {
917	let third = src[read + `2`];
918	if third >= `0xA0` {
919	return `true`;
920	}
921	}
922	read += `4`;
923	}
924	_ => {
925	// Invalid lead or bidi-only lead
926	return `true`;
927	}
928	}
929	if read + `4` > src.len() {
930	if read == src.len() {
931	return `false`;
932	}
933	byte = src[read];
934	break 'inner;
935	}
936	byte = src[read];
937	continue 'inner;
938	}
939	}
940	// We can't have a complete 4-byte sequence, but we could still have
941	// a complete shorter sequence.
942
943	// At this point, `byte` is not included in `read`.
944	match byte {
945	`0`..=`0x7F` => {
946	// ASCII: go back to SIMD.
947	read += `1`;
948	src = &src[read..];
949	continue 'outer;
950	}
951	`0xC2`..=`0xD5` => {
952	// Two-byte
953	let new_read = read + `2`;
954	if new_read > src.len() {
955	return `true`;
956	}
957	let second = unsafe { *(src.get_unchecked(read + `1`)) };
958	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
959	return `true`;
960	}
961	read = new_read;
962	// We need to deal with the case where we came here with 3 bytes
963	// left, so we need to take a look at the last one.
964	src = &src[read..];
965	continue 'outer;
966	}
967	`0xD6` => {
968	// Two-byte, potentially bidi
969	let new_read = read + `2`;
970	if new_read > src.len() {
971	return `true`;
972	}
973	let second = unsafe { *(src.get_unchecked(read + `1`)) };
974	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
975	return `true`;
976	}
977	// XXX consider folding the above and below checks
978	if second > `0x8F` {
979	return `true`;
980	}
981	read = new_read;
982	// We need to deal with the case where we came here with 3 bytes
983	// left, so we need to take a look at the last one.
984	src = &src[read..];
985	continue 'outer;
986	}
987	// two-byte starting with 0xD7 and above is bidi
988	`0xE1` \| `0xE3`..=`0xEC` \| `0xEE` => {
989	// Three-byte normal
990	let new_read = read + `3`;
991	if new_read > src.len() {
992	return `true`;
993	}
994	let second = unsafe { *(src.get_unchecked(read + `1`)) };
995	let third = unsafe { *(src.get_unchecked(read + `2`)) };
996	if ((UTF8_DATA.table[usize::from(second)]
997	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
998	\| (third >> `6`))
999	!= `2`
1000	{
1001	return `true`;
1002	}
1003	}
1004	`0xE2` => {
1005	// Three-byte normal, potentially bidi
1006	let new_read = read + `3`;
1007	if new_read > src.len() {
1008	return `true`;
1009	}
1010	let second = unsafe { *(src.get_unchecked(read + `1`)) };
1011	let third = unsafe { *(src.get_unchecked(read + `2`)) };
1012	if ((UTF8_DATA.table[usize::from(second)]
1013	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
1014	\| (third >> `6`))
1015	!= `2`
1016	{
1017	return `true`;
1018	}
1019	if second == `0x80` {
1020	if third == `0x8F` \|\| third == `0xAB` \|\| third == `0xAE` {
1021	return `true`;
1022	}
1023	} else if second == `0x81` {
1024	if third == `0xA7` {
1025	return `true`;
1026	}
1027	}
1028	}
1029	`0xEF` => {
1030	// Three-byte normal, potentially bidi
1031	let new_read = read + `3`;
1032	if new_read > src.len() {
1033	return `true`;
1034	}
1035	let second = unsafe { *(src.get_unchecked(read + `1`)) };
1036	let third = unsafe { *(src.get_unchecked(read + `2`)) };
1037	if ((UTF8_DATA.table[usize::from(second)]
1038	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
1039	\| (third >> `6`))
1040	!= `2`
1041	{
1042	return `true`;
1043	}
1044	if in_inclusive_range8(second, `0xAC`, `0xB7`) {
1045	if second == `0xAC` {
1046	if third > `0x9C` {
1047	return `true`;
1048	}
1049	} else {
1050	return `true`;
1051	}
1052	} else if in_inclusive_range8(second, `0xB9`, `0xBB`) {
1053	if second == `0xB9` {
1054	if third > `0xAF` {
1055	return `true`;
1056	}
1057	} else if second == `0xBB` {
1058	if third != `0xBF` {
1059	return `true`;
1060	}
1061	} else {
1062	return `true`;
1063	}
1064	}
1065	}
1066	`0xE0` => {
1067	// Three-byte special lower bound, potentially bidi
1068	let new_read = read + `3`;
1069	if new_read > src.len() {
1070	return `true`;
1071	}
1072	let second = unsafe { *(src.get_unchecked(read + `1`)) };
1073	let third = unsafe { *(src.get_unchecked(read + `2`)) };
1074	if ((UTF8_DATA.table[usize::from(second)]
1075	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
1076	\| (third >> `6`))
1077	!= `2`
1078	{
1079	return `true`;
1080	}
1081	// XXX can this be folded into the above validity check
1082	if second < `0xA4` {
1083	return `true`;
1084	}
1085	}
1086	`0xED` => {
1087	// Three-byte special upper bound
1088	let new_read = read + `3`;
1089	if new_read > src.len() {
1090	return `true`;
1091	}
1092	let second = unsafe { *(src.get_unchecked(read + `1`)) };
1093	let third = unsafe { *(src.get_unchecked(read + `2`)) };
1094	if ((UTF8_DATA.table[usize::from(second)]
1095	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
1096	\| (third >> `6`))
1097	!= `2`
1098	{
1099	return `true`;
1100	}
1101	}
1102	_ => {
1103	// Invalid lead, 4-byte lead or 2-byte bidi-only lead
1104	return `true`;
1105	}
1106	}
1107	return `false`;
1108	} else {
1109	return `false`;
1110	}
1111	}
1112	}
1113
1114	/// Checks whether a valid UTF-8 buffer contains code points that trigger
1115	/// right-to-left processing.
1116	///
1117	/// The check is done on a Unicode block basis without regard to assigned
1118	/// vs. unassigned code points in the block. Hebrew presentation forms in
1119	/// the Alphabetic Presentation Forms block are treated as if they formed
1120	/// a block on their own (i.e. it treated as right-to-left). Additionally,
1121	/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1122	/// for. Control characters that are technically bidi controls but do not
1123	/// cause right-to-left behavior without the presence of right-to-left
1124	/// characters or right-to-left controls are not checked for. As a special
1125	/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1126	#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1127	#[inline]
1128	pub fn is_str_bidi(buffer: &str) -> bool {
1129	// U+058F: D6 8F
1130	// U+0590: D6 90
1131	// U+08FF: E0 A3 BF
1132	// U+0900: E0 A4 80
1133	//
1134	// U+200F: E2 80 8F
1135	// U+202B: E2 80 AB
1136	// U+202E: E2 80 AE
1137	// U+2067: E2 81 A7
1138	//
1139	// U+FB1C: EF AC 9C
1140	// U+FB1D: EF AC 9D
1141	// U+FDFF: EF B7 BF
1142	// U+FE00: EF B8 80
1143	//
1144	// U+FE6F: EF B9 AF
1145	// U+FE70: EF B9 B0
1146	// U+FEFE: EF BB BE
1147	// U+FEFF: EF BB BF
1148	//
1149	// U+107FF: F0 90 9F BF
1150	// U+10800: F0 90 A0 80
1151	// U+10FFF: F0 90 BF BF
1152	// U+11000: F0 91 80 80
1153	//
1154	// U+1E7FF: F0 9E 9F BF
1155	// U+1E800: F0 9E A0 80
1156	// U+1EFFF: F0 9E BF BF
1157	// U+1F000: F0 9F 80 80
1158	let mut bytes = buffer.as_bytes();
1159	'outer: loop {
1160	// TODO: Instead of just validating ASCII using SIMD, use SIMD
1161	// to check for non-ASCII lead bytes, too, to quickly conclude
1162	// that the vector consist entirely of CJK and below-Hebrew
1163	// code points.
1164	// Unfortunately, scripts above Arabic but below CJK share
1165	// lead bytes with RTL.
1166	if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1167	'inner: loop {
1168	// At this point, `byte` is not included in `read`.
1169	if byte < `0xE0` {
1170	if byte >= `0x80` {
1171	// Two-byte
1172	// Adding `unlikely` here improved throughput on
1173	// Russian plain text by 33%!
1174	if unlikely(byte >= `0xD6`) {
1175	if byte == `0xD6` {
1176	let second = bytes[read + `1`];
1177	if second > `0x8F` {
1178	return `true`;
1179	}
1180	} else {
1181	return `true`;
1182	}
1183	}
1184	read += `2`;
1185	} else {
1186	// ASCII: write and go back to SIMD.
1187	read += `1`;
1188	// Intuitively, we should go back to the outer loop only
1189	// if byte is 0x30 or above, so as to avoid trashing on
1190	// ASCII space, comma and period in non-Latin context.
1191	// However, the extra branch seems to cost more than it's
1192	// worth.
1193	bytes = &bytes[read..];
1194	continue 'outer;
1195	}
1196	} else if byte < `0xF0` {
1197	// Three-byte
1198	if unlikely(!in_inclusive_range8(byte, `0xE3`, `0xEE`) && byte != `0xE1`) {
1199	let second = bytes[read + `1`];
1200	if byte == `0xE0` {
1201	if second < `0xA4` {
1202	return `true`;
1203	}
1204	} else if byte == `0xE2` {
1205	let third = bytes[read + `2`];
1206	if second == `0x80` {
1207	if third == `0x8F` \|\| third == `0xAB` \|\| third == `0xAE` {
1208	return `true`;
1209	}
1210	} else if second == `0x81` {
1211	if third == `0xA7` {
1212	return `true`;
1213	}
1214	}
1215	} else {
1216	debug_assert_eq!(byte, `0xEF`);
1217	if in_inclusive_range8(second, `0xAC`, `0xB7`) {
1218	if second == `0xAC` {
1219	let third = bytes[read + `2`];
1220	if third > `0x9C` {
1221	return `true`;
1222	}
1223	} else {
1224	return `true`;
1225	}
1226	} else if in_inclusive_range8(second, `0xB9`, `0xBB`) {
1227	if second == `0xB9` {
1228	let third = bytes[read + `2`];
1229	if third > `0xAF` {
1230	return `true`;
1231	}
1232	} else if second == `0xBB` {
1233	let third = bytes[read + `2`];
1234	if third != `0xBF` {
1235	return `true`;
1236	}
1237	} else {
1238	return `true`;
1239	}
1240	}
1241	}
1242	}
1243	read += `3`;
1244	} else {
1245	// Four-byte
1246	let second = bytes[read + `1`];
1247	if unlikely(byte == `0xF0` && (second == `0x90` \|\| second == `0x9E`)) {
1248	let third = bytes[read + `2`];
1249	if third >= `0xA0` {
1250	return `true`;
1251	}
1252	}
1253	read += `4`;
1254	}
1255	// The comparison is always < or == and never >, but including
1256	// > here to let the compiler assume that < is true if this
1257	// comparison is false.
1258	if read >= bytes.len() {
1259	return `false`;
1260	}
1261	byte = bytes[read];
1262	continue 'inner;
1263	}
1264	} else {
1265	return `false`;
1266	}
1267	}
1268	}
1269
1270	/// Checks whether a UTF-16 buffer contains code points that trigger
1271	/// right-to-left processing.
1272	///
1273	/// The check is done on a Unicode block basis without regard to assigned
1274	/// vs. unassigned code points in the block. Hebrew presentation forms in
1275	/// the Alphabetic Presentation Forms block are treated as if they formed
1276	/// a block on their own (i.e. it treated as right-to-left). Additionally,
1277	/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1278	/// for. Control characters that are technically bidi controls but do not
1279	/// cause right-to-left behavior without the presence of right-to-left
1280	/// characters or right-to-left controls are not checked for. As a special
1281	/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1282	///
1283	/// Returns `true` if the input contains an RTL character or an unpaired
1284	/// high surrogate that could be the high half of an RTL character.
1285	/// Returns `false` if the input contains neither RTL characters nor
1286	/// unpaired high surrogates that could be higher halves of RTL characters.
1287	pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
1288	is_utf16_bidi_impl(buffer)
1289	}
1290
1291	/// Checks whether a scalar value triggers right-to-left processing.
1292	///
1293	/// The check is done on a Unicode block basis without regard to assigned
1294	/// vs. unassigned code points in the block. Hebrew presentation forms in
1295	/// the Alphabetic Presentation Forms block are treated as if they formed
1296	/// a block on their own (i.e. it treated as right-to-left). Additionally,
1297	/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1298	/// for. Control characters that are technically bidi controls but do not
1299	/// cause right-to-left behavior without the presence of right-to-left
1300	/// characters or right-to-left controls are not checked for. As a special
1301	/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1302	#[inline(always)]
1303	pub fn is_char_bidi(c: char) -> bool {
1304	// Controls:
1305	// Every control with RIGHT-TO-LEFT in its name in
1306	// https://www.unicode.org/charts/PDF/U2000.pdf
1307	// U+200F RLM
1308	// U+202B RLE
1309	// U+202E RLO
1310	// U+2067 RLI
1311	//
1312	// BMP RTL:
1313	// https://www.unicode.org/roadmaps/bmp/
1314	// U+0590...U+08FF
1315	// U+FB1D...U+FDFF Hebrew presentation forms and
1316	// Arabic Presentation Forms A
1317	// U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1318	//
1319	// Supplementary RTL:
1320	// https://www.unicode.org/roadmaps/smp/
1321	// U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1322	// U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1323	let code_point = u32::from(c);
1324	if code_point < `0x0590` {
1325	// Below Hebrew
1326	return `false`;
1327	}
1328	if in_range32(code_point, `0x0900`, `0xFB1D`) {
1329	// Above Arabic Extended-A and below Hebrew presentation forms
1330	if in_inclusive_range32(code_point, `0x200F`, `0x2067`) {
1331	// In the range that contains the RTL controls
1332	return code_point == `0x200F`
1333	\|\| code_point == `0x202B`
1334	\|\| code_point == `0x202E`
1335	\|\| code_point == `0x2067`;
1336	}
1337	return `false`;
1338	}
1339	if code_point > `0x1EFFF` {
1340	// Above second astral RTL. (Emoji is here.)
1341	return `false`;
1342	}
1343	if in_range32(code_point, `0x11000`, `0x1E800`) {
1344	// Between astral RTL blocks
1345	return `false`;
1346	}
1347	if in_range32(code_point, `0xFEFF`, `0x10800`) {
1348	// Above Arabic Presentations Forms B (excl. BOM) and below first
1349	// astral RTL
1350	return `false`;
1351	}
1352	if in_range32(code_point, `0xFE00`, `0xFE70`) {
1353	// Between Arabic Presentations Forms
1354	return `false`;
1355	}
1356	`true`
1357	}
1358
1359	/// Checks whether a UTF-16 code unit triggers right-to-left processing.
1360	///
1361	/// The check is done on a Unicode block basis without regard to assigned
1362	/// vs. unassigned code points in the block. Hebrew presentation forms in
1363	/// the Alphabetic Presentation Forms block are treated as if they formed
1364	/// a block on their own (i.e. it treated as right-to-left). Additionally,
1365	/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1366	/// for. Control characters that are technically bidi controls but do not
1367	/// cause right-to-left behavior without the presence of right-to-left
1368	/// characters or right-to-left controls are not checked for. As a special
1369	/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1370	///
1371	/// Since supplementary-plane right-to-left blocks are identifiable from the
1372	/// high surrogate without examining the low surrogate, this function returns
1373	/// `true` for such high surrogates making the function suitable for handling
1374	/// supplementary-plane text without decoding surrogate pairs to scalar
1375	/// values. Obviously, such high surrogates are then reported as right-to-left
1376	/// even if actually unpaired.
1377	#[inline(always)]
1378	pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1379	if u < `0x0590` {
1380	// Below Hebrew
1381	return `false`;
1382	}
1383	if in_range16(u, `0x0900`, `0xD802`) {
1384	// Above Arabic Extended-A and below first RTL surrogate
1385	if in_inclusive_range16(u, `0x200F`, `0x2067`) {
1386	// In the range that contains the RTL controls
1387	return u == `0x200F` \|\| u == `0x202B` \|\| u == `0x202E` \|\| u == `0x2067`;
1388	}
1389	return `false`;
1390	}
1391	if in_range16(u, `0xD83C`, `0xFB1D`) {
1392	// Between astral RTL high surrogates and Hebrew presentation forms
1393	// (Emoji is here)
1394	return `false`;
1395	}
1396	if in_range16(u, `0xD804`, `0xD83A`) {
1397	// Between RTL high surragates
1398	return `false`;
1399	}
1400	if u > `0xFEFE` {
1401	// Above Arabic Presentation Forms (excl. BOM)
1402	return `false`;
1403	}
1404	if in_range16(u, `0xFE00`, `0xFE70`) {
1405	// Between Arabic Presentations Forms
1406	return `false`;
1407	}
1408	`true`
1409	}
1410
1411	/// Checks whether a potentially invalid UTF-8 buffer contains code points
1412	/// that trigger right-to-left processing or is all-Latin1.
1413	///
1414	/// Possibly more efficient than performing the checks separately.
1415	///
1416	/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1417	/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1418	/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1419	pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1420	if let Some(offset: usize) = is_utf8_latin1_impl(buffer) {
1421	if is_utf8_bidi(&buffer[offset..]) {
1422	Latin1Bidi::Bidi
1423	} else {
1424	Latin1Bidi::LeftToRight
1425	}
1426	} else {
1427	Latin1Bidi::Latin1
1428	}
1429	}
1430
1431	/// Checks whether a valid UTF-8 buffer contains code points
1432	/// that trigger right-to-left processing or is all-Latin1.
1433	///
1434	/// Possibly more efficient than performing the checks separately.
1435	///
1436	/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1437	/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1438	/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1439	pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1440	// The transition from the latin1 check to the bidi check isn't
1441	// optimal but not tweaking it to perfection today.
1442	if let Some(offset: usize) = is_str_latin1_impl(buffer) {
1443	if is_str_bidi(&buffer[offset..]) {
1444	Latin1Bidi::Bidi
1445	} else {
1446	Latin1Bidi::LeftToRight
1447	}
1448	} else {
1449	Latin1Bidi::Latin1
1450	}
1451	}
1452
1453	/// Checks whether a potentially invalid UTF-16 buffer contains code points
1454	/// that trigger right-to-left processing or is all-Latin1.
1455	///
1456	/// Possibly more efficient than performing the checks separately.
1457	///
1458	/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1459	/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1460	/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1461	pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1462	check_utf16_for_latin1_and_bidi_impl(buffer)
1463	}
1464
1465	/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1466	/// with the REPLACEMENT CHARACTER.
1467	///
1468	/// The length of the destination buffer must be at least the length of the
1469	/// source buffer _plus one_.
1470	///
1471	/// Returns the number of `u16`s written.
1472	///
1473	/// # Panics
1474	///
1475	/// Panics if the destination buffer is shorter than stated above.
1476	pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1477	// TODO: Can the requirement for dst to be at least one unit longer
1478	// be eliminated?
1479	assert!(dst.len() > src.len());
1480	let mut decoder = Utf8Decoder::new_inner();
1481	let mut total_read = `0usize`;
1482	let mut total_written = `0usize`;
1483	loop {
1484	let (result, read, written) =
1485	decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], `true`);
1486	total_read += read;
1487	total_written += written;
1488	match result {
1489	DecoderResult::InputEmpty => {
1490	return total_written;
1491	}
1492	DecoderResult::OutputFull => {
1493	unreachable!("The assert at the top of the function should have caught this.");
1494	}
1495	DecoderResult::Malformed(_, _) => {
1496	// There should always be space for the U+FFFD, because
1497	// otherwise we'd have gotten OutputFull already.
1498	dst[total_written] = `0xFFFD`;
1499	total_written += `1`;
1500	}
1501	}
1502	}
1503	}
1504
1505	/// Converts valid UTF-8 to valid UTF-16.
1506	///
1507	/// The length of the destination buffer must be at least the length of the
1508	/// source buffer.
1509	///
1510	/// Returns the number of `u16`s written.
1511	///
1512	/// # Panics
1513	///
1514	/// Panics if the destination buffer is shorter than stated above.
1515	pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
1516	assert!(
1517	dst.len() >= src.len(),
1518	"Destination must not be shorter than the source."
1519	);
1520	let bytes = src.as_bytes();
1521	let mut read = `0`;
1522	let mut written = `0`;
1523	'outer: loop {
1524	let mut byte = {
1525	let src_remaining = &bytes[read..];
1526	let dst_remaining = &mut dst[written..];
1527	let length = src_remaining.len();
1528	match unsafe {
1529	ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
1530	} {
1531	None => {
1532	written += length;
1533	return written;
1534	}
1535	Some((non_ascii, consumed)) => {
1536	read += consumed;
1537	written += consumed;
1538	non_ascii
1539	}
1540	}
1541	};
1542	'inner: loop {
1543	// At this point, `byte` is not included in `read`.
1544	if byte < `0xE0` {
1545	if byte >= `0x80` {
1546	// Two-byte
1547	let second = unsafe { *(bytes.get_unchecked(read + `1`)) };
1548	let point = ((u16::from(byte) & `0x1F`) << `6`) \| (u16::from(second) & `0x3F`);
1549	unsafe { *(dst.get_unchecked_mut(written)) = point };
1550	read += `2`;
1551	written += `1`;
1552	} else {
1553	// ASCII: write and go back to SIMD.
1554	unsafe { (dst.get_unchecked_mut(written)) = u16*::from(byte) };
1555	read += `1`;
1556	written += `1`;
1557	// Intuitively, we should go back to the outer loop only
1558	// if byte is 0x30 or above, so as to avoid trashing on
1559	// ASCII space, comma and period in non-Latin context.
1560	// However, the extra branch seems to cost more than it's
1561	// worth.
1562	continue 'outer;
1563	}
1564	} else if byte < `0xF0` {
1565	// Three-byte
1566	let second = unsafe { *(bytes.get_unchecked(read + `1`)) };
1567	let third = unsafe { *(bytes.get_unchecked(read + `2`)) };
1568	let point = ((u16::from(byte) & `0xF`) << `12`)
1569	\| ((u16::from(second) & `0x3F`) << `6`)
1570	\| (u16::from(third) & `0x3F`);
1571	unsafe { *(dst.get_unchecked_mut(written)) = point };
1572	read += `3`;
1573	written += `1`;
1574	} else {
1575	// Four-byte
1576	let second = unsafe { *(bytes.get_unchecked(read + `1`)) };
1577	let third = unsafe { *(bytes.get_unchecked(read + `2`)) };
1578	let fourth = unsafe { *(bytes.get_unchecked(read + `3`)) };
1579	let point = ((u32::from(byte) & `0x7`) << `18`)
1580	\| ((u32::from(second) & `0x3F`) << `12`)
1581	\| ((u32::from(third) & `0x3F`) << `6`)
1582	\| (u32::from(fourth) & `0x3F`);
1583	unsafe { (dst.get_unchecked_mut(written)) = (`0xD7C0` + (point >> `10`)) as u16* };
1584	unsafe {
1585	(dst.get_unchecked_mut(written + `1`)) = (`0xDC00` + (point & `0x3FF`)) as u16*
1586	};
1587	read += `4`;
1588	written += `2`;
1589	}
1590	// The comparison is always < or == and never >, but including
1591	// > here to let the compiler assume that < is true if this
1592	// comparison is false.
1593	if read >= src.len() {
1594	return written;
1595	}
1596	byte = bytes[read];
1597	continue 'inner;
1598	}
1599	}
1600	}
1601
1602	/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1603	///
1604	/// The length of the destination buffer must be at least the length of the
1605	/// source buffer.
1606	///
1607	/// Returns the number of `u16`s written or `None` if the input was invalid.
1608	///
1609	/// When the input was invalid, some output may have been written.
1610	///
1611	/// # Panics
1612	///
1613	/// Panics if the destination buffer is shorter than stated above.
1614	pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1615	assert!(
1616	dst.len() >= src.len(),
1617	"Destination must not be shorter than the source."
1618	);
1619	let (read: usize, written: usize) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1620	if read == src.len() {
1621	return Some(written);
1622	}
1623	None
1624	}
1625
1626	/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1627	/// with the REPLACEMENT CHARACTER with potentially insufficient output
1628	/// space.
1629	///
1630	/// Returns the number of code units read and the number of bytes written.
1631	///
1632	/// Guarantees that the bytes in the destination beyond the number of
1633	/// bytes claimed as written by the second item of the return tuple
1634	/// are left unmodified.
1635	///
1636	/// Not all code units are read if there isn't enough output space.
1637	///
1638	/// Note that this method isn't designed for general streamability but for
1639	/// not allocating memory for the worst case up front. Specifically,
1640	/// if the input starts with or ends with an unpaired surrogate, those are
1641	/// replaced with the REPLACEMENT CHARACTER.
1642	///
1643	/// Matches the semantics of `TextEncoder.encodeInto()` from the
1644	/// Encoding Standard.
1645	///
1646	/// # Safety
1647	///
1648	/// If you want to convert into a `&mut str`, use
1649	/// `convert_utf16_to_str_partial()` instead of using this function
1650	/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1651	#[inline(always)]
1652	pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1653	// The two functions called below are marked `inline(never)` to make
1654	// transitions from the hot part (first function) into the cold part
1655	// (second function) go through a return and another call to discouge
1656	// the CPU from speculating from the hot code into the cold code.
1657	// Letting the transitions be mere intra-function jumps, even to
1658	// basic blocks out-of-lined to the end of the function would wipe
1659	// away a quarter of Arabic encode performance on Haswell!
1660	let (read: usize, written: usize) = convert_utf16_to_utf8_partial_inner(src, dst);
1661	if likely(read == src.len()) {
1662	return (read, written);
1663	}
1664	let (tail_read: usize, tail_written: usize) =
1665	convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1666	(read + tail_read, written + tail_written)
1667	}
1668
1669	/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1670	/// with the REPLACEMENT CHARACTER.
1671	///
1672	/// The length of the destination buffer must be at least the length of the
1673	/// source buffer times three.
1674	///
1675	/// Returns the number of bytes written.
1676	///
1677	/// # Panics
1678	///
1679	/// Panics if the destination buffer is shorter than stated above.
1680	///
1681	/// # Safety
1682	///
1683	/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1684	/// instead of using this function together with the `unsafe` method
1685	/// `as_bytes_mut()` on `&mut str`.
1686	#[inline(always)]
1687	pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1688	assert!(dst.len() >= src.len() * `3`);
1689	let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst);
1690	debug_assert_eq!(read, src.len());
1691	written
1692	}
1693
1694	/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1695	/// with the REPLACEMENT CHARACTER such that the validity of the output is
1696	/// signaled using the Rust type system with potentially insufficient output
1697	/// space.
1698	///
1699	/// Returns the number of code units read and the number of bytes written.
1700	///
1701	/// Not all code units are read if there isn't enough output space.
1702	///
1703	/// Note that this method isn't designed for general streamability but for
1704	/// not allocating memory for the worst case up front. Specifically,
1705	/// if the input starts with or ends with an unpaired surrogate, those are
1706	/// replaced with the REPLACEMENT CHARACTER.
1707	pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1708	let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1709	let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst:bytes);
1710	let len: usize = bytes.len();
1711	let mut trail: usize = written;
1712	while trail < len && ((bytes[trail] & `0xC0`) == `0x80`) {
1713	bytes[trail] = `0`;
1714	trail += `1`;
1715	}
1716	(read, written)
1717	}
1718
1719	/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1720	/// with the REPLACEMENT CHARACTER such that the validity of the output is
1721	/// signaled using the Rust type system.
1722	///
1723	/// The length of the destination buffer must be at least the length of the
1724	/// source buffer times three.
1725	///
1726	/// Returns the number of bytes written.
1727	///
1728	/// # Panics
1729	///
1730	/// Panics if the destination buffer is shorter than stated above.
1731	#[inline(always)]
1732	pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1733	assert!(dst.len() >= src.len() * `3`);
1734	let (read: usize, written: usize) = convert_utf16_to_str_partial(src, dst);
1735	debug_assert_eq!(read, src.len());
1736	written
1737	}
1738
1739	/// Converts bytes whose unsigned value is interpreted as Unicode code point
1740	/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1741	///
1742	/// The length of the destination buffer must be at least the length of the
1743	/// source buffer.
1744	///
1745	/// The number of `u16`s written equals the length of the source buffer.
1746	///
1747	/// # Panics
1748	///
1749	/// Panics if the destination buffer is shorter than stated above.
1750	pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1751	assert!(
1752	dst.len() >= src.len(),
1753	"Destination must not be shorter than the source."
1754	);
1755	// TODO: On aarch64, the safe version autovectorizes to the same unpacking
1756	// instructions and this code, but, yet, the autovectorized version is
1757	// faster.
1758	unsafe {
1759	unpack_latin1(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len());
1760	}
1761	}
1762
1763	/// Converts bytes whose unsigned value is interpreted as Unicode code point
1764	/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1765	/// output space.
1766	///
1767	/// Returns the number of bytes read and the number of bytes written.
1768	///
1769	/// If the output isn't large enough, not all input is consumed.
1770	///
1771	/// # Safety
1772	///
1773	/// If you want to convert into a `&mut str`, use
1774	/// `convert_utf16_to_str_partial()` instead of using this function
1775	/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1776	pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1777	let src_len = src.len();
1778	let src_ptr = src.as_ptr();
1779	let dst_ptr = dst.as_mut_ptr();
1780	let dst_len = dst.len();
1781	let mut total_read = `0usize`;
1782	let mut total_written = `0usize`;
1783	loop {
1784	// src can't advance more than dst
1785	let src_left = src_len - total_read;
1786	let dst_left = dst_len - total_written;
1787	let min_left = ::core::cmp::min(src_left, dst_left);
1788	if let Some((non_ascii, consumed)) = unsafe {
1789	ascii_to_ascii(
1790	src_ptr.add(total_read),
1791	dst_ptr.add(total_written),
1792	min_left,
1793	)
1794	} {
1795	total_read += consumed;
1796	total_written += consumed;
1797	if total_written.checked_add(`2`).unwrap() > dst_len {
1798	return (total_read, total_written);
1799	}
1800
1801	total_read += `1`; // consume `non_ascii`
1802
1803	dst[total_written] = (non_ascii >> `6`) \| `0xC0`;
1804	total_written += `1`;
1805	dst[total_written] = (non_ascii & `0x3F`) \| `0x80`;
1806	total_written += `1`;
1807	continue;
1808	}
1809	return (total_read + min_left, total_written + min_left);
1810	}
1811	}
1812
1813	/// Converts bytes whose unsigned value is interpreted as Unicode code point
1814	/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1815	///
1816	/// The length of the destination buffer must be at least the length of the
1817	/// source buffer times two.
1818	///
1819	/// Returns the number of bytes written.
1820	///
1821	/// # Panics
1822	///
1823	/// Panics if the destination buffer is shorter than stated above.
1824	///
1825	/// # Safety
1826	///
1827	/// Note that this function may write garbage beyond the number of bytes
1828	/// indicated by the return value, so using a `&mut str` interpreted as
1829	/// `&mut [u8]` as the destination is not safe. If you want to convert into
1830	/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1831	#[inline]
1832	pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1833	assert!(
1834	dst.len() >= src.len() * `2`,
1835	"Destination must not be shorter than the source times two."
1836	);
1837	let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst);
1838	debug_assert_eq!(read, src.len());
1839	written
1840	}
1841
1842	/// Converts bytes whose unsigned value is interpreted as Unicode code point
1843	/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1844	/// output is signaled using the Rust type system with potentially insufficient
1845	/// output space.
1846	///
1847	/// Returns the number of bytes read and the number of bytes written.
1848	///
1849	/// If the output isn't large enough, not all input is consumed.
1850	#[inline]
1851	pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1852	let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1853	let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst:bytes);
1854	let len: usize = bytes.len();
1855	let mut trail: usize = written;
1856	let max: usize = ::core::cmp::min(v1:len, v2:trail + MAX_STRIDE_SIZE);
1857	while trail < max {
1858	bytes[trail] = `0`;
1859	trail += `1`;
1860	}
1861	while trail < len && ((bytes[trail] & `0xC0`) == `0x80`) {
1862	bytes[trail] = `0`;
1863	trail += `1`;
1864	}
1865	(read, written)
1866	}
1867
1868	/// Converts bytes whose unsigned value is interpreted as Unicode code point
1869	/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1870	/// output is signaled using the Rust type system.
1871	///
1872	/// The length of the destination buffer must be at least the length of the
1873	/// source buffer times two.
1874	///
1875	/// Returns the number of bytes written.
1876	///
1877	/// # Panics
1878	///
1879	/// Panics if the destination buffer is shorter than stated above.
1880	#[inline]
1881	pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1882	assert!(
1883	dst.len() >= src.len() * `2`,
1884	"Destination must not be shorter than the source times two."
1885	);
1886	let (read: usize, written: usize) = convert_latin1_to_str_partial(src, dst);
1887	debug_assert_eq!(read, src.len());
1888	written
1889	}
1890
1891	/// If the input is valid UTF-8 representing only Unicode code points from
1892	/// U+0000 to U+00FF, inclusive, converts the input into output that
1893	/// represents the value of each code point as the unsigned byte value of
1894	/// each output byte.
1895	///
1896	/// If the input does not fulfill the condition stated above, this function
1897	/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1898	/// does something that is memory-safe without any promises about any
1899	/// properties of the output. In particular, callers shouldn't assume the
1900	/// output to be the same across crate versions or CPU architectures and
1901	/// should not assume that non-ASCII input can't map to ASCII output.
1902	///
1903	/// The length of the destination buffer must be at least the length of the
1904	/// source buffer.
1905	///
1906	/// Returns the number of bytes written.
1907	///
1908	/// # Panics
1909	///
1910	/// Panics if the destination buffer is shorter than stated above.
1911	///
1912	/// If debug assertions are enabled (and not fuzzing) and the input is
1913	/// not in the range U+0000 to U+00FF, inclusive.
1914	pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1915	assert!(
1916	dst.len() >= src.len(),
1917	"Destination must not be shorter than the source."
1918	);
1919	non_fuzz_debug_assert!(is_utf8_latin1(src));
1920	let src_len = src.len();
1921	let src_ptr = src.as_ptr();
1922	let dst_ptr = dst.as_mut_ptr();
1923	let mut total_read = `0usize`;
1924	let mut total_written = `0usize`;
1925	loop {
1926	// dst can't advance more than src
1927	let src_left = src_len - total_read;
1928	if let Some((non_ascii, consumed)) = unsafe {
1929	ascii_to_ascii(
1930	src_ptr.add(total_read),
1931	dst_ptr.add(total_written),
1932	src_left,
1933	)
1934	} {
1935	total_read += consumed + `1`;
1936	total_written += consumed;
1937
1938	if total_read == src_len {
1939	return total_written;
1940	}
1941
1942	let trail = src[total_read];
1943	total_read += `1`;
1944
1945	dst[total_written] = ((non_ascii & `0x1F`) << `6`) \| (trail & `0x3F`);
1946	total_written += `1`;
1947	continue;
1948	}
1949	return total_written + src_left;
1950	}
1951	}
1952
1953	/// If the input is valid UTF-16 representing only Unicode code points from
1954	/// U+0000 to U+00FF, inclusive, converts the input into output that
1955	/// represents the value of each code point as the unsigned byte value of
1956	/// each output byte.
1957	///
1958	/// If the input does not fulfill the condition stated above, does something
1959	/// that is memory-safe without any promises about any properties of the
1960	/// output and will probably assert in debug builds in future versions.
1961	/// In particular, callers shouldn't assume the output to be the same across
1962	/// crate versions or CPU architectures and should not assume that non-ASCII
1963	/// input can't map to ASCII output.
1964	///
1965	/// The length of the destination buffer must be at least the length of the
1966	/// source buffer.
1967	///
1968	/// The number of bytes written equals the length of the source buffer.
1969	///
1970	/// # Panics
1971	///
1972	/// Panics if the destination buffer is shorter than stated above.
1973	///
1974	/// (Probably in future versions if debug assertions are enabled (and not
1975	/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
1976	pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1977	assert!(
1978	dst.len() >= src.len(),
1979	"Destination must not be shorter than the source."
1980	);
1981	// non_fuzz_debug_assert!(is_utf16_latin1(src));
1982	unsafe {
1983	pack_latin1(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len());
1984	}
1985	}
1986
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // The whole input is ASCII, which is valid UTF-8 as-is.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The ASCII head is copied verbatim; every byte of the tail may expand
    // to two UTF-8 bytes.
    let capacity = head.len() + tail.len() * 2;
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    vec.extend_from_slice(head);
    // Zero-fill the conversion area instead of `set_len` on uninitialized
    // memory: `Vec::set_len` requires the exposed elements to be initialized.
    // The capacity was reserved above, so this does not reallocate.
    vec.resize(capacity, 0u8);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // The head is ASCII and the tail was produced by the Latin1-to-UTF-8
    // converter; the unused zero padding has been truncated away.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2014
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The Latin1 output is never longer than the UTF-8 input.
    let capacity = bytes.len();
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    vec.extend_from_slice(head);
    // Zero-fill the conversion area instead of `set_len` on uninitialized
    // memory: `Vec::set_len` requires the exposed elements to be initialized.
    // The capacity was reserved above, so this does not reallocate.
    vec.resize(capacity, 0u8);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2051
2052	/// Returns the index of the first unpaired surrogate or, if the input is
2053	/// valid UTF-16 in its entirety, the length of the input.
2054	pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2055	utf16_valid_up_to_impl(buffer)
2056	}
2057
2058	/// Returns the index of first byte that starts an invalid byte
2059	/// sequence or a non-Latin1 byte sequence, or the length of the
2060	/// string if there are neither.
2061	pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2062	is_utf8_latin1_impl(buffer).unwrap_or(default:buffer.len())
2063	}
2064
2065	/// Returns the index of first byte that starts a non-Latin1 byte
2066	/// sequence, or the length of the string if there are none.
2067	pub fn str_latin1_up_to(buffer: &str) -> usize {
2068	is_str_latin1_impl(buffer).unwrap_or_else(\|\| buffer.len())
2069	}
2070
2071	/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2072	#[inline]
2073	pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2074	let mut offset: usize = `0`;
2075	loop {
2076	offset += utf16_valid_up_to(&buffer[offset..]);
2077	if offset == buffer.len() {
2078	return;
2079	}
2080	buffer[offset] = `0xFFFD`;
2081	offset += `1`;
2082	}
2083	}
2084
2085	/// Copies ASCII from source to destination up to the first non-ASCII byte
2086	/// (or the end of the input if it is ASCII in its entirety).
2087	///
2088	/// The length of the destination buffer must be at least the length of the
2089	/// source buffer.
2090	///
2091	/// Returns the number of bytes written.
2092	///
2093	/// # Panics
2094	///
2095	/// Panics if the destination buffer is shorter than stated above.
2096	pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2097	assert!(
2098	dst.len() >= src.len(),
2099	"Destination must not be shorter than the source."
2100	);
2101	if let Some((_, consumed: usize)) =
2102	unsafe { ascii_to_ascii(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) }
2103	{
2104	consumed
2105	} else {
2106	src.len()
2107	}
2108	}
2109
2110	/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2111	/// the first non-ASCII byte (or the end of the input if it is ASCII in its
2112	/// entirety).
2113	///
2114	/// The length of the destination buffer must be at least the length of the
2115	/// source buffer.
2116	///
2117	/// Returns the number of `u16`s written.
2118	///
2119	/// # Panics
2120	///
2121	/// Panics if the destination buffer is shorter than stated above.
2122	pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2123	assert!(
2124	dst.len() >= src.len(),
2125	"Destination must not be shorter than the source."
2126	);
2127	if let Some((_, consumed: usize)) =
2128	unsafe { ascii_to_basic_latin(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) }
2129	{
2130	consumed
2131	} else {
2132	src.len()
2133	}
2134	}
2135
2136	/// Copies Basic Latin from source to destination narrowing it to ASCII up to
2137	/// the first non-Basic Latin code unit (or the end of the input if it is
2138	/// Basic Latin in its entirety).
2139	///
2140	/// The length of the destination buffer must be at least the length of the
2141	/// source buffer.
2142	///
2143	/// Returns the number of bytes written.
2144	///
2145	/// # Panics
2146	///
2147	/// Panics if the destination buffer is shorter than stated above.
2148	pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2149	assert!(
2150	dst.len() >= src.len(),
2151	"Destination must not be shorter than the source."
2152	);
2153	if let Some((_, consumed: usize)) =
2154	unsafe { basic_latin_to_ascii(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) }
2155	{
2156	consumed
2157	} else {
2158	src.len()
2159	}
2160	}
2161
2162	// Any copyright to the test code below this comment is dedicated to the
2163	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2164
2165	#[cfg(all(test, feature = "alloc"))]
2166	mod tests {
2167	use super::*;
2168
#[test]
fn test_is_ascii_success() {
    // Every ASCII value once; each suffix starts at a different offset.
    let src: Vec<u8> = (0..128u8).collect();
    for start in 0..src.len() {
        assert!(is_ascii(&src[start..]));
    }
}
2180
#[test]
fn test_is_ascii_fail() {
    let mut src: Vec<u8> = (0..128u8).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Plant a non-ASCII byte at each position in turn; earlier plants
        // stay in place.
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_ascii(tail));
        }
    }
}
2196
#[test]
fn test_is_basic_latin_success() {
    // Every Basic Latin code unit once; each suffix starts at a different
    // offset.
    let src: Vec<u16> = (0..128u16).collect();
    for start in 0..src.len() {
        assert!(is_basic_latin(&src[start..]));
    }
}
2208
#[test]
fn test_is_basic_latin_fail() {
    let mut src: Vec<u16> = (0..128u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Plant a non-Basic Latin code unit at each position in turn;
        // earlier plants stay in place.
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_basic_latin(tail));
        }
    }
}
2224
#[test]
fn test_is_utf16_latin1_success() {
    // Every Latin1 code point once.
    let src: Vec<u16> = (0..256u16).collect();
    for start in 0..src.len() {
        let suffix = &src[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2240
#[test]
fn test_is_utf16_latin1_fail() {
    // Miri is too slow for the full-size fixture.
    let len: usize = if cfg!(miri) { 64 } else { 256 };
    let mut src: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Push each position above U+00FF in turn; earlier changes stay.
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(tail));
            assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
        }
    }
}
2258
#[test]
fn test_is_str_latin1_success() {
    // Miri is too slow for the full-size fixture.
    let len: usize = if cfg!(miri) { 64 } else { 256 };
    let src: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..src.len() {
        let text = String::from_utf16(&src[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(
            check_str_for_latin1_and_bidi(text.as_str()),
            Latin1Bidi::Latin1
        );
    }
}
2273
#[test]
fn test_is_str_latin1_fail() {
    // Miri is too slow for the full-size fixture.
    let len: usize = if cfg!(miri) { 32 } else { 256 };
    let mut src: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Push each position above U+00FF in turn; earlier changes stay.
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(tail).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(
                check_str_for_latin1_and_bidi(text.as_str()),
                Latin1Bidi::Latin1
            );
        }
    }
}
2292
#[test]
fn test_is_utf8_latin1_success() {
    // Miri is too slow for the full-size fixture.
    let len: usize = if cfg!(miri) { 64 } else { 256 };
    let src: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..src.len() {
        let text = String::from_utf16(&src[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2310
#[test]
fn test_is_utf8_latin1_fail() {
    // Miri is too slow for the full-size fixture.
    let len: usize = if cfg!(miri) { 32 } else { 256 };
    let mut src: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Push each position above U+00FF in turn; earlier changes stay.
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(tail).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2332
#[test]
fn test_is_utf8_latin1_invalid() {
    // Malformed UTF-8: a two-byte lead with no trail, a byte that never
    // occurs in UTF-8, and a lead followed by a non-continuation byte, each
    // on its own and after an ASCII prefix.
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for bytes in malformed.iter() {
        assert!(!is_utf8_latin1(bytes), "accepted {:?}", bytes);
    }
}
2342
#[test]
fn test_convert_utf8_to_utf16() {
    // ASCII mixed with an astral, a BMP and a Latin1 non-ASCII character.
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = src.encode_utf16().collect();
    // One code unit more than the number of input bytes.
    let mut dst = vec![0u16; src.len() + 1];
    let written = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
    dst.truncate(written);
    assert_eq!(dst, expected);
}
2353
#[test]
fn test_convert_str_to_utf16() {
    // ASCII mixed with an astral, a BMP and a Latin1 non-ASCII character.
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = src.encode_utf16().collect();
    // As many code units as there are input bytes.
    let mut dst = vec![0u16; src.len()];
    let written = convert_str_to_utf16(src, &mut dst[..]);
    dst.truncate(written);
    assert_eq!(dst, expected);
}
2364
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    let mut dst = vec![0u8; src.len() * 3 + 1];
    // First pass into a deliberately short 24-byte window, then finish the
    // remainder with the non-partial converter.
    let (read, first_written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
    let rest_written = convert_utf16_to_utf8(&src[read..], &mut dst[first_written..]);
    dst.truncate(first_written + rest_written);
    assert_eq!(dst, reference.as_bytes());
}
2376
#[test]
fn test_convert_utf16_to_utf8() {
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    // Three bytes per code unit plus one.
    let mut dst = vec![0u8; src.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&src[..], &mut dst[..]);
    dst.truncate(written);
    assert_eq!(dst, reference.as_bytes());
}
2387
#[test]
fn test_convert_latin1_to_utf16() {
    // Every byte value, and the same values widened to code units.
    let src: Vec<u8> = (0..=255u8).collect();
    let expected: Vec<u16> = (0..256u16).collect();
    let mut dst = vec![0u16; src.len()];
    convert_latin1_to_utf16(&src[..], &mut dst[..]);
    assert_eq!(dst, expected);
}
2403
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // Two zeroed bytes of output space. The previous `[0u8, 2]` (comma
    // instead of semicolon) had the right length only by coincidence.
    let mut dst = [0u8; 2];
    // 'a' takes one byte; U+00FF needs two and no longer fits, so the
    // conversion stops after the first input byte.
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2411
#[test]
fn test_convert_latin1_to_utf8() {
    // Every byte value, and the same values as code units, from which the
    // expected UTF-8 is derived with the standard library.
    let src: Vec<u8> = (0..=255u8).collect();
    let code_units: Vec<u16> = (0..256u16).collect();
    let expected = String::from_utf16(&code_units[..]).unwrap();
    let mut dst = vec![0u8; src.len() * 2];
    let written = convert_latin1_to_utf8(&src[..], &mut dst[..]);
    dst.truncate(written);
    assert_eq!(&dst[..], expected.as_bytes());
}
2429
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // Every Latin1 code point, as expected output bytes and as code units
    // from which the UTF-8 input is derived with the standard library.
    let expected: Vec<u8> = (0..=255u8).collect();
    let code_units: Vec<u16> = (0..256u16).collect();
    let src = String::from_utf16(&code_units[..]).unwrap();
    let mut dst = vec![0u8; src.len()];
    let written = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
    dst.truncate(written);
    assert_eq!(dst, expected);
}
2447
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point outside the Latin1 range.
    let out_of_range = "\u{100}";
    let mut dst = [0u8; 16];
    let _written = convert_utf8_to_latin1_lossy(out_of_range.as_bytes(), &mut dst[..]);
}
2455
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // Every Latin1 code point as a code unit, and as the expected byte.
    let src: Vec<u16> = (0..256u16).collect();
    let expected: Vec<u8> = (0..=255u8).collect();
    let mut dst = vec![0u8; src.len()];
    convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
    assert_eq!(dst, expected);
}
2471
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // With `should_panic` commented out, this only checks that
    // out-of-range input (U+0100) is handled without panicking.
    let out_of_range = [0x0100u16];
    let mut dst = [0u8; 16];
    convert_utf16_to_latin1_lossy(&out_of_range, &mut dst[..]);
}
2478
#[test]
fn test_utf16_valid_up_to() {
    // Builds a 16-unit fixture: a run of U+0000 followed by the interesting
    // code units.
    let padded = |zeros: usize, tail: &[u16]| -> Vec<u16> {
        let mut units = vec![0u16; zeros];
        units.extend_from_slice(tail);
        units
    };

    // A proper surrogate pair: valid all the way to the end.
    let valid = padded(12, &[0x2603u16, 0xD83Du16, 0xDCA9u16, 0x00B6u16]);
    assert_eq!(utf16_valid_up_to(&valid[..]), 16);

    // High surrogate followed by a non-surrogate.
    let lone_high = padded(13, &[0x2603u16, 0xD83Du16, 0x00B6u16]);
    assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);

    // Low surrogate without a preceding high surrogate.
    let lone_low = padded(13, &[0x2603u16, 0xDCA9u16, 0x00B6u16]);
    assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);

    // High surrogate as the very last code unit.
    let lone_high_at_end = padded(13, &[0x2603u16, 0x00B6u16, 0xD83Du16]);
    assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
}
2502
#[test]
fn test_ensure_utf16_validity() {
    // 31 code units of U+0000 with surrogates planted at fixed positions.
    let mut src = vec![0u16; 31];
    src[1] = 0xD83D; // lone high surrogate
    src[5] = 0xD83D; // proper pair, high half
    src[6] = 0xDCA9; // proper pair, low half
    src[13] = 0xDCA9; // lone low surrogate

    // Only the two lone surrogates are expected to be replaced.
    let mut reference = src.clone();
    reference[1] = 0xFFFD;
    reference[13] = 0xFFFD;

    ensure_utf16_validity(&mut src[..]);
    assert_eq!(src, reference);
}
2518
#[test]
fn test_is_char_bidi() {
    // Characters that must not be reported as bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c), "{:?} reported as bidi", c);
    }

    // Characters that must be reported as bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c), "{:?} not reported as bidi", c);
    }
}
2544
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units that must not be reported as bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(
            !is_utf16_code_unit_bidi(unit),
            "{:#06X} reported as bidi",
            unit
        );
    }

    // Code units that must be reported as bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(
            is_utf16_code_unit_bidi(unit),
            "{:#06X} not reported as bidi",
            unit
        );
    }
}
2571
#[test]
fn test_is_str_bidi() {
    // One character of interest between two 16-character ASCII runs.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for text in not_bidi.iter() {
        assert!(!is_str_bidi(text), "{:?} reported as bidi", text);
    }

    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for text in bidi.iter() {
        assert!(is_str_bidi(text), "{:?} not reported as bidi", text);
    }
}
2597
#[test]
fn test_is_utf8_bidi() {
    // One character of interest between two 16-character ASCII runs, passed
    // to the function under test as raw UTF-8 bytes.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for text in not_bidi.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()), "{:?} reported as bidi", text);
    }

    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for text in bidi.iter() {
        assert!(
            is_utf8_bidi(text.as_bytes()),
            "{:?} not reported as bidi",
            text
        );
    }
}
2667
#[test]
fn test_is_utf16_bidi() {
    // Eight Basic Latin code units used as padding on both sides.
    const PAD: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];
    // Places one code unit of interest between two copies of the padding.
    let wrap = |unit: u16| -> [u16; 17] {
        let mut buf = [0u16; 17];
        buf[..8].copy_from_slice(&PAD);
        buf[8] = unit;
        buf[9..].copy_from_slice(&PAD);
        buf
    };

    // Code units that must not make the buffer bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_bidi(&wrap(unit)), "{:#06X} reported as bidi", unit);
    }

    // Code units that must make the buffer bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(
            is_utf16_bidi(&wrap(unit)),
            "{:#06X} not reported as bidi",
            unit
        );
    }

    // A bidi code unit immediately followed by a non-bidi, non-ASCII one.
    assert!(is_utf16_bidi(&[
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ]));
}
2768
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // One character of interest between two 16-character ASCII runs.
    // These must be classified as anything other than `Bidi`.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for text in not_bidi.iter() {
        assert_ne!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Bidi);
    }

    // These must be classified as `Bidi`.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for text in bidi.iter() {
        assert_eq!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Bidi);
    }
}
2860
// Exercises `check_utf8_for_latin1_and_bidi` with the UTF-8 bytes of one
// probe character placed between two 16-byte ASCII runs.
//
// The first table holds probes that must not be classified as
// `Latin1Bidi::Bidi`; the second holds probes that must be. U+FB1D (the
// lower bound of the U+FB1D..=U+FDFF range used by `reference_is_char_bidi`)
// is covered here so that this test agrees with the UTF-16 variant.
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }

    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB1D}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2952
// Exercises `check_utf16_for_latin1_and_bidi` with one probe code unit
// placed between two runs of eight ASCII code units, followed by a final
// case where the probe is the pair U+0590, U+3041.
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // ASCII padding placed on both sides of the probe.
    const PAD: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];
    // Probes that must not be classified as `Latin1Bidi::Bidi`.
    const NOT_BIDI: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Probes that must be classified as `Latin1Bidi::Bidi`.
    const BIDI: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];

    // PAD + one probe slot (index 8) + PAD.
    let mut probe = [0u16; 17];
    probe[..8].copy_from_slice(&PAD);
    probe[9..].copy_from_slice(&PAD);

    for &unit in NOT_BIDI.iter() {
        probe[8] = unit;
        assert_ne!(
            check_utf16_for_latin1_and_bidi(&probe),
            Latin1Bidi::Bidi,
            "code unit: {:#06X}",
            unit
        );
    }
    for &unit in BIDI.iter() {
        probe[8] = unit;
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&probe),
            Latin1Bidi::Bidi,
            "code unit: {:#06X}",
            unit
        );
    }

    // A right-to-left code unit immediately followed by a non-Latin1,
    // non-bidi code unit must still be reported as bidi.
    let mut pair = [0u16; 18];
    pair[..8].copy_from_slice(&PAD);
    pair[8] = 0x0590;
    pair[9] = 0x3041;
    pair[10..].copy_from_slice(&PAD);
    assert_eq!(check_utf16_for_latin1_and_bidi(&pair), Latin1Bidi::Bidi);
}
3125
/// Deliberately simple oracle used by the exhaustive tests: returns `true`
/// when `c` is one of the individual right-to-left controls U+200F, U+202B,
/// U+202E or U+2067, or lies in one of the inclusive ranges
/// U+0590..=U+08FF, U+FB1D..=U+FDFF, U+FE70..=U+FEFE, U+10800..=U+10FFF or
/// U+1E800..=U+1EFFF; returns `false` for every other character.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    let scalar = u32::from(c);
    let within = |first: u32, last: u32| first <= scalar && scalar <= last;

    let is_rtl_control =
        scalar == 0x200F || scalar == 0x202B || scalar == 0x202E || scalar == 0x2067;

    is_rtl_control
        || within(0x0590, 0x08FF)
        || within(0xFB1D, 0xFDFF)
        || within(0xFE70, 0xFEFE)
        || within(0x1_0800, 0x1_0FFF)
        || within(0x1_E800, 0x1_EFFF)
}
3141
/// Deliberately simple oracle used by the exhaustive tests: returns `true`
/// when the UTF-16 code unit `u` is one of 0xD802, 0xD803, 0xD83A, 0xD83B,
/// 0x200F, 0x202B, 0x202E or 0x2067, or lies in one of the inclusive ranges
/// 0x0590..=0x08FF, 0xFB1D..=0xFDFF or 0xFE70..=0xFEFE; returns `false`
/// for every other code unit.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    let within = |first: u16, last: u16| first <= u && u <= last;

    let is_listed_surrogate = u == 0xD802 || u == 0xD803 || u == 0xD83A || u == 0xD83B;
    let is_rtl_control = u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;

    is_listed_surrogate
        || is_rtl_control
        || within(0x0590, 0x08FF)
        || within(0xFB1D, 0xFDFF)
        || within(0xFE70, 0xFEFE)
}
3159
// Compares `is_char_bidi` against `reference_is_char_bidi` for every
// `char`, i.e. every value in 0..0xD800 and 0xE000..0x110000 (the surrogate
// gap in between has no `char` representation).
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    for scalar in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(scalar).unwrap();
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3172
// Compares `is_utf16_code_unit_bidi` against
// `reference_is_utf16_code_unit_bidi` for every possible 16-bit code unit.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    for unit in 0..=0xFFFFu16 {
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_code_unit_bidi(unit), expected);
    }
}
3184
// Encodes every `char` as a one-character `&str` and compares `is_str_bidi`
// on it against `reference_is_char_bidi`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Four bytes is the longest UTF-8 encoding of a single `char`.
    let mut utf8 = [0u8; 4];
    for scalar in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(scalar).unwrap();
        let expected = reference_is_char_bidi(c);
        let encoded = c.encode_utf8(&mut utf8[..]);
        assert_eq!(is_str_bidi(encoded), expected);
    }
}
3204
// For every `char`, checks `is_utf8_bidi` against `reference_is_char_bidi`
// twice: once on exactly the encoded bytes, and once on the full 8-byte
// buffer in which the encoded bytes are followed by zero bytes.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut buf = [0u8; 8];
    for scalar in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(scalar).unwrap();
        let expect = reference_is_char_bidi(c);

        let encoded_len = c.encode_utf8(&mut buf[..]).len();
        assert_eq!(is_utf8_bidi(&buf[..encoded_len]), expect);

        // Clear whatever a longer encoding from an earlier iteration left
        // behind the current character before checking the whole buffer.
        for stale in buf[encoded_len..].iter_mut() {
            *stale = 0;
        }
        assert_eq!(is_utf8_bidi(&buf[..]), expect);
    }
}
3246
// Places every possible 16-bit code unit at index 15 of an otherwise
// all-zero 32-unit buffer and compares `is_utf16_bidi` on that buffer
// against `reference_is_utf16_code_unit_bidi` for the code unit.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    let mut units = [0u16; 32];
    for unit in 0..=0xFFFFu16 {
        units[15] = unit;
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_bidi(&units[..]), expected);
    }
}
3260
// Short inputs for `is_utf8_bidi`: the same three prefixes are expected to
// be non-bidi when followed by ASCII (or nothing) and to be reported as bidi
// when the final byte is a lone 0xC2.
#[test]
fn test_is_utf8_bidi_edge_cases() {
    let not_bidi: [&[u8]; 3] = [&b"\xD5\xBF\x61"[..], &b"\xD6\x80\x61"[..], &b"abc"[..]];
    for &bytes in not_bidi.iter() {
        assert!(!is_utf8_bidi(bytes));
    }

    let bidi: [&[u8]; 3] = [&b"\xD5\xBF\xC2"[..], &b"\xD6\x80\xC2"[..], &b"ab\xC2"[..]];
    for &bytes in bidi.iter() {
        assert!(is_utf8_bidi(bytes));
    }
}
3270
// `decode_latin1` must hand ASCII-only input back as a borrow and must map
// the byte 0xE4 to U+00E4.
#[test]
fn test_decode_latin1() {
    if let Cow::Borrowed(s) = decode_latin1(b"ab") {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }
    assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
}
3283
// `encode_latin1_lossy` must hand ASCII-only input back as a borrow and must
// map U+00E4 to the single byte 0xE4.
#[test]
fn test_encode_latin1_lossy() {
    if let Cow::Borrowed(s) = encode_latin1_lossy("ab") {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }
    assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
}
3296
// Runs `convert_utf8_to_utf16_without_replacement` over a sequence of inputs
// that share one output buffer. The cases are order-dependent: each one
// checks the first three code units of the buffer, including units left
// over from earlier conversions.
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    let mut buf = [0u16; 5];

    // (input, length of the output window, expected return value,
    //  expected contents of buf[..3] afterwards)
    let cases: [(&[u8], usize, Option<usize>, [u16; 3]); 7] = [
        (&b"ab"[..], 2, Some(2), [u16::from(b'a'), u16::from(b'b'), 0]),
        (&b"\xC3\xA4c"[..], 3, Some(2), [0xE4, u16::from(b'c'), 0]),
        // Only one code unit is produced, so buf[1] still holds the 'c'
        // written by the previous case.
        (&b"\xE2\x98\x83"[..], 3, Some(1), [0x2603, u16::from(b'c'), 0]),
        (&b"\xE2\x98\x83d"[..], 4, Some(2), [0x2603, u16::from(b'd'), 0]),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, Some(2), [0x2603, 0xE4, 0]),
        (&b"\xF0\x9F\x93\x8E"[..], 4, Some(2), [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            Some(3),
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];
    for &(src, window, expected_len, expected_prefix) in cases.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..window]),
            expected_len
        );
        assert_eq!(buf[..3], expected_prefix[..]);
    }

    // A four-byte sequence cut off after three bytes is rejected.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3354	}
3355