utf_8.rs source code [crates/encoding_rs-0.8.32/src/utf_8.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	use super::*;
11	use crate::ascii::ascii_to_basic_latin;
12	use crate::ascii::basic_latin_to_ascii;
13	use crate::ascii::validate_ascii;
14	use crate::handles::*;
15	use crate::mem::convert_utf16_to_utf8_partial;
16	use crate::variant::*;
17
// Branch-prediction hints used by the hot loops below.
//
// With the `simd-accel` feature the real compiler intrinsics are imported;
// otherwise identity functions with the same names and signatures stand in,
// so call sites do not need their own `cfg` handling.
#[cfg(feature = "simd-accel")]
use ::core::intrinsics::{likely, unlikely};

/// Fallback for `core::intrinsics::unlikely`: returns `b` unchanged.
#[cfg(not(feature = "simd-accel"))]
#[inline(always)]
fn unlikely(b: bool) -> bool {
    b
}

/// Fallback for `core::intrinsics::likely`: returns `b` unchanged.
#[cfg(not(feature = "simd-accel"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}
33
/// Wrapper around the byte lookup table used by the multi-byte UTF-8 checks
/// in this file. Wrapping the array in a struct lets it carry the alignment
/// attribute below.
34	#[repr(align(`64`))] // Align to cache lines
35	pub struct Utf8Data {
// 384 entries. Code in this file indexes it in two ways: with a raw second
// byte (0..=255) and with `lead byte + 0x80`. Any `u8` plus 0x80 is at most
// 0x17F (383), so the second form always stays inside the array.
36	pub table: [u8; `384`],
37	}
38
39	// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
40	// Instead, please regenerate using generate-encoding-data.py
41
// Validation table shared by the three- and four-byte checks in this file.
//
// * Entries 0..=255 are looked up with the second byte of a sequence.
// * Entries at `lead + 0x80` are looked up with the lead byte; the callers
//   reach those lookups with non-ASCII leads, i.e. indices 0x100..=0x17F.
//
// The callers AND the two entries together and accept the lead/second pair
// only when the result is zero. Every entry has its two low bits clear, which
// lets the callers OR in the top bits of the following bytes and compare the
// whole thing against a single constant.
42	pub static UTF8_DATA: Utf8Data = Utf8Data {
43	table: [
44	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
45	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
46	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
47	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
48	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
49	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
50	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
51	`252`, `252`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `84`, `148`, `148`, `148`,
52	`148`, `148`, `148`, `148`, `148`, `148`, `148`, `148`, `148`, `148`, `148`, `148`, `148`, `164`, `164`, `164`, `164`, `164`,
53	`164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`,
54	`164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `164`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
55	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
56	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
57	`252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`, `252`,
58	`252`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`,
59	`4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`,
60	`4`, `4`, `4`, `4`, `4`, `4`, `4`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`,
61	`8`, `8`, `8`, `8`, `8`, `8`, `8`, `16`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `8`, `32`, `8`, `8`, `64`, `8`, `8`, `8`, `128`, `4`,
62	`4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`,
63	],
64	};
65
66	// END GENERATED CODE
67
/// Returns how many leading bytes of `src` are accepted as well-formed UTF-8.
///
/// Runs of ASCII are handed to `validate_ascii`. When that reports a
/// non-ASCII byte, the multi-byte sequence starting at that byte is checked
/// here. Scanning stops at the first sequence that fails a check or is cut
/// off by the end of `src`; the bytes of that sequence are not counted.
/// If nothing is rejected, the result equals `src.len()`.
68	pub fn utf8_valid_up_to(src: &[u8]) -> usize {
// Number of bytes accepted so far; also the index of the next byte to examine.
69	let mut read = `0`;
70	'outer: loop {
// As used here, `validate_ascii` yields `None` when the rest of the input is
// all ASCII, otherwise the first non-ASCII byte and the number of bytes
// before it.
71	let mut byte = {
72	let src_remaining = &src[read..];
73	match validate_ascii(src_remaining) {
74	None => {
75	return src.len();
76	}
77	Some((non_ascii, consumed)) => {
78	read += consumed;
79	non_ascii
80	}
81	}
82	};
83	// Check for the longest sequence to avoid checking twice for the
84	// multi-byte sequences. This can't overflow with 64-bit address space,
85	// because full 64 bits aren't in use. In the 32-bit PAE case, for this
86	// to overflow would mean that the source slice would be so large that
87	// the address space of the process would not have space for any code.
88	// Therefore, the slice cannot be so long that this would overflow.
89	if likely(read + `4` <= src.len()) {
// SAFETY (for every `get_unchecked` on `src` inside 'inner): each path that
// enters or re-enters 'inner has just checked `read + 4 <= src.len()`, so
// `read` through `read + 3` are in bounds.
// SAFETY (for the table lookups): a `u8` plus 0x80 is at most 0x17F, which
// is below the table length of 384.
90	'inner: loop {
91	// At this point, `byte` is not included in `read`, because we
92	// don't yet know that a) the UTF-8 sequence is valid and b) that there
93	// is output space if it is an astral sequence.
94	// Inspecting the lead byte directly is faster than what the
95	// std lib does!
96	if likely(in_inclusive_range8(byte, `0xC2`, `0xDF`)) {
97	// Two-byte
98	let second = unsafe { *(src.get_unchecked(read + `1`)) };
99	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
100	break 'outer;
101	}
102	read += `2`;
103
104	// Next lead (manually inlined)
105	if likely(read + `4` <= src.len()) {
106	byte = unsafe { *(src.get_unchecked(read)) };
107	if byte < `0x80` {
// Accept the ASCII byte and return to the ASCII fast path.
108	read += `1`;
109	continue 'outer;
110	}
111	continue 'inner;
112	}
113	break 'inner;
114	}
115	if likely(byte < `0xF0`) {
116	'three: loop {
117	// Three-byte
118	let second = unsafe { *(src.get_unchecked(read + `1`)) };
119	let third = unsafe { *(src.get_unchecked(read + `2`)) };
// The table entries have their two low bits clear, so the expression equals
// 2 only when the lead/second AND is zero and the top two bits of `third`
// are `10`.
120	if ((UTF8_DATA.table[usize::from(second)]
121	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
122	\| (third >> `6`))
123	!= `2`
124	{
125	break 'outer;
126	}
127	read += `3`;
128
129	// Next lead (manually inlined)
130	if likely(read + `4` <= src.len()) {
131	byte = unsafe { *(src.get_unchecked(read)) };
132	if in_inclusive_range8(byte, `0xE0`, `0xEF`) {
133	continue 'three;
134	}
135	if likely(byte < `0x80`) {
136	read += `1`;
137	continue 'outer;
138	}
139	continue 'inner;
140	}
141	break 'inner;
142	}
143	}
144	// Four-byte
145	let second = unsafe { *(src.get_unchecked(read + `1`)) };
146	let third = unsafe { *(src.get_unchecked(read + `2`)) };
147	let fourth = unsafe { *(src.get_unchecked(read + `3`)) };
// Same idea widened to 16 bits: 0x202 requires a zero lead/second AND, `10`
// in the top bits of `third` (the 0x002 part) and `10` in the top bits of
// `fourth` (the 0x200 part).
148	if (u16::from(
149	UTF8_DATA.table[usize::from(second)]
150	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) },
151	) \| u16::from(third >> `6`)
152	\| (u16::from(fourth & `0xC0`) << `2`))
153	!= `0x202`
154	{
155	break 'outer;
156	}
157	read += `4`;
158
159	// Next lead
160	if likely(read + `4` <= src.len()) {
161	byte = unsafe { *(src.get_unchecked(read)) };
162	if byte < `0x80` {
163	read += `1`;
164	continue 'outer;
165	}
166	continue 'inner;
167	}
168	break 'inner;
169	}
170	}
171	// We can't have a complete 4-byte sequence, but we could still have
172	// one to three shorter sequences.
// From here on only bounds-checked indexing is used on `src`.
173	'tail: loop {
174	// >= is better for bound check elision than ==
175	if read >= src.len() {
176	break 'outer;
177	}
178	byte = src[read];
179	// At this point, `byte` is not included in `read`, because we
180	// don't yet know that a) the UTF-8 sequence is valid and b) that there
181	// is output space if it is an astral sequence.
182	// Inspecting the lead byte directly is faster than what the
183	// std lib does!
184	if byte < `0x80` {
185	read += `1`;
186	continue 'tail;
187	}
188	if in_inclusive_range8(byte, `0xC2`, `0xDF`) {
189	// Two-byte
190	let new_read = read + `2`;
191	if new_read > src.len() {
192	break 'outer;
193	}
194	let second = src[read + `1`];
195	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
196	break 'outer;
197	}
198	read += `2`;
199	continue 'tail;
200	}
201	// We need to exclude valid four byte lead bytes here, because the
202	// lead-byte entries of `UTF8_DATA.table` cover four-byte leads as well.
203	if byte < `0xF0` {
204	// Three-byte
205	let new_read = read + `3`;
206	if new_read > src.len() {
207	break 'outer;
208	}
209	let second = src[read + `1`];
210	let third = src[read + `2`];
211	if ((UTF8_DATA.table[usize::from(second)]
212	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
213	\| (third >> `6`))
214	!= `2`
215	{
216	break 'outer;
217	}
218	read += `3`;
219	// `'tail` handles sequences shorter than 4, so
220	// there can't be another sequence after this one.
221	break 'outer;
222	}
// A byte that is not ASCII, not a two-byte lead and not below 0xF0 cannot
// start a sequence that fits in the fewer than four bytes left.
223	break 'outer;
224	}
225	}
226	read
227	}
228
/// Converts the longest convertible UTF-8 prefix of `src` into UTF-16 code
/// units in `dst`.
///
/// Returns `(read, written)`: bytes consumed from `src` and code units stored
/// in `dst`. Conversion stops at the first byte sequence that fails a validity
/// check, when `src` is exhausted or ends in the middle of a sequence, or when
/// `dst` has no room for the next code point (two units are required for a
/// four-byte sequence). The sequence it stops at is neither consumed nor
/// written.
229	#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
230	pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
231	let mut read = `0`;
232	let mut written = `0`;
233	'outer: loop {
// ASCII fast path over as many elements as both buffers can hold. As used
// here, `ascii_to_basic_latin` yields `None` when all `length` bytes were
// ASCII, otherwise the first non-ASCII byte and the count converted before it.
234	let mut byte = {
235	let src_remaining = &src[read..];
236	let dst_remaining = &mut dst[written..];
237	let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len());
238	match unsafe {
239	ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
240	} {
241	None => {
242	read += length;
243	written += length;
244	break 'outer;
245	}
246	Some((non_ascii, consumed)) => {
247	read += consumed;
248	written += consumed;
249	non_ascii
250	}
251	}
252	};
253	// Check for the longest sequence to avoid checking twice for the
254	// multi-byte sequences. This can't overflow with 64-bit address space,
255	// because full 64 bits aren't in use. In the 32-bit PAE case, for this
256	// to overflow would mean that the source slice would be so large that
257	// the address space of the process would not have space for any code.
258	// Therefore, the slice cannot be so long that this would overflow.
259	if likely(read + `4` <= src.len()) {
// SAFETY (reads from `src` inside 'inner): every entry or re-entry into
// 'inner follows a `read + 4 <= src.len()` check, so `read` through
// `read + 3` are in bounds.
// SAFETY (writes to `dst` inside 'inner): each "next lead" step leaves the
// loop when `written == dst.len()`, and the four-byte path additionally
// leaves when only one slot remains, before writing two units.
260	'inner: loop {
261	// At this point, `byte` is not included in `read`, because we
262	// don't yet know that a) the UTF-8 sequence is valid and b) that there
263	// is output space if it is an astral sequence.
264	// We know, thanks to `ascii_to_basic_latin` that there is output
265	// space for at least one UTF-16 code unit, so no need to check
266	// for output space in the BMP cases.
267	// Inspecting the lead byte directly is faster than what the
268	// std lib does!
269	if likely(in_inclusive_range8(byte, `0xC2`, `0xDF`)) {
270	// Two-byte
271	let second = unsafe { *(src.get_unchecked(read + `1`)) };
272	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
273	break 'outer;
274	}
// Five payload bits from the lead, six from the continuation byte.
275	unsafe {
276	*(dst.get_unchecked_mut(written)) =
277	((u16::from(byte) & `0x1F`) << `6`) \| (u16::from(second) & `0x3F`)
278	};
279	read += `2`;
280	written += `1`;
281
282	// Next lead (manually inlined)
283	if written == dst.len() {
284	break 'outer;
285	}
286	if likely(read + `4` <= src.len()) {
287	byte = unsafe { *(src.get_unchecked(read)) };
288	if byte < `0x80` {
289	unsafe { (dst.get_unchecked_mut(written)) = u16*::from(byte) };
290	read += `1`;
291	written += `1`;
292	continue 'outer;
293	}
294	continue 'inner;
295	}
296	break 'inner;
297	}
298	if likely(byte < `0xF0`) {
299	'three: loop {
300	// Three-byte
301	let second = unsafe { *(src.get_unchecked(read + `1`)) };
302	let third = unsafe { *(src.get_unchecked(read + `2`)) };
// Equals 2 only when the lead/second table AND is zero and the top two bits
// of `third` are `10` (the table entries have their two low bits clear).
303	if ((UTF8_DATA.table[usize::from(second)]
304	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
305	\| (third >> `6`))
306	!= `2`
307	{
308	break 'outer;
309	}
310	let point = ((u16::from(byte) & `0xF`) << `12`)
311	\| ((u16::from(second) & `0x3F`) << `6`)
312	\| (u16::from(third) & `0x3F`);
313	unsafe { *(dst.get_unchecked_mut(written)) = point };
314	read += `3`;
315	written += `1`;
316
317	// Next lead (manually inlined)
318	if written == dst.len() {
319	break 'outer;
320	}
321	if likely(read + `4` <= src.len()) {
322	byte = unsafe { *(src.get_unchecked(read)) };
323	if in_inclusive_range8(byte, `0xE0`, `0xEF`) {
324	continue 'three;
325	}
326	if likely(byte < `0x80`) {
327	unsafe { (dst.get_unchecked_mut(written)) = u16*::from(byte) };
328	read += `1`;
329	written += `1`;
330	continue 'outer;
331	}
332	continue 'inner;
333	}
334	break 'inner;
335	}
336	}
337	// Four-byte
// A four-byte sequence becomes a surrogate pair, so two free slots are
// needed; stop if exactly one is left.
338	if written + `1` == dst.len() {
339	break 'outer;
340	}
341	let second = unsafe { *(src.get_unchecked(read + `1`)) };
342	let third = unsafe { *(src.get_unchecked(read + `2`)) };
343	let fourth = unsafe { *(src.get_unchecked(read + `3`)) };
// 0x202 requires a zero lead/second table AND plus `10` in the top two bits
// of both `third` and `fourth`.
344	if (u16::from(
345	UTF8_DATA.table[usize::from(second)]
346	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) },
347	) \| u16::from(third >> `6`)
348	\| (u16::from(fourth & `0xC0`) << `2`))
349	!= `0x202`
350	{
351	break 'outer;
352	}
353	let point = ((u32::from(byte) & `0x7`) << `18`)
354	\| ((u32::from(second) & `0x3F`) << `12`)
355	\| ((u32::from(third) & `0x3F`) << `6`)
356	\| (u32::from(fourth) & `0x3F`);
// 0xD7C0 is 0xD800 - (0x10000 >> 10): the subtraction of 0x10000 from the
// code point is folded into the high-surrogate base.
357	unsafe { (dst.get_unchecked_mut(written)) = (`0xD7C0` + (point >> `10`)) as u16* };
358	unsafe {
359	(dst.get_unchecked_mut(written + `1`)) = (`0xDC00` + (point & `0x3FF`)) as u16*
360	};
361	read += `4`;
362	written += `2`;
363
364	// Next lead
365	if written == dst.len() {
366	break 'outer;
367	}
368	if likely(read + `4` <= src.len()) {
369	byte = unsafe { *(src.get_unchecked(read)) };
370	if byte < `0x80` {
371	unsafe { (dst.get_unchecked_mut(written)) = u16*::from(byte) };
372	read += `1`;
373	written += `1`;
374	continue 'outer;
375	}
376	continue 'inner;
377	}
378	break 'inner;
379	}
380	}
381	// We can't have a complete 4-byte sequence, but we could still have
382	// one to three shorter sequences.
// From here on `src` and `dst` are accessed with bounds-checked indexing.
383	'tail: loop {
384	// >= is better for bound check elision than ==
385	if read >= src.len() \|\| written >= dst.len() {
386	break 'outer;
387	}
388	byte = src[read];
389	// At this point, `byte` is not included in `read`, because we
390	// don't yet know that a) the UTF-8 sequence is valid and b) that there
391	// is output space if it is an astral sequence.
392	// Inspecting the lead byte directly is faster than what the
393	// std lib does!
394	if byte < `0x80` {
395	dst[written] = u16::from(byte);
396	read += `1`;
397	written += `1`;
398	continue 'tail;
399	}
400	if in_inclusive_range8(byte, `0xC2`, `0xDF`) {
401	// Two-byte
402	let new_read = read + `2`;
403	if new_read > src.len() {
404	break 'outer;
405	}
406	let second = src[read + `1`];
407	if !in_inclusive_range8(second, `0x80`, `0xBF`) {
408	break 'outer;
409	}
410	dst[written] = ((u16::from(byte) & `0x1F`) << `6`) \| (u16::from(second) & `0x3F`);
411	read += `2`;
412	written += `1`;
413	continue 'tail;
414	}
415	// We need to exclude valid four byte lead bytes here, because the
416	// lead-byte entries of `UTF8_DATA.table` cover four-byte leads as well.
417	if byte < `0xF0` {
418	// Three-byte
419	let new_read = read + `3`;
420	if new_read > src.len() {
421	break 'outer;
422	}
423	let second = src[read + `1`];
424	let third = src[read + `2`];
425	if ((UTF8_DATA.table[usize::from(second)]
426	& unsafe { (UTF8_DATA.table.get_unchecked(byte as usize* + `0x80`)) })
427	\| (third >> `6`))
428	!= `2`
429	{
430	break 'outer;
431	}
432	let point = ((u16::from(byte) & `0xF`) << `12`)
433	\| ((u16::from(second) & `0x3F`) << `6`)
434	\| (u16::from(third) & `0x3F`);
435	dst[written] = point;
436	read += `3`;
437	written += `1`;
438	// `'tail` handles sequences shorter than 4, so
439	// there can't be another sequence after this one.
440	break 'outer;
441	}
442	break 'outer;
443	}
444	}
445	(read, written)
446	}
447
/// State carried between calls for a UTF-8 sequence that has been started
/// but not yet completed.
448	pub struct Utf8Decoder {
// Payload bits accumulated so far from the lead byte and continuation bytes.
449	code_point: u32,
450	bytes_seen: usize, // 1, 2 or 3: counts continuations only
451	bytes_needed: usize, // 1, 2 or 3: counts continuations only
// Inclusive range the next continuation byte must fall in. It is 0x80..=0xBF
// except right after the leads 0xE0, 0xED, 0xF0 and 0xF4, which narrow it.
452	lower_boundary: u8,
453	upper_boundary: u8,
454	}
455
456	impl Utf8Decoder {
/// Creates a decoder with no pending sequence and the default continuation
/// range 0x80..=0xBF.
457	pub fn new_inner() -> Utf8Decoder {
458	Utf8Decoder {
459	code_point: `0`,
460	bytes_seen: `0`,
461	bytes_needed: `0`,
462	lower_boundary: `0x80u8`,
463	upper_boundary: `0xBFu8`,
464	}
465	}
466
/// Creates a fresh decoder wrapped in `VariantDecoder::Utf8`.
467	pub fn new() -> VariantDecoder {
468	VariantDecoder::Utf8(Utf8Decoder::new_inner())
469	}
470
/// Returns `true` when no multi-byte sequence is in progress.
471	pub fn in_neutral_state(&self) -> bool {
472	self.bytes_needed == `0`
473	}
474
/// Number of input bytes already absorbed into a pending sequence: zero when
/// nothing is pending, otherwise the continuation bytes seen plus one
/// (presumably accounting for the lead byte).
475	fn extra_from_state(&self) -> usize {
476	if self.bytes_needed == `0` {
477	`0`
478	} else {
479	self.bytes_seen + `1`
480	}
481	}
482
/// Returns `byte_length + 1 + extra_from_state()`, or `None` on overflow.
483	pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
484	byte_length.checked_add(`1` + self.extra_from_state())
485	}
486
/// Returns `byte_length + 3 + extra_from_state()`, or `None` on overflow.
487	pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
488	byte_length.checked_add(`3` + self.extra_from_state())
489	}
490
/// Computes `3 + 3 * (byte_length + extra_from_state())` using the
/// `checked_add`/`checked_mul` helpers defined outside this file
/// (presumably they propagate `None` on overflow).
491	pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
492	checked_add(
493	`3`,
494	checked_mul(`3`, byte_length.checked_add(self.extra_from_state())),
495	)
496	}
497
// NOTE(review): `decoder_functions!` is defined outside this file. Judging by
// their contents, the four code blocks below look like: a preamble (empty), a
// per-iteration fast path, end-of-input handling, and the per-byte body;
// the trailing identifiers name the bindings those blocks use. Confirm
// against the macro definition.
498	decoder_functions!(
499	{},
500	{
501	// This is the fast path. The rest runs only at the
502	// start and end for partial sequences.
503	if self.bytes_needed == `0` {
504	dest.copy_utf8_up_to_invalid_from(&mut source);
505	}
506	},
507	{
// A sequence is still pending: report the lead byte plus the continuation
// bytes seen so far as malformed and clear the pending sequence.
508	if self.bytes_needed != `0` {
509	let bad_bytes = (self.bytes_seen + `1`) as u8;
510	self.code_point = `0`;
511	self.bytes_needed = `0`;
512	self.bytes_seen = `0`;
513	return (
514	DecoderResult::Malformed(bad_bytes, `0`),
515	src_consumed,
516	dest.written(),
517	);
518	}
519	},
520	{
// No sequence pending: classify `b` as ASCII, a lead byte, or an error.
521	if self.bytes_needed == `0` {
522	if b < `0x80u8` {
523	destination_handle.write_ascii(b);
524	continue;
525	}
// 0x80..=0xC1 is rejected as a lead byte.
526	if b < `0xC2u8` {
527	return (
528	DecoderResult::Malformed(`1`, `0`),
529	unread_handle.consumed(),
530	destination_handle.written(),
531	);
532	}
533	if b < `0xE0u8` {
534	self.bytes_needed = `1`;
535	self.code_point = u32::from(b) & `0x1F`;
536	continue;
537	}
538	if b < `0xF0u8` {
// These two leads restrict the byte that follows.
539	if b == `0xE0u8` {
540	self.lower_boundary = `0xA0u8`;
541	} else if b == `0xEDu8` {
542	self.upper_boundary = `0x9Fu8`;
543	}
544	self.bytes_needed = `2`;
545	self.code_point = u32::from(b) & `0xF`;
546	continue;
547	}
548	if b < `0xF5u8` {
// Likewise for the first and last four-byte leads.
549	if b == `0xF0u8` {
550	self.lower_boundary = `0x90u8`;
551	} else if b == `0xF4u8` {
552	self.upper_boundary = `0x8Fu8`;
553	}
554	self.bytes_needed = `3`;
555	self.code_point = u32::from(b) & `0x7`;
556	continue;
557	}
// 0xF5..=0xFF is rejected as a lead byte.
558	return (
559	DecoderResult::Malformed(`1`, `0`),
560	unread_handle.consumed(),
561	destination_handle.written(),
562	);
563	}
564	// self.bytes_needed != 0
565	if !(b >= self.lower_boundary && b <= self.upper_boundary) {
// `b` does not continue the pending sequence. Reset the whole state and
// report the bytes absorbed so far. Note that this path returns
// `unread_handle.unread()` where the lead-byte errors above return
// `unread_handle.consumed()`; presumably this leaves `b` to be processed
// again (verify against the handle's definition).
566	let bad_bytes = (self.bytes_seen + `1`) as u8;
567	self.code_point = `0`;
568	self.bytes_needed = `0`;
569	self.bytes_seen = `0`;
570	self.lower_boundary = `0x80u8`;
571	self.upper_boundary = `0xBFu8`;
572	return (
573	DecoderResult::Malformed(bad_bytes, `0`),
574	unread_handle.unread(),
575	destination_handle.written(),
576	);
577	}
// Any narrowed range applied only to the byte just checked; restore the
// default for the remaining continuation bytes.
578	self.lower_boundary = `0x80u8`;
579	self.upper_boundary = `0xBFu8`;
580	self.code_point = (self.code_point << `6`) \| (u32::from(b) & `0x3F`);
581	self.bytes_seen += `1`;
582	if self.bytes_seen != self.bytes_needed {
583	continue;
584	}
// Sequence complete: three continuation bytes means a four-byte sequence,
// which is written through the astral path.
585	if self.bytes_needed == `3` {
586	destination_handle.write_astral(self.code_point);
587	} else {
588	destination_handle.write_bmp_excl_ascii(self.code_point as u16);
589	}
590	self.code_point = `0`;
591	self.bytes_needed = `0`;
592	self.bytes_seen = `0`;
593	continue;
594	},
595	self,
596	src_consumed,
597	dest,
598	source,
599	b,
600	destination_handle,
601	unread_handle,
602	check_space_astral
603	);
604	}
605
/// Converts UTF-16 code units from `src` into UTF-8 bytes in `dst` for as
/// long as at least four bytes of output space remain before each non-ASCII
/// unit.
///
/// Returns `(read, written)`: code units consumed and bytes produced.
/// Unpaired surrogates are written as the bytes EF BF BD (U+FFFD). The
/// function returns when `src` is exhausted, when fewer than four bytes of
/// `dst` remain before a non-ASCII unit, or when `dst` is full before an
/// ASCII unit.
606	#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
607	#[inline(never)]
608	pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
609	let mut read = `0`;
610	let mut written = `0`;
611	'outer: loop {
// ASCII fast path over as many elements as both buffers can hold. As used
// here, `basic_latin_to_ascii` yields `None` when all `length` units were
// ASCII, otherwise the first non-ASCII unit and the count converted before it.
612	let mut unit = {
613	let src_remaining = &src[read..];
614	let dst_remaining = &mut dst[written..];
615	let length = if dst_remaining.len() < src_remaining.len() {
616	dst_remaining.len()
617	} else {
618	src_remaining.len()
619	};
620	match unsafe {
621	basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
622	} {
623	None => {
624	read += length;
625	written += length;
626	return (read, written);
627	}
628	Some((non_ascii, consumed)) => {
629	read += consumed;
630	written += consumed;
631	non_ascii
632	}
633	}
634	};
635	'inner: loop {
636	// The following loop is only broken out of as a goto forward.
637	loop {
638	// Unfortunately, this check isn't enough for the compiler to elide
639	// the bound checks on writes to dst, which is why they are manually
640	// elided, which makes a measurable difference.
// SAFETY (all `get_unchecked_mut` writes in this loop body): the check
// below guarantees `written + 4 <= dst.len()`, and no path writes more than
// four bytes before leaving the loop.
641	if written.checked_add(`4`).unwrap() > dst.len() {
642	return (read, written);
643	}
644	read += `1`;
645	if unit < `0x800` {
// Two-byte sequence. `unit` is not ASCII when this loop is entered or
// re-entered.
646	unsafe {
647	*(dst.get_unchecked_mut(written)) = (unit >> `6`) as u8 \| `0xC0u8`;
648	written += `1`;
649	*(dst.get_unchecked_mut(written)) = (unit & `0x3F`) as u8 \| `0x80u8`;
650	written += `1`;
651	}
652	break;
653	}
// After the wrapping subtraction, surrogates map to 0..=0x7FF and every
// other unit maps to something larger.
654	let unit_minus_surrogate_start = unit.wrapping_sub(`0xD800`);
655	if likely(unit_minus_surrogate_start > (`0xDFFF` - `0xD800`)) {
// Non-surrogate unit at or above 0x800: three-byte sequence.
656	unsafe {
657	*(dst.get_unchecked_mut(written)) = (unit >> `12`) as u8 \| `0xE0u8`;
658	written += `1`;
659	*(dst.get_unchecked_mut(written)) = ((unit & `0xFC0`) >> `6`) as u8 \| `0x80u8`;
660	written += `1`;
661	*(dst.get_unchecked_mut(written)) = (unit & `0x3F`) as u8 \| `0x80u8`;
662	written += `1`;
663	}
664	break;
665	}
666	if likely(unit_minus_surrogate_start <= (`0xDBFF` - `0xD800`)) {
667	// high surrogate
668	// read > src.len() is impossible, but using
669	// >= instead of == allows the compiler to elide a bound check.
670	if read >= src.len() {
671	debug_assert_eq!(read, src.len());
672	// Unpaired surrogate at the end of the buffer.
673	unsafe {
674	*(dst.get_unchecked_mut(written)) = `0xEFu8`;
675	written += `1`;
676	*(dst.get_unchecked_mut(written)) = `0xBFu8`;
677	written += `1`;
678	*(dst.get_unchecked_mut(written)) = `0xBDu8`;
679	written += `1`;
680	}
681	return (read, written);
682	}
683	let second = src[read];
684	let second_minus_low_surrogate_start = second.wrapping_sub(`0xDC00`);
685	if likely(second_minus_low_surrogate_start <= (`0xDFFF` - `0xDC00`)) {
686	// The next code unit is a low surrogate. Advance position.
687	read += `1`;
// Combines the pair into a scalar value; the constant removes both
// surrogate bases and adds the 0x10000 offset in one subtraction.
688	let astral = (u32::from(unit) << `10`) + u32::from(second)
689	- (((`0xD800u32` << `10`) - `0x10000u32`) + `0xDC00u32`);
690	unsafe {
691	*(dst.get_unchecked_mut(written)) = (astral >> `18`) as u8 \| `0xF0u8`;
692	written += `1`;
693	*(dst.get_unchecked_mut(written)) =
694	((astral & `0x3F000u32`) >> `12`) as u8 \| `0x80u8`;
695	written += `1`;
696	*(dst.get_unchecked_mut(written)) =
697	((astral & `0xFC0u32`) >> `6`) as u8 \| `0x80u8`;
698	written += `1`;
699	*(dst.get_unchecked_mut(written)) = (astral & `0x3F`) as u8 \| `0x80u8`;
700	written += `1`;
701	}
702	break;
703	}
704	// The next code unit is not a low surrogate. Don't advance
705	// position and treat the high surrogate as unpaired.
706	// Fall through
707	}
708	// Unpaired low surrogate
// (also reached by the fall-through above for an unpaired high surrogate)
709	unsafe {
710	*(dst.get_unchecked_mut(written)) = `0xEFu8`;
711	written += `1`;
712	*(dst.get_unchecked_mut(written)) = `0xBFu8`;
713	written += `1`;
714	*(dst.get_unchecked_mut(written)) = `0xBDu8`;
715	written += `1`;
716	}
717	break;
718	}
719	// Now see if the next unit is Basic Latin
720	// read > src.len() is impossible, but using
721	// >= instead of == allows the compiler to elide a bound check.
722	if read >= src.len() {
723	debug_assert_eq!(read, src.len());
724	return (read, written);
725	}
726	unit = src[read];
727	if unlikely(unit < `0x80`) {
728	// written > dst.len() is impossible, but using
729	// >= instead of == allows the compiler to elide a bound check.
730	if written >= dst.len() {
731	debug_assert_eq!(written, dst.len());
732	return (read, written);
733	}
734	dst[written] = unit as u8;
735	read += `1`;
736	written += `1`;
737	// Mysteriously, adding a punctuation check here makes
738	// the expected beneficiary cases slower!
739	continue 'outer;
740	}
741	continue 'inner;
742	}
743	}
744	}
745
/// Cold-path companion of the main UTF-16 to UTF-8 converter, for the case
/// where at most three bytes of output space are left.
///
/// Returns `(read, written)`: code units consumed from `src` and bytes stored
/// in `dst`.
///
/// * If the first unit is below 0x800, a run of one- and two-byte characters
///   is encoded until the input ends, the output cannot hold the next
///   character, or a unit at or above 0x800 is met.
/// * Otherwise exactly one three-byte character is encoded if three bytes are
///   available. Unpaired surrogates become U+FFFD. A valid surrogate pair is
///   left unconsumed, because its four-byte encoding cannot fit.
///
/// # Panics
///
/// Panics if `src` is empty.
#[inline(never)]
pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // Everything below is cold code!
    let first = src[0];

    if first < 0x800 {
        let mut consumed = 0usize;
        let mut produced = 0usize;
        let mut current = first;
        loop {
            match current {
                0..=0x7F => {
                    if produced >= dst.len() {
                        return (consumed, produced);
                    }
                    dst[produced] = current as u8;
                    consumed += 1;
                    produced += 1;
                }
                0x80..=0x7FF => {
                    if produced + 2 > dst.len() {
                        return (consumed, produced);
                    }
                    dst[produced] = (current >> 6) as u8 | 0xC0u8;
                    dst[produced + 1] = (current & 0x3F) as u8 | 0x80u8;
                    consumed += 1;
                    produced += 2;
                }
                // A three-byte (or surrogate) unit ends the run; it is left
                // unconsumed here.
                _ => return (consumed, produced),
            }
            // consumed > src.len() is impossible, but using
            // >= instead of == allows the compiler to elide a bound check.
            if consumed >= src.len() {
                debug_assert_eq!(consumed, src.len());
                return (consumed, produced);
            }
            current = src[consumed];
        }
    }

    // Nothing has been read or written yet. Whatever `first` turns out to
    // be, three output slots are needed.
    if dst.len() < 3 {
        return (0, 0);
    }

    let scalar = match first {
        // High surrogate: only a following low surrogate makes it a pair.
        0xD800..=0xDBFF => match src.get(1).copied() {
            // Valid surrogate pair, but we know it won't fit.
            Some(0xDC00..=0xDFFF) => return (0, 0),
            // Unpaired high surrogate (followed by something else, or last).
            _ => 0xFFFDu16,
        },
        // Unpaired low surrogate.
        0xDC00..=0xDFFF => 0xFFFDu16,
        other => other,
    };

    dst[0] = (scalar >> 12) as u8 | 0xE0u8;
    dst[1] = ((scalar & 0xFC0) >> 6) as u8 | 0x80u8;
    dst[2] = (scalar & 0x3F) as u8 | 0x80u8;
    let written = 3usize;
    debug_assert_eq!(written, dst.len());
    (1, written)
}
822
823	pub struct Utf8Encoder;
824
825	impl Utf8Encoder {
826	pub fn new(encoding: &'static Encoding) -> Encoder {
827	Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder))
828	}
829
830	pub fn max_buffer_length_from_utf16_without_replacement(
831	&self,
832	u16_length: usize,
833	) -> Option<usize> {
834	u16_length.checked_mul(`3`)
835	}
836
837	pub fn max_buffer_length_from_utf8_without_replacement(
838	&self,
839	byte_length: usize,
840	) -> Option<usize> {
841	Some(byte_length)
842	}
843
844	pub fn encode_from_utf16_raw(
845	&mut self,
846	src: &[u16],
847	dst: &mut [u8],
848	_last: bool,
849	) -> (EncoderResult, usize, usize) {
850	let (read, written) = convert_utf16_to_utf8_partial(src, dst);
851	(
852	if read == src.len() {
853	EncoderResult::InputEmpty
854	} else {
855	EncoderResult::OutputFull
856	},
857	read,
858	written,
859	)
860	}
861
862	pub fn encode_from_utf8_raw(
863	&mut self,
864	src: &str,
865	dst: &mut [u8],
866	_last: bool,
867	) -> (EncoderResult, usize, usize) {
868	let bytes = src.as_bytes();
869	let mut to_write = bytes.len();
870	if to_write <= dst.len() {
871	(&mut dst[..to_write]).copy_from_slice(bytes);
872	return (EncoderResult::InputEmpty, to_write, to_write);
873	}
874	to_write = dst.len();
875	// Move back until we find a UTF-8 sequence boundary.
876	while (bytes[to_write] & `0xC0`) == `0x80` {
877	to_write -= `1`;
878	}
879	(&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
880	(EncoderResult::OutputFull, to_write, to_write)
881	}
882	}
883
884	// Any copyright to the test code below this comment is dedicated to the
885	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
886
887	#[cfg(all(test, feature = "alloc"))]
888	mod tests {
889	use super::super::testing::*;
890	use super::super::*;
891
892	// fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) {
893	// decode_to_utf16_without_replacement(UTF_8, bytes, expect);
894	// }
895
// Runs the shared `decode_to_utf8` test helper with the `UTF_8` encoding on
// `bytes`; the helper is expected to check the decoded text against `expect`.
896	fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
897	decode_to_utf8(UTF_8, bytes, expect);
898	}
899
// Round-trip check for input that is already valid: the bytes of `string`
// are decoded and `string` itself is passed as the expected result.
900	fn decode_valid_utf8(string: &str) {
901	decode_utf8_to_utf8(string.as_bytes(), string);
902	}
903
// Runs the shared `encode_from_utf16` test helper with the `UTF_8` encoding;
// the helper is expected to check the encoded bytes against `expect`.
904	fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
905	encode_from_utf16(UTF_8, string, expect);
906	}
907
// Runs the shared `encode_from_utf8` test helper with the `UTF_8` encoding;
// the helper is expected to check the encoded bytes against `expect`.
908	fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
909	encode_from_utf8(UTF_8, string, expect);
910	}
911
912	fn encode_utf8_from_utf16_with_output_limit(
913	string: &[u16],
914	expect: &str,
915	limit: usize,
916	expect_result: EncoderResult,
917	) {
918	let mut dst = Vec::new();
919	{
920	dst.resize(limit, `0u8`);
921	let mut encoder = UTF_8.new_encoder();
922	let (result, read, written) =
923	encoder.encode_from_utf16_without_replacement(string, &mut dst, `false`);
924	assert_eq!(result, expect_result);
925	if expect_result == EncoderResult::InputEmpty {
926	assert_eq!(read, string.len());
927	}
928	assert_eq!(&dst[..written], expect.as_bytes());
929	}
930	{
931	dst.resize(`64`, `0u8`);
932	for (i, elem) in dst.iter_mut().enumerate() {
933	*elem = i as u8;
934	}
935	let mut encoder = UTF_8.new_encoder();
936	let (_, _, mut j) =
937	encoder.encode_from_utf16_without_replacement(string, &mut dst, `false`);
938	while j < dst.len() {
939	assert_eq!(usize::from(dst[j]), j);
940	j += `1`;
941	}
942	}
943	}
944
945	#[test]
946	fn test_utf8_decode() {
947	// Empty
948	decode_valid_utf8("");
949	// ASCII
950	decode_valid_utf8("ab");
951	// Low BMP
952	decode_valid_utf8("a`\u{E4}`Z");
953	// High BMP
954	decode_valid_utf8("a`\u{2603}`Z");
955	// Astral
956	decode_valid_utf8("a`\u{1F4A9}`Z");
957	// Low BMP with last byte missing
958	decode_utf8_to_utf8(b"a`\xC3`Z", "a`\u{FFFD}`Z");
959	decode_utf8_to_utf8(b"a`\xC3`", "a`\u{FFFD}`");
960	// High BMP with last byte missing
961	decode_utf8_to_utf8(b"a`\xE2\x98`Z", "a`\u{FFFD}`Z");
962	decode_utf8_to_utf8(b"a`\xE2\x98`", "a`\u{FFFD}`");
963	// Astral with last byte missing
964	decode_utf8_to_utf8(b"a`\xF0\x9F\x92`Z", "a`\u{FFFD}`Z");
965	decode_utf8_to_utf8(b"a`\xF0\x9F\x92`", "a`\u{FFFD}`");
966	// Lone highest continuation
967	decode_utf8_to_utf8(b"a`\xBF`Z", "a`\u{FFFD}`Z");
968	decode_utf8_to_utf8(b"a`\xBF`", "a`\u{FFFD}`");
969	// Two lone highest continuations
970	decode_utf8_to_utf8(b"a`\xBF\xBF`Z", "a`\u{FFFD}\u{FFFD}`Z");
971	decode_utf8_to_utf8(b"a`\xBF\xBF`", "a`\u{FFFD}\u{FFFD}`");
972	// Low BMP followed by lowest lone continuation
973	decode_utf8_to_utf8(b"a`\xC3\xA4\x80`Z", "a`\u{E4}\u{FFFD}`Z");
974	decode_utf8_to_utf8(b"a`\xC3\xA4\x80`", "a`\u{E4}\u{FFFD}`");
975	// Low BMP followed by highest lone continuation
976	decode_utf8_to_utf8(b"a`\xC3\xA4\xBF`Z", "a`\u{E4}\u{FFFD}`Z");
977	decode_utf8_to_utf8(b"a`\xC3\xA4\xBF`", "a`\u{E4}\u{FFFD}`");
978	// High BMP followed by lowest lone continuation
979	decode_utf8_to_utf8(b"a`\xE2\x98\x83\x80`Z", "a`\u{2603}\u{FFFD}`Z");
980	decode_utf8_to_utf8(b"a`\xE2\x98\x83\x80`", "a`\u{2603}\u{FFFD}`");
981	// High BMP followed by highest lone continuation
982	decode_utf8_to_utf8(b"a`\xE2\x98\x83\xBF`Z", "a`\u{2603}\u{FFFD}`Z");
983	decode_utf8_to_utf8(b"a`\xE2\x98\x83\xBF`", "a`\u{2603}\u{FFFD}`");
984	// Astral followed by lowest lone continuation
985	decode_utf8_to_utf8(b"a`\xF0\x9F\x92\xA9\x80`Z", "a`\u{1F4A9}\u{FFFD}`Z");
986	decode_utf8_to_utf8(b"a`\xF0\x9F\x92\xA9\x80`", "a`\u{1F4A9}\u{FFFD}`");
987	// Astral followed by highest lone continuation
988	decode_utf8_to_utf8(b"a`\xF0\x9F\x92\xA9\xBF`Z", "a`\u{1F4A9}\u{FFFD}`Z");
989	decode_utf8_to_utf8(b"a`\xF0\x9F\x92\xA9\xBF`", "a`\u{1F4A9}\u{FFFD}`");
990
991	// Boundary conditions
992	// Lowest single-byte
993	decode_valid_utf8("Z`\x00`");
994	decode_valid_utf8("Z`\x00`Z");
995	// Lowest single-byte as two-byte overlong sequence
996	decode_utf8_to_utf8(b"a`\xC0\x80`", "a`\u{FFFD}\u{FFFD}`");
997	decode_utf8_to_utf8(b"a`\xC0\x80`Z", "a`\u{FFFD}\u{FFFD}`Z");
998	// Lowest single-byte as three-byte overlong sequence
999	decode_utf8_to_utf8(b"a`\xE0\x80\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1000	decode_utf8_to_utf8(b"a`\xE0\x80\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1001	// Lowest single-byte as four-byte overlong sequence
1002	decode_utf8_to_utf8(b"a`\xF0\x80\x80\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1003	decode_utf8_to_utf8(b"a`\xF0\x80\x80\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1004	// One below lowest single-byte
1005	decode_utf8_to_utf8(b"a`\xFF`", "a`\u{FFFD}`");
1006	decode_utf8_to_utf8(b"a`\xFF`Z", "a`\u{FFFD}`Z");
1007	// Highest single-byte
1008	decode_valid_utf8("a`\x7F`");
1009	decode_valid_utf8("a`\x7F`Z");
1010	// Highest single-byte as two-byte overlong sequence
1011	decode_utf8_to_utf8(b"a`\xC1\xBF`", "a`\u{FFFD}\u{FFFD}`");
1012	decode_utf8_to_utf8(b"a`\xC1\xBF`Z", "a`\u{FFFD}\u{FFFD}`Z");
1013	// Highest single-byte as three-byte overlong sequence
1014	decode_utf8_to_utf8(b"a`\xE0\x81\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1015	decode_utf8_to_utf8(b"a`\xE0\x81\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1016	// Highest single-byte as four-byte overlong sequence
1017	decode_utf8_to_utf8(b"a`\xF0\x80\x81\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1018	decode_utf8_to_utf8(b"a`\xF0\x80\x81\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1019	// One past highest single byte (also lone continuation)
1020	decode_utf8_to_utf8(b"a`\x80`Z", "a`\u{FFFD}`Z");
1021	decode_utf8_to_utf8(b"a`\x80`", "a`\u{FFFD}`");
1022	// Two lone continuations
1023	decode_utf8_to_utf8(b"a`\x80\x80`Z", "a`\u{FFFD}\u{FFFD}`Z");
1024	decode_utf8_to_utf8(b"a`\x80\x80`", "a`\u{FFFD}\u{FFFD}`");
1025	// Three lone continuations
1026	decode_utf8_to_utf8(b"a`\x80\x80\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1027	decode_utf8_to_utf8(b"a`\x80\x80\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1028	// Four lone continuations
1029	decode_utf8_to_utf8(b"a`\x80\x80\x80\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1030	decode_utf8_to_utf8(b"a`\x80\x80\x80\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1031	// Lowest two-byte
1032	decode_utf8_to_utf8(b"a`\xC2\x80`", "a`\u{0080}`");
1033	decode_utf8_to_utf8(b"a`\xC2\x80`Z", "a`\u{0080}`Z");
1034	// Lowest two-byte as three-byte overlong sequence
1035	decode_utf8_to_utf8(b"a`\xE0\x82\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1036	decode_utf8_to_utf8(b"a`\xE0\x82\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1037	// Lowest two-byte as four-byte overlong sequence
1038	decode_utf8_to_utf8(b"a`\xF0\x80\x82\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1039	decode_utf8_to_utf8(b"a`\xF0\x80\x82\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1040	// Lead one below lowest two-byte
1041	decode_utf8_to_utf8(b"a`\xC1\x80`", "a`\u{FFFD}\u{FFFD}`");
1042	decode_utf8_to_utf8(b"a`\xC1\x80`Z", "a`\u{FFFD}\u{FFFD}`Z");
1043	// Trail one below lowest two-byte
1044	decode_utf8_to_utf8(b"a`\xC2\x7F`", "a`\u{FFFD}\u{007F}`");
1045	decode_utf8_to_utf8(b"a`\xC2\x7F`Z", "a`\u{FFFD}\u{007F}`Z");
1046	// Highest two-byte
1047	decode_utf8_to_utf8(b"a`\xDF\xBF`", "a`\u{07FF}`");
1048	decode_utf8_to_utf8(b"a`\xDF\xBF`Z", "a`\u{07FF}`Z");
1049	// Highest two-byte as three-byte overlong sequence
1050	decode_utf8_to_utf8(b"a`\xE0\x9F\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1051	decode_utf8_to_utf8(b"a`\xE0\x9F\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1052	// Highest two-byte as four-byte overlong sequence
1053	decode_utf8_to_utf8(b"a`\xF0\x80\x9F\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1054	decode_utf8_to_utf8(b"a`\xF0\x80\x9F\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1055	// Lowest three-byte
1056	decode_utf8_to_utf8(b"a`\xE0\xA0\x80`", "a`\u{0800}`");
1057	decode_utf8_to_utf8(b"a`\xE0\xA0\x80`Z", "a`\u{0800}`Z");
1058	// Lowest three-byte as four-byte overlong sequence
1059	decode_utf8_to_utf8(b"a`\xF0\x80\xA0\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1060	decode_utf8_to_utf8(b"a`\xF0\x80\xA0\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1061	// Highest below surrogates
1062	decode_utf8_to_utf8(b"a`\xED\x9F\xBF`", "a`\u{D7FF}`");
1063	decode_utf8_to_utf8(b"a`\xED\x9F\xBF`Z", "a`\u{D7FF}`Z");
1064	// Highest below surrogates as four-byte overlong sequence
1065	decode_utf8_to_utf8(b"a`\xF0\x8D\x9F\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1066	decode_utf8_to_utf8(b"a`\xF0\x8D\x9F\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1067	// First surrogate
1068	decode_utf8_to_utf8(b"a`\xED\xA0\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1069	decode_utf8_to_utf8(b"a`\xED\xA0\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1070	// First surrogate as four-byte overlong sequence
1071	decode_utf8_to_utf8(b"a`\xF0\x8D\xA0\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1072	decode_utf8_to_utf8(b"a`\xF0\x8D\xA0\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1073	// Last surrogate
1074	decode_utf8_to_utf8(b"a`\xED\xBF\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}`");
1075	decode_utf8_to_utf8(b"a`\xED\xBF\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1076	// Last surrogate as four-byte overlong sequence
1077	decode_utf8_to_utf8(b"a`\xF0\x8D\xBF\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1078	decode_utf8_to_utf8(b"a`\xF0\x8D\xBF\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1079	// Lowest above surrogates
1080	decode_utf8_to_utf8(b"a`\xEE\x80\x80`", "a`\u{E000}`");
1081	decode_utf8_to_utf8(b"a`\xEE\x80\x80`Z", "a`\u{E000}`Z");
1082	// Lowest above surrogates as four-byte overlong sequence
1083	decode_utf8_to_utf8(b"a`\xF0\x8E\x80\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1084	decode_utf8_to_utf8(b"a`\xF0\x8E\x80\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1085	// Highest three-byte
1086	decode_utf8_to_utf8(b"a`\xEF\xBF\xBF`", "a`\u{FFFF}`");
1087	decode_utf8_to_utf8(b"a`\xEF\xBF\xBF`Z", "a`\u{FFFF}`Z");
1088	// Highest three-byte as four-byte overlong sequence
1089	decode_utf8_to_utf8(b"a`\xF0\x8F\xBF\xBF`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1090	decode_utf8_to_utf8(b"a`\xF0\x8F\xBF\xBF`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1091	// Lowest four-byte
1092	decode_utf8_to_utf8(b"a`\xF0\x90\x80\x80`", "a`\u{10000}`");
1093	decode_utf8_to_utf8(b"a`\xF0\x90\x80\x80`Z", "a`\u{10000}`Z");
1094	// Highest four-byte
1095	decode_utf8_to_utf8(b"a`\xF4\x8F\xBF\xBF`", "a`\u{10FFFF}`");
1096	decode_utf8_to_utf8(b"a`\xF4\x8F\xBF\xBF`Z", "a`\u{10FFFF}`Z");
1097	// One past highest four-byte
1098	decode_utf8_to_utf8(b"a`\xF4\x90\x80\x80`", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`");
1099	decode_utf8_to_utf8(b"a`\xF4\x90\x80\x80`Z", "a`\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}`Z");
1100
1101	// Highest four-byte with last byte replaced with 0xFF
1102	decode_utf8_to_utf8(b"a`\xF4\x8F\xBF\xFF`", "a`\u{FFFD}\u{FFFD}`");
1103	decode_utf8_to_utf8(b"a`\xF4\x8F\xBF\xFF`Z", "a`\u{FFFD}\u{FFFD}`Z");
1104	}
1105
// Exercises UTF-16 to UTF-8 encoding at the first and last code points of
// each UTF-8 sequence length, on both sides of the surrogate range, and for
// paired and unpaired surrogates.
#[test]
fn test_utf8_encode() {
    // Empty input, once through each helper.
    encode_utf8_from_utf16(&[], b"");
    encode_utf8_from_utf8("", b"");

    // Each row is (UTF-16 code units, text whose UTF-8 bytes are expected).
    // Rows expecting U+FFFD feed surrogates that do not form a valid pair.
    let cases: &[(&[u16], &str)] = &[
        (&[0x0000], "\u{0000}"),
        (&[0x007F], "\u{007F}"),
        (&[0x0080], "\u{0080}"),
        (&[0x07FF], "\u{07FF}"),
        (&[0x0800], "\u{0800}"),
        (&[0xD7FF], "\u{D7FF}"),
        (&[0xD800], "\u{FFFD}"),
        (&[0xD800, 0x0062], "\u{FFFD}\u{0062}"),
        (&[0xDFFF], "\u{FFFD}"),
        (&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}"),
        (&[0xE000], "\u{E000}"),
        (&[0xFFFF], "\u{FFFF}"),
        (&[0xD800, 0xDC00], "\u{10000}"),
        (&[0xDBFF, 0xDFFF], "\u{10FFFF}"),
        (&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}"),
    ];
    for &(units, text) in cases.iter() {
        encode_utf8_from_utf16(units, text.as_bytes());
    }
}
1128
// Drives `encode_utf8_from_utf16_with_output_limit` with inputs that mix
// one-, two-, three- and four-byte UTF-8 sequences as well as unpaired
// surrogates. Each scenario is run with a limit equal to the full encoded
// length (expecting `InputEmpty`) and/or with a smaller limit (expecting
// `OutputFull` and only the leading part of the output).
// NOTE(review): the third argument is presumably the output buffer size in
// bytes; the helper is defined elsewhere in this module -- confirm there.
#[test]
fn test_encode_utf8_from_utf16_with_output_limit() {
    // Expects the whole input to be consumed within `limit`.
    fn fits(units: &[u16], expected: &str, limit: usize) {
        encode_utf8_from_utf16_with_output_limit(
            units,
            expected,
            limit,
            EncoderResult::InputEmpty,
        );
    }

    // Expects the encoder to stop early because `limit` is too small.
    fn overflows(units: &[u16], expected: &str, limit: usize) {
        encode_utf8_from_utf16_with_output_limit(
            units,
            expected,
            limit,
            EncoderResult::OutputFull,
        );
    }

    // A single scalar value with a limit equal to its encoded length.
    fits(&[0x0062], "\u{62}", 1);
    fits(&[0x00A7], "\u{A7}", 2);
    fits(&[0x2603], "\u{2603}", 3);
    fits(&[0xD83D, 0xDCA9], "\u{1F4A9}", 4);

    // A single scalar value with a limit one byte short: nothing is expected.
    overflows(&[0x00A7], "", 1);
    overflows(&[0x2603], "", 2);
    overflows(&[0xD83D, 0xDCA9], "", 3);

    // One-byte lead followed by each sequence length.
    fits(&[0x0063, 0x0062], "\u{63}\u{62}", 2);
    fits(&[0x0063, 0x00A7], "\u{63}\u{A7}", 3);
    fits(&[0x0063, 0x2603], "\u{63}\u{2603}", 4);
    fits(&[0x0063, 0xD83D, 0xDCA9], "\u{63}\u{1F4A9}", 5);

    overflows(&[0x0063, 0x00A7], "\u{63}", 2);
    overflows(&[0x0063, 0x2603], "\u{63}", 3);
    overflows(&[0x0063, 0xD83D, 0xDCA9], "\u{63}", 4);

    // Two-byte lead followed by each sequence length.
    fits(&[0x00B6, 0x0062], "\u{B6}\u{62}", 3);
    fits(&[0x00B6, 0x00A7], "\u{B6}\u{A7}", 4);
    fits(&[0x00B6, 0x2603], "\u{B6}\u{2603}", 5);
    fits(&[0x00B6, 0xD83D, 0xDCA9], "\u{B6}\u{1F4A9}", 6);

    overflows(&[0x00B6, 0x00A7], "\u{B6}", 3);
    overflows(&[0x00B6, 0x2603], "\u{B6}", 4);
    overflows(&[0x00B6, 0xD83D, 0xDCA9], "\u{B6}", 5);

    // Three-byte lead followed by each sequence length.
    fits(&[0x263A, 0x0062], "\u{263A}\u{62}", 4);
    fits(&[0x263A, 0x00A7], "\u{263A}\u{A7}", 5);
    fits(&[0x263A, 0x2603], "\u{263A}\u{2603}", 6);
    fits(&[0x263A, 0xD83D, 0xDCA9], "\u{263A}\u{1F4A9}", 7);

    overflows(&[0x263A, 0x00A7], "\u{263A}", 4);
    overflows(&[0x263A, 0x2603], "\u{263A}", 5);
    overflows(&[0x263A, 0xD83D, 0xDCA9], "\u{263A}", 6);

    // Four-byte (surrogate pair) lead followed by each sequence length.
    fits(&[0xD83D, 0xDE0E, 0x0062], "\u{1F60E}\u{62}", 5);
    fits(&[0xD83D, 0xDE0E, 0x00A7], "\u{1F60E}\u{A7}", 6);
    fits(&[0xD83D, 0xDE0E, 0x2603], "\u{1F60E}\u{2603}", 7);
    fits(&[0xD83D, 0xDE0E, 0xD83D, 0xDCA9], "\u{1F60E}\u{1F4A9}", 8);

    overflows(&[0xD83D, 0xDE0E, 0x00A7], "\u{1F60E}", 5);
    overflows(&[0xD83D, 0xDE0E, 0x2603], "\u{1F60E}", 6);
    overflows(&[0xD83D, 0xDE0E, 0xD83D, 0xDCA9], "\u{1F60E}", 7);

    // Non-ASCII prefix followed by a run of one-byte characters.
    fits(&[0x0063, 0x00B6, 0x0062, 0x0062], "\u{63}\u{B6}\u{62}\u{62}", 5);
    overflows(&[0x0063, 0x00B6, 0x0062, 0x0062], "\u{63}\u{B6}\u{62}", 4);

    fits(
        &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
        "\u{63}\u{B6}\u{62}\u{62}\u{62}",
        6,
    );
    overflows(
        &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
        "\u{63}\u{B6}\u{62}\u{62}",
        5,
    );

    fits(&[0x263A, 0x0062, 0x0062], "\u{263A}\u{62}\u{62}", 5);
    overflows(&[0x263A, 0x0062, 0x0062], "\u{263A}\u{62}", 4);

    fits(&[0x263A, 0x0062, 0x0062, 0x0062], "\u{263A}\u{62}\u{62}\u{62}", 6);
    overflows(&[0x263A, 0x0062, 0x0062, 0x0062], "\u{263A}\u{62}\u{62}", 5);

    // Two-byte character after a non-ASCII prefix, with and without a
    // trailing one-byte character.
    fits(&[0x0063, 0x00B6, 0x00A7], "\u{63}\u{B6}\u{A7}", 5);
    overflows(&[0x0063, 0x00B6, 0x00A7], "\u{63}\u{B6}", 4);

    fits(&[0x0063, 0x00B6, 0x00A7, 0x0062], "\u{63}\u{B6}\u{A7}\u{62}", 6);
    overflows(&[0x0063, 0x00B6, 0x00A7, 0x0062], "\u{63}\u{B6}\u{A7}", 5);

    fits(&[0x263A, 0x00A7, 0x0062], "\u{263A}\u{A7}\u{62}", 6);
    overflows(&[0x263A, 0x00A7, 0x0062], "\u{263A}\u{A7}", 5);

    // Two-byte character that is the last unit and does not fit.
    fits(&[0x0063, 0x00B6, 0x0062, 0x00A7], "\u{63}\u{B6}\u{62}\u{A7}", 6);
    overflows(&[0x0063, 0x00B6, 0x0062, 0x00A7], "\u{63}\u{B6}\u{62}", 5);

    fits(&[0x263A, 0x0062, 0x00A7], "\u{263A}\u{62}\u{A7}", 6);
    overflows(&[0x263A, 0x0062, 0x00A7], "\u{263A}\u{62}", 5);

    // Three-byte character at the end.
    fits(&[0x0063, 0x00B6, 0x2603], "\u{63}\u{B6}\u{2603}", 6);
    overflows(&[0x0063, 0x00B6, 0x2603], "\u{63}\u{B6}", 5);

    fits(&[0x263A, 0x2603], "\u{263A}\u{2603}", 6);
    overflows(&[0x263A, 0x2603], "\u{263A}", 5);

    // Unpaired high surrogate at the end: U+FFFD (three bytes) is expected.
    fits(&[0x0063, 0x00B6, 0xD83D], "\u{63}\u{B6}\u{FFFD}", 6);
    overflows(&[0x0063, 0x00B6, 0xD83D], "\u{63}\u{B6}", 5);

    fits(&[0x263A, 0xD83D], "\u{263A}\u{FFFD}", 6);
    overflows(&[0x263A, 0xD83D], "\u{263A}", 5);

    // Unpaired low surrogate at the end: U+FFFD (three bytes) is expected.
    fits(&[0x0063, 0x00B6, 0xDCA9], "\u{63}\u{B6}\u{FFFD}", 6);
    overflows(&[0x0063, 0x00B6, 0xDCA9], "\u{63}\u{B6}", 5);

    fits(&[0x263A, 0xDCA9], "\u{263A}\u{FFFD}", 6);
    overflows(&[0x263A, 0xDCA9], "\u{263A}", 5);
}
1526
// Checks that an output slice sized by
// `max_buffer_length_from_utf16_without_replacement` is large enough for the
// encoder to consume the whole input.
#[test]
fn test_utf8_max_length_from_utf16() {
    // Four BMP code units that each take three bytes in UTF-8 (12 bytes).
    let input: [u16; 4] = [0x2C9F, 0x2CA9, 0x2CA3, 0x2C9F];
    let mut output = [0u8; 13];
    let mut encoder = UTF_8.new_encoder();

    // Slicing panics if the reported length exceeds the 13-byte buffer.
    let needed = encoder
        .max_buffer_length_from_utf16_without_replacement(input.len())
        .unwrap();
    let destination = &mut output[..needed];

    // Only the result code matters here; the read/written counts are ignored.
    let outcome = encoder
        .encode_from_utf16_without_replacement(&input, destination, true)
        .0;
    assert_eq!(outcome, EncoderResult::InputEmpty);
}
1539
// Feeds the bytes EF BF BE (the UTF-8 form of U+FFFE, which shares its first
// byte with the UTF-8 BOM EF BB BF) to a decoder one byte per call. Nothing
// may be written until the final byte arrives, at which point exactly one
// code unit, 0xFFFE, is expected with no error reported.
#[test]
fn test_decode_bom_prefixed_split_byte_triple() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();

    // The first two bytes are consumed silently; more input may follow.
    let leading: [&[u8]; 2] = [b"\xEF", b"\xBF"];
    for &byte in leading.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errored) =
            decoder.decode_to_utf16(byte, &mut output[..needed], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!errored);
    }

    // The last byte completes the sequence and ends the stream.
    let needed = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errored) =
        decoder.decode_to_utf16(b"\xBE", &mut output[..needed], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 1);
    assert!(!errored);
    assert_eq!(output[0], 0xFFFE);
}
1573
// Feeds EF and then BC in separate calls and ends the stream after the second
// byte. EF BC is an incomplete three-byte sequence, so the final call is
// expected to report an error and write a single U+FFFD.
#[test]
fn test_decode_bom_prefixed_split_byte_pair() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();

    // First byte: consumed, nothing written, no error yet.
    let needed = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errored) =
        decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 0);
    assert!(!errored);

    // Second byte with `last == true`: the truncated sequence is replaced.
    let needed = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errored) =
        decoder.decode_to_utf16(b"\xBC", &mut output[..needed], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 1);
    assert!(errored);
    assert_eq!(output[0], 0xFFFD);
}
1598
// Feeds only the byte EF (the first byte of the UTF-8 BOM) and ends the
// stream immediately. The lone byte is expected to be reported as an error
// and replaced with a single U+FFFD.
#[test]
fn test_decode_bom_prefix() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();

    let needed = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errored) =
        decoder.decode_to_utf16(b"\xEF", &mut output[..needed], true);

    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 1);
    assert!(errored);
    assert_eq!(output[0], 0xFFFD);
}
1614
// Decodes U+00E4 followed by "a" (three input bytes) into a buffer with room
// for a single UTF-16 code unit. The decoder is expected to stop with
// `OutputFull` after consuming the two-byte sequence and writing 0x00E4,
// leaving the trailing "a" unread.
#[test]
fn test_tail() {
    let source = "\u{E4}a".as_bytes();
    let mut output = [0u16; 1];
    let mut decoder = UTF_8.new_decoder_without_bom_handling();

    let (status, consumed, produced, errored) =
        decoder.decode_to_utf16(source, &mut output, false);

    assert_eq!(status, CoderResult::OutputFull);
    assert_eq!(consumed, 2);
    assert_eq!(produced, 1);
    assert!(!errored);
    assert_eq!(output[0], 0x00E4);
}
1629	}
1630