utf8.rs source code [crates/bstr/src/utf8.rs]

1	use core::{char, cmp, fmt, str};
2
3	use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
4
5	// The UTF-8 decoder provided here is based on the one presented here:
6	// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
7	//
// We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
9	// using regex-automata that is roughly the same size. The real benefit of
10	// Hoehrmann's formulation is that the byte class mapping below is manually
11	// tailored such that each byte's class doubles as a shift to mask out the
12	// bits necessary for constructing the leading bits of each codepoint value
13	// from the initial byte.
14	//
15	// There are some minor differences between this implementation and Hoehrmann's
16	// formulation.
17	//
18	// Firstly, we make REJECT have state ID 0, since it makes the state table
19	// itself a little easier to read and is consistent with the notion that 0
20	// means "false" or "bad."
21	//
22	// Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
23	// path.
24	//
25	// Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
26	// in the core decoding loop. (Which is what regex-automata would do by
27	// default.)
28	//
29	// Fourthly, we split the byte class mapping and transition table into two
30	// arrays because it's clearer.
31	//
32	// It is unlikely that this is the fastest way to do UTF-8 decoding, however,
33	// it is fairly simple.
34
/// State ID of the accepting state of the UTF-8 automaton. Decoding and
/// validation both start in this state. The value is pre-multiplied so it can
/// be added directly to a byte class to index the transition table.
const ACCEPT: usize = 12;
/// State ID of the rejecting (dead) state of the UTF-8 automaton. It is 0 so
/// that it lines up with the usual notion of "false" or "bad".
const REJECT: usize = 0;
37
/// Maps each of the 256 possible byte values to its equivalence class, a
/// value in `0..=11`. The class is added to a (pre-multiplied) state ID to
/// index the transition table, and is also used as a shift amount when
/// extracting the payload bits of a leading byte.
///
/// SAFETY: The decode below function relies on the correctness of these
/// equivalence classes.
#[rustfmt::skip]
const CLASSES: [u8; 256] = [
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
51
/// The forward transition table of the UTF-8 automaton. Each row holds 12
/// entries, one per byte class, and state IDs are pre-multiplied by the row
/// width so that `STATES_FORWARD[state + class]` is the next state. Row 0 is
/// the REJECT state and row 1 (offset 12) is the ACCEPT state.
///
/// SAFETY: The decode below function relies on the correctness of this state
/// machine.
#[rustfmt::skip]
const STATES_FORWARD: &[u8] = &[
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
    0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
    0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
    0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
    0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
    0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
66
/// An iterator over Unicode scalar values in a byte string.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct Chars<'a> {
    /// The bytes that have not yet been yielded from either end.
    bs: &'a [u8],
}
80
81	impl<'a> Chars<'a> {
82	pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
83	Chars { bs }
84	}
85
86	/// View the underlying data as a subslice of the original data.
87	///
88	/// The slice returned has the same lifetime as the original slice, and so
89	/// the iterator can continue to be used while this exists.
90	///
91	/// # Examples
92	///
93	/// ```
94	/// use bstr::ByteSlice;
95	///
96	/// let mut chars = b"abc".chars();
97	///
98	/// assert_eq!(b"abc", chars.as_bytes());
99	/// chars.next();
100	/// assert_eq!(b"bc", chars.as_bytes());
101	/// chars.next();
102	/// chars.next();
103	/// assert_eq!(b"", chars.as_bytes());
104	/// ```
105	#[inline]
106	pub fn as_bytes(&self) -> &'a [u8] {
107	self.bs
108	}
109	}
110
111	impl<'a> Iterator for Chars<'a> {
112	type Item = char;
113
114	#[inline]
115	fn next(&mut self) -> Option<char> {
116	let (ch: char, size: usize) = decode_lossy(self.bs);
117	if size == `0` {
118	return None;
119	}
120	self.bs = &self.bs[size..];
121	Some(ch)
122	}
123	}
124
125	impl<'a> DoubleEndedIterator for Chars<'a> {
126	#[inline]
127	fn next_back(&mut self) -> Option<char> {
128	let (ch: char, size: usize) = decode_last_lossy(self.bs);
129	if size == `0` {
130	return None;
131	}
132	self.bs = &self.bs[..self.bs.len() - size];
133	Some(ch)
134	}
135	}
136
/// An iterator over Unicode scalar values in a byte string and their
/// byte index positions.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
/// UTF-8, this iterator provides both the corresponding starting and ending
/// byte indices of each codepoint yielded. The ending position is necessary to
/// slice the original byte string when invalid UTF-8 bytes are converted into
/// a Unicode replacement codepoint, since a single replacement codepoint can
/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
///
/// This iterator is created by the
/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct CharIndices<'a> {
    /// The bytes that have not yet been yielded from either end.
    bs: &'a [u8],
    /// Offset, in the original slice, of the first byte of `bs`.
    forward_index: usize,
    /// Offset, in the original slice, one past the last byte of `bs`.
    reverse_index: usize,
}
161
162	impl<'a> CharIndices<'a> {
163	pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
164	CharIndices { bs, forward_index: `0`, reverse_index: bs.len() }
165	}
166
167	/// View the underlying data as a subslice of the original data.
168	///
169	/// The slice returned has the same lifetime as the original slice, and so
170	/// the iterator can continue to be used while this exists.
171	///
172	/// # Examples
173	///
174	/// ```
175	/// use bstr::ByteSlice;
176	///
177	/// let mut it = b"abc".char_indices();
178	///
179	/// assert_eq!(b"abc", it.as_bytes());
180	/// it.next();
181	/// assert_eq!(b"bc", it.as_bytes());
182	/// it.next();
183	/// it.next();
184	/// assert_eq!(b"", it.as_bytes());
185	/// ```
186	#[inline]
187	pub fn as_bytes(&self) -> &'a [u8] {
188	self.bs
189	}
190	}
191
192	impl<'a> Iterator for CharIndices<'a> {
193	type Item = (usize, usize, char);
194
195	#[inline]
196	fn next(&mut self) -> Option<(usize, usize, char)> {
197	let index: usize = self.forward_index;
198	let (ch: char, size: usize) = decode_lossy(self.bs);
199	if size == `0` {
200	return None;
201	}
202	self.bs = &self.bs[size..];
203	self.forward_index += size;
204	Some((index, index + size, ch))
205	}
206	}
207
208	impl<'a> DoubleEndedIterator for CharIndices<'a> {
209	#[inline]
210	fn next_back(&mut self) -> Option<(usize, usize, char)> {
211	let (ch: char, size: usize) = decode_last_lossy(self.bs);
212	if size == `0` {
213	return None;
214	}
215	self.bs = &self.bs[..self.bs.len() - size];
216	self.reverse_index -= size;
217	Some((self.reverse_index, self.reverse_index + size, ch))
218	}
219	}
220
221	impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
222
223	/// An iterator over chunks of valid UTF-8 in a byte slice.
224	///
225	/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
226	#[derive(Clone, Debug)]
227	pub struct Utf8Chunks<'a> {
228	pub(super) bytes: &'a [u8],
229	}
230
231	/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
232	///
233	/// This is yielded by the
234	/// [`Utf8Chunks`](struct.Utf8Chunks.html)
235	/// iterator, which can be created via the
236	/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
237	/// method.
238	///
239	/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
240	/// are being iterated over.
241	#[cfg_attr(test, derive(Debug, PartialEq))]
242	pub struct Utf8Chunk<'a> {
243	/// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
244	///
245	/// This is empty between adjacent invalid UTF-8 byte sequences.
246	valid: &'a str,
247	/// A sequence of invalid UTF-8 bytes.
248	///
249	/// Can only be empty in the last chunk.
250	///
251	/// Should be replaced by a single unicode replacement character, if not
252	/// empty.
253	invalid: &'a BStr,
254	/// Indicates whether the invalid sequence could've been valid if there
255	/// were more bytes.
256	///
257	/// Can only be true in the last chunk.
258	incomplete: bool,
259	}
260
261	impl<'a> Utf8Chunk<'a> {
262	/// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
263	///
264	/// This may be empty if there are consecutive sequences of invalid UTF-8
265	/// bytes.
266	#[inline]
267	pub fn valid(&self) -> &'a str {
268	self.valid
269	}
270
271	/// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
272	/// immediately follow the valid UTF-8 bytes in this chunk.
273	///
274	/// This is only empty when this chunk corresponds to the last chunk in
275	/// the original bytes.
276	///
277	/// The maximum length of this slice is 3. That is, invalid UTF-8 byte
278	/// sequences greater than 1 always correspond to a valid _prefix_ of
279	/// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
280	/// of maximal subparts" strategy that is described in more detail in the
281	/// docs for the
282	/// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
283	/// method.
284	#[inline]
285	pub fn invalid(&self) -> &'a [u8] {
286	self.invalid.as_bytes()
287	}
288
289	/// Returns whether the invalid sequence might still become valid if more
290	/// bytes are added.
291	///
292	/// Returns true if the end of the input was reached unexpectedly,
293	/// without encountering an unexpected byte.
294	///
295	/// This can only be the case for the last chunk.
296	#[inline]
297	pub fn incomplete(&self) -> bool {
298	self.incomplete
299	}
300	}
301
302	impl<'a> Iterator for Utf8Chunks<'a> {
303	type Item = Utf8Chunk<'a>;
304
305	#[inline]
306	fn next(&mut self) -> Option<Utf8Chunk<'a>> {
307	if self.bytes.is_empty() {
308	return None;
309	}
310	match validate(self.bytes) {
311	Ok(()) => {
312	let valid = self.bytes;
313	self.bytes = &[];
314	Some(Utf8Chunk {
315	// SAFETY: This is safe because of the guarantees provided
316	// by utf8::validate.
317	valid: unsafe { str::from_utf8_unchecked(valid) },
318	invalid: [].as_bstr(),
319	incomplete: `false`,
320	})
321	}
322	Err(e) => {
323	let (valid, rest) = self.bytes.split_at(e.valid_up_to());
324	// SAFETY: This is safe because of the guarantees provided by
325	// utf8::validate.
326	let valid = unsafe { str::from_utf8_unchecked(valid) };
327	let (invalid_len, incomplete) = match e.error_len() {
328	Some(n) => (n, `false`),
329	None => (rest.len(), `true`),
330	};
331	let (invalid, rest) = rest.split_at(invalid_len);
332	self.bytes = rest;
333	Some(Utf8Chunk {
334	valid,
335	invalid: invalid.as_bstr(),
336	incomplete,
337	})
338	}
339	}
340	}
341
342	#[inline]
343	fn size_hint(&self) -> (usize, Option<usize>) {
344	if self.bytes.is_empty() {
345	(`0`, Some(`0`))
346	} else {
347	(`1`, Some(self.bytes.len()))
348	}
349	}
350	}
351
352	impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
353
/// An error that occurs when UTF-8 decoding fails.
///
/// This error occurs when attempting to convert a non-UTF-8 byte
/// string to a Rust string that must be valid UTF-8. For example,
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
///
/// # Example
///
/// This example shows what happens when a given byte sequence is invalid,
/// but ends with a sequence that is a possible prefix of valid UTF-8.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let s = B(b"foobar\xF1\x80\x80");
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), None);
/// ```
///
/// This example shows what happens when a given byte sequence contains
/// invalid UTF-8.
///
/// ```
/// use bstr::ByteSlice;
///
/// let s = b"foobar\xF1\x80\x80quux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// // The error length reports the maximum number of bytes that correspond to
/// // a valid prefix of a UTF-8 encoded codepoint.
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
///
/// // The fact that it's an invalid prefix does not change error_len even
/// // when it immediately precedes the end of the string.
/// let s = b"foobar\xFF";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
    /// Byte offset immediately following the last valid UTF-8 byte.
    valid_up_to: usize,
    /// Number of invalid bytes at `valid_up_to`, or `None` when the input
    /// ended before a codepoint could be completed.
    error_len: Option<usize>,
}
407
408	impl Utf8Error {
409	/// Returns the byte index of the position immediately following the last
410	/// valid UTF-8 byte.
411	///
412	/// # Example
413	///
414	/// This examples shows how `valid_up_to` can be used to retrieve a
415	/// possibly empty prefix that is guaranteed to be valid UTF-8:
416	///
417	/// ```
418	/// use bstr::ByteSlice;
419	///
420	/// let s = b"foobar`\xF1\x80\x80`quux";
421	/// let err = s.to_str().unwrap_err();
422	///
423	/// // This is guaranteed to never panic.
424	/// let string = s[..err.valid_up_to()].to_str().unwrap();
425	/// assert_eq!(string, "foobar");
426	/// ```
427	#[inline]
428	pub fn valid_up_to(&self) -> usize {
429	self.valid_up_to
430	}
431
432	/// Returns the total number of invalid UTF-8 bytes immediately following
433	/// the position returned by `valid_up_to`. This value is always at least
434	/// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8
435	/// encoded codepoint.
436	///
437	/// If the end of the original input was found before a valid UTF-8 encoded
438	/// codepoint could be completed, then this returns `None`. This is useful
439	/// when processing streams, where a `None` value signals that more input
440	/// might be needed.
441	#[inline]
442	pub fn error_len(&self) -> Option<usize> {
443	self.error_len
444	}
445	}
446
// Only available with the `std` feature, since the `Error` trait is named
// through `std` here.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {
    /// Returns a fixed, short description of this error.
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}
453
454	impl fmt::Display for Utf8Error {
455	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
456	write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
457	}
458	}
459
460	/// Returns OK if and only if the given slice is completely valid UTF-8.
461	///
462	/// If the slice isn't valid UTF-8, then an error is returned that explains
463	/// the first location at which invalid UTF-8 was detected.
464	pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
465	// The fast path for validating UTF-8. It steps through a UTF-8 automaton
466	// and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
467	// detected, it backs up and runs the slower version of the UTF-8 automaton
468	// to determine correct error information.
469	fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
470	let mut state = ACCEPT;
471	let mut i = `0`;
472
473	while i < slice.len() {
474	let b = slice[i];
475
476	// ASCII fast path. If we see two consecutive ASCII bytes, then try
477	// to validate as much ASCII as possible very quickly.
478	if state == ACCEPT
479	&& b <= `0x7F`
480	&& slice.get(i + `1`).map_or(`false`, \|&b\| b <= `0x7F`)
481	{
482	i += ascii::first_non_ascii_byte(&slice[i..]);
483	continue;
484	}
485
486	state = step(state, b);
487	if state == REJECT {
488	return Err(find_valid_up_to(slice, i));
489	}
490	i += `1`;
491	}
492	if state != ACCEPT {
493	Err(find_valid_up_to(slice, slice.len()))
494	} else {
495	Ok(())
496	}
497	}
498
499	// Given the first position at which a UTF-8 sequence was determined to be
500	// invalid, return an error that correctly reports the position at which
501	// the last complete UTF-8 sequence ends.
502	#[inline(never)]
503	fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
504	// In order to find the last valid byte, we need to back up an amount
505	// that guarantees every preceding byte is part of a valid UTF-8
506	// code unit sequence. To do this, we simply locate the last leading
507	// byte that occurs before rejected_at.
508	let mut backup = rejected_at.saturating_sub(`1`);
509	while backup > `0` && !is_leading_or_invalid_utf8_byte(slice[backup]) {
510	backup -= `1`;
511	}
512	let upto = cmp::min(slice.len(), rejected_at.saturating_add(`1`));
513	let mut err = slow(&slice[backup..upto]).unwrap_err();
514	err.valid_up_to += backup;
515	err
516	}
517
518	// Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
519	// when an invalid sequence is found. This is split out from validate so
520	// that the fast path doesn't need to keep track of the position of the
521	// last valid UTF-8 byte. In particular, tracking this requires checking
522	// for an ACCEPT state on each byte, which degrades throughput pretty
523	// badly.
524	fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
525	let mut state = ACCEPT;
526	let mut valid_up_to = `0`;
527	for (i, &b) in slice.iter().enumerate() {
528	state = step(state, b);
529	if state == ACCEPT {
530	valid_up_to = i + `1`;
531	} else if state == REJECT {
532	// Our error length must always be at least 1.
533	let error_len = Some(cmp::max(`1`, i - valid_up_to));
534	return Err(Utf8Error { valid_up_to, error_len });
535	}
536	}
537	if state != ACCEPT {
538	Err(Utf8Error { valid_up_to, error_len: None })
539	} else {
540	Ok(())
541	}
542	}
543
544	// Advance to the next state given the current state and current byte.
545	fn step(state: usize, b: u8) -> usize {
546	let class = CLASSES[b as usize];
547	// SAFETY: This is safe because 'class' is always <=11 and 'state' is
548	// always <=96. Therefore, the maximal index is 96+11 = 107, where
549	// STATES_FORWARD.len() = 108 such that every index is guaranteed to be
550	// valid by construction of the state machine and the byte equivalence
551	// classes.
552	unsafe {
553	STATES_FORWARD.get_unchecked(state + class as usize) as usize*
554	}
555	}
556
557	fast(slice)
558	}
559
560	/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
561	///
562	/// When successful, the corresponding Unicode scalar value is returned along
563	/// with the number of bytes it was encoded with. The number of bytes consumed
564	/// for a successful decode is always between 1 and 4, inclusive.
565	///
566	/// When unsuccessful, `None` is returned along with the number of bytes that
567	/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
568	/// the number of bytes consumed is always between 0 and 3, inclusive, where
569	/// 0 is only returned when `slice` is empty.
570	///
571	/// # Examples
572	///
573	/// Basic usage:
574	///
575	/// ```
576	/// use bstr::decode_utf8;
577	///
578	/// // Decoding a valid codepoint.
579	/// let (ch, size) = decode_utf8(b"`\xE2\x98\x83`");
580	/// assert_eq!(Some('☃'), ch);
581	/// assert_eq!(`3`, size);
582	///
583	/// // Decoding an incomplete codepoint.
584	/// let (ch, size) = decode_utf8(b"`\xE2\x98`");
585	/// assert_eq!(None, ch);
586	/// assert_eq!(`2`, size);
587	/// ```
588	///
589	/// This example shows how to iterate over all codepoints in UTF-8 encoded
590	/// bytes, while replacing invalid UTF-8 sequences with the replacement
591	/// codepoint:
592	///
593	/// ```
594	/// use bstr::{B, decode_utf8};
595	///
596	/// let mut bytes = B(b"`\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61`");
597	/// let mut chars = vec![];
598	/// while !bytes.is_empty() {
599	/// let (ch, size) = decode_utf8(bytes);
600	/// bytes = &bytes[size..];
601	/// chars.push(ch.unwrap_or('`\u{FFFD}`'));
602	/// }
603	/// assert_eq!(vec!['☃', '`\u{FFFD}`', '𝞃', '`\u{FFFD}`', 'a'], chars);
604	/// ```
605	#[inline]
606	pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
607	let slice = slice.as_ref();
608	match slice.first() {
609	None => return (None, `0`),
610	Some(&b) if b <= `0x7F` => return (Some(b as char), `1`),
611	_ => {}
612	}
613
614	let (mut state, mut cp, mut i) = (ACCEPT, `0`, `0`);
615	while i < slice.len() {
616	decode_step(&mut state, &mut cp, slice[i]);
617	i += `1`;
618
619	if state == ACCEPT {
620	// SAFETY: This is safe because `decode_step` guarantees that
621	// `cp` is a valid Unicode scalar value in an ACCEPT state.
622	let ch = unsafe { char::from_u32_unchecked(cp) };
623	return (Some(ch), i);
624	} else if state == REJECT {
625	// At this point, we always want to advance at least one byte.
626	return (None, cmp::max(`1`, i.saturating_sub(`1`)));
627	}
628	}
629	(None, i)
630	}
631
632	/// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
633	/// slice.
634	///
635	/// When successful, the corresponding Unicode scalar value is returned along
636	/// with the number of bytes it was encoded with. The number of bytes consumed
637	/// for a successful decode is always between 1 and 4, inclusive.
638	///
639	/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
640	/// along with the number of bytes that make up a maximal prefix of a valid
641	/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
642	/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
643	/// empty.
644	///
645	/// # Examples
646	///
647	/// Basic usage:
648	///
649	/// ```ignore
650	/// use bstr::decode_utf8_lossy;
651	///
652	/// // Decoding a valid codepoint.
653	/// let (ch, size) = decode_utf8_lossy(b"`\xE2\x98\x83`");
654	/// assert_eq!('☃', ch);
655	/// assert_eq!(`3`, size);
656	///
657	/// // Decoding an incomplete codepoint.
658	/// let (ch, size) = decode_utf8_lossy(b"`\xE2\x98`");
659	/// assert_eq!('`\u{FFFD}`', ch);
660	/// assert_eq!(`2`, size);
661	/// ```
662	///
663	/// This example shows how to iterate over all codepoints in UTF-8 encoded
664	/// bytes, while replacing invalid UTF-8 sequences with the replacement
665	/// codepoint:
666	///
667	/// ```ignore
668	/// use bstr::{B, decode_utf8_lossy};
669	///
670	/// let mut bytes = B(b"`\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61`");
671	/// let mut chars = vec![];
672	/// while !bytes.is_empty() {
673	/// let (ch, size) = decode_utf8_lossy(bytes);
674	/// bytes = &bytes[size..];
675	/// chars.push(ch);
676	/// }
677	/// assert_eq!(vec!['☃', '`\u{FFFD}`', '𝞃', '`\u{FFFD}`', 'a'], chars);
678	/// ```
679	#[inline]
680	pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
681	match decode(slice) {
682	(Some(ch: char), size: usize) => (ch, size),
683	(None, size: usize) => ('`\u{FFFD}`', size),
684	}
685	}
686
687	/// UTF-8 decode a single Unicode scalar value from the end of a slice.
688	///
689	/// When successful, the corresponding Unicode scalar value is returned along
690	/// with the number of bytes it was encoded with. The number of bytes consumed
691	/// for a successful decode is always between 1 and 4, inclusive.
692	///
693	/// When unsuccessful, `None` is returned along with the number of bytes that
694	/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
695	/// the number of bytes consumed is always between 0 and 3, inclusive, where
696	/// 0 is only returned when `slice` is empty.
697	///
698	/// # Examples
699	///
700	/// Basic usage:
701	///
702	/// ```
703	/// use bstr::decode_last_utf8;
704	///
705	/// // Decoding a valid codepoint.
706	/// let (ch, size) = decode_last_utf8(b"`\xE2\x98\x83`");
707	/// assert_eq!(Some('☃'), ch);
708	/// assert_eq!(`3`, size);
709	///
710	/// // Decoding an incomplete codepoint.
711	/// let (ch, size) = decode_last_utf8(b"`\xE2\x98`");
712	/// assert_eq!(None, ch);
713	/// assert_eq!(`2`, size);
714	/// ```
715	///
716	/// This example shows how to iterate over all codepoints in UTF-8 encoded
717	/// bytes in reverse, while replacing invalid UTF-8 sequences with the
718	/// replacement codepoint:
719	///
720	/// ```
721	/// use bstr::{B, decode_last_utf8};
722	///
723	/// let mut bytes = B(b"`\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61`");
724	/// let mut chars = vec![];
725	/// while !bytes.is_empty() {
726	/// let (ch, size) = decode_last_utf8(bytes);
727	/// bytes = &bytes[..bytes.len()-size];
728	/// chars.push(ch.unwrap_or('`\u{FFFD}`'));
729	/// }
730	/// assert_eq!(vec!['a', '`\u{FFFD}`', '𝞃', '`\u{FFFD}`', '☃'], chars);
731	/// ```
732	#[inline]
733	pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
734	// TODO: We could implement this by reversing the UTF-8 automaton, but for
735	// now, we do it the slow way by using the forward automaton.
736
737	let slice: &[u8] = slice.as_ref();
738	if slice.is_empty() {
739	return (None, `0`);
740	}
741	let mut start: usize = slice.len() - `1`;
742	let limit: usize = slice.len().saturating_sub(`4`);
743	while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
744	start -= `1`;
745	}
746	let (ch: Option, size: usize) = decode(&slice[start..]);
747	// If we didn't consume all of the bytes, then that means there's at least
748	// one stray byte that never occurs in a valid code unit prefix, so we can
749	// advance by one byte.
750	if start + size != slice.len() {
751	(None, `1`)
752	} else {
753	(ch, size)
754	}
755	}
756
757	/// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
758	///
759	/// When successful, the corresponding Unicode scalar value is returned along
760	/// with the number of bytes it was encoded with. The number of bytes consumed
761	/// for a successful decode is always between 1 and 4, inclusive.
762	///
763	/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
764	/// along with the number of bytes that make up a maximal prefix of a valid
765	/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
766	/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
767	/// empty.
768	///
769	/// # Examples
770	///
771	/// Basic usage:
772	///
773	/// ```ignore
774	/// use bstr::decode_last_utf8_lossy;
775	///
776	/// // Decoding a valid codepoint.
777	/// let (ch, size) = decode_last_utf8_lossy(b"`\xE2\x98\x83`");
778	/// assert_eq!('☃', ch);
779	/// assert_eq!(`3`, size);
780	///
781	/// // Decoding an incomplete codepoint.
782	/// let (ch, size) = decode_last_utf8_lossy(b"`\xE2\x98`");
783	/// assert_eq!('`\u{FFFD}`', ch);
784	/// assert_eq!(`2`, size);
785	/// ```
786	///
787	/// This example shows how to iterate over all codepoints in UTF-8 encoded
788	/// bytes in reverse, while replacing invalid UTF-8 sequences with the
789	/// replacement codepoint:
790	///
791	/// ```ignore
792	/// use bstr::decode_last_utf8_lossy;
793	///
794	/// let mut bytes = B(b"`\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61`");
795	/// let mut chars = vec![];
796	/// while !bytes.is_empty() {
797	/// let (ch, size) = decode_last_utf8_lossy(bytes);
798	/// bytes = &bytes[..bytes.len()-size];
799	/// chars.push(ch);
800	/// }
801	/// assert_eq!(vec!['a', '`\u{FFFD}`', '𝞃', '`\u{FFFD}`', '☃'], chars);
802	/// ```
803	#[inline]
804	pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
805	match decode_last(slice) {
806	(Some(ch: char), size: usize) => (ch, size),
807	(None, size: usize) => ('`\u{FFFD}`', size),
808	}
809	}
810
811	/// SAFETY: The decode function relies on state being equal to ACCEPT only if
812	/// cp is a valid Unicode scalar value.
813	#[inline]
814	pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
815	let class: u8 = CLASSES[b as usize];
816	let b: u32 = u32::from(b);
817	if *state == ACCEPT {
818	*cp = (`0xFF` >> class) & b;
819	} else {
820	cp = (b & `0b0011_1111`) \| (cp << `6`);
821	}
822	state = STATES_FORWARD[state + class as usize] as usize;
823	}
824
/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
///
/// Equivalently, this returns false only for continuation bytes, i.e. bytes
/// of the form `10xxxxxx`.
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}
850
#[cfg(all(test, feature = "std"))]
mod tests {
    use core::char;

    use alloc::{string::String, vec, vec::Vec};

    use crate::{
        ext_slice::{ByteSlice, B},
        tests::LOSSY_TESTS,
        utf8::{self, Utf8Error},
    };

    /// Builds the error expected when the input stops partway through a
    /// sequence: `valid_up_to` bytes were accepted and no error length is
    /// reported.
    fn utf8e(valid_up_to: usize) -> Utf8Error {
        Utf8Error { error_len: None, valid_up_to }
    }

    /// Builds the error expected when `error_len` offending bytes follow
    /// `valid_up_to` accepted bytes.
    fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
        Utf8Error { error_len: Some(error_len), valid_up_to }
    }

    /// Every value that `char::from_u32` accepts must validate once encoded.
    #[test]
    #[cfg(not(miri))]
    fn validate_all_codepoints() {
        // `filter_map` drops the values for which `from_u32` returns `None`.
        for cp in (0..=0x10FFFF).filter_map(char::from_u32) {
            let mut scratch = [0; 4];
            let encoded = cp.encode_utf8(&mut scratch);
            assert_eq!(Ok(()), utf8::validate(encoded.as_bytes()));
        }
    }

    /// Mixed runs of 1-, 3- and 4-byte sequences are accepted.
    #[test]
    fn validate_multiple_codepoints() {
        let well_formed: &[&[u8]] = &[
            b"abc",
            b"a\xE2\x98\x83a",
            b"a\xF0\x9D\x9C\xB7a",
            b"\xE2\x98\x83\xF0\x9D\x9C\xB7",
            b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",
            b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",
        ];
        for &input in well_formed {
            assert_eq!(
                Ok(()),
                utf8::validate(input),
                "input: {:?}",
                input.as_bstr(),
            );
        }
    }

    /// Checks both the reported offset and the reported error length for a
    /// range of malformed inputs.
    #[test]
    fn validate_errors() {
        /// Asserts that validating `input` fails with exactly `expected`.
        fn rejects(expected: Utf8Error, input: &[u8]) {
            assert_eq!(
                Err(expected),
                utf8::validate(input),
                "input: {:?}",
                input.as_bstr(),
            );
        }

        // A single invalid byte ...
        rejects(utf8e2(0, 1), b"\xFF");
        // ... after ASCII,
        rejects(utf8e2(1, 1), b"a\xFF");
        // ... after a 2 byte sequence,
        rejects(utf8e2(2, 1), b"\xCE\xB2\xFF");
        // ... after a 3 byte sequence,
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xFF");
        // ... and after a 4 byte sequence.
        rejects(utf8e2(4, 1), b"\xF0\x9D\x9D\xB1\xFF");

        // An invalid 2-byte sequence with a valid 1-byte prefix.
        rejects(utf8e2(0, 1), b"\xCE\xF0");
        // An invalid 3-byte sequence with a valid 2-byte prefix.
        rejects(utf8e2(0, 2), b"\xE2\x98\xF0");
        // An invalid 4-byte sequence with a valid 3-byte prefix.
        rejects(utf8e2(0, 3), b"\xF0\x9D\x9D\xF0");

        // An overlong sequence. The codepoint should be encoded as
        // \xE2\x82\xAC, but here it is spread over 4 bytes. This exercises
        // both the rejection of overlong forms and the valid_up_to offset.
        rejects(utf8e2(0, 1), b"\xF0\x82\x82\xAC");
        rejects(utf8e2(1, 1), b"a\xF0\x82\x82\xAC");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xF0\x82\x82\xAC");

        // A surrogate codepoint written with the UTF-8 scheme must not
        // validate.
        rejects(utf8e2(0, 1), b"\xED\xA0\x80");
        rejects(utf8e2(1, 1), b"a\xED\xA0\x80");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xED\xA0\x80");

        // An incomplete 2-byte sequence followed by more input.
        rejects(utf8e2(0, 1), b"\xCEa");
        rejects(utf8e2(1, 1), b"a\xCEa");
        rejects(utf8e2(3, 1), b"\xE2\x98\x83\xCE\xE2\x98\x83");
        // An incomplete 3-byte sequence followed by more input.
        rejects(utf8e2(0, 2), b"\xE2\x98a");
        rejects(utf8e2(1, 2), b"a\xE2\x98a");
        rejects(utf8e2(3, 2), b"\xE2\x98\x83\xE2\x98\xE2\x98\x83");
        // An incomplete 4-byte sequence followed by more input.
        rejects(utf8e2(0, 3), b"\xF0\x9D\x9Ca");
        rejects(utf8e2(1, 3), b"a\xF0\x9D\x9Ca");
        rejects(utf8e2(4, 3), b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83");
        rejects(utf8e2(6, 3), b"foobar\xF1\x80\x80quux");

        // An incomplete 2-byte sequence at the end of the input.
        rejects(utf8e(0), b"\xCE");
        rejects(utf8e(1), b"a\xCE");
        rejects(utf8e(3), b"\xE2\x98\x83\xCE");
        // An incomplete 3-byte sequence at the end of the input.
        rejects(utf8e(0), b"\xE2\x98");
        rejects(utf8e(1), b"a\xE2\x98");
        rejects(utf8e(3), b"\xE2\x98\x83\xE2\x98");
        // An incomplete 4-byte sequence at the end of the input.
        rejects(utf8e(0), b"\xF0\x9D\x9C");
        rejects(utf8e(1), b"a\xF0\x9D\x9C");
        rejects(utf8e(4), b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C");

        // The error must still be reported correctly after a long valid
        // prefix. This checks that the "backup" logic used to locate errors
        // is correct.
        rejects(utf8e2(8, 1), b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF");
    }

    /// Decoding from the front, one codepoint at a time, yields the
    /// characters of a valid string in order.
    #[test]
    fn decode_valid() {
        /// Repeatedly decodes the first codepoint of what is left of `text`.
        fn decode_all(text: &str) -> Vec<char> {
            let mut decoded = vec![];
            let mut rest = text;
            while !rest.is_empty() {
                let (ch, size) = utf8::decode(rest.as_bytes());
                // Like slicing, `split_at` panics if `size` does not land on
                // a char boundary.
                rest = rest.split_at(size).1;
                decoded.push(ch.unwrap());
            }
            decoded
        }

        let cases: &[(&[char], &str)] = &[
            (&['☃'], "☃"),
            (&['☃', '☃'], "☃☃"),
            (&['α', 'β', 'γ', 'δ', 'ε'], "αβγδε"),
            (&['☃', '⛄', '⛇'], "☃⛄⛇"),
            (&['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], "𝗮𝗯𝗰𝗱𝗲"),
        ];
        for &(expected, text) in cases {
            assert_eq!(
                expected,
                decode_all(text).as_slice(),
                "input: {:?}",
                text,
            );
        }
    }

    /// For malformed (or empty) input, `decode` returns no character and the
    /// listed number of bytes.
    #[test]
    fn decode_invalid() {
        // (input, expected size)
        let cases: &[(&[u8], usize)] = &[
            (b"", 0),
            (b"\xFF", 1),
            (b"\xCE\xF0", 1),
            (b"\xE2\x98\xF0", 2),
            (b"\xF0\x9D\x9D", 3),
            (b"\xF0\x9D\x9D\xF0", 3),
            (b"\xF0\x82\x82\xAC", 1),
            (b"\xED\xA0\x80", 1),
            (b"\xCEa", 1),
            (b"\xE2\x98a", 2),
            (b"\xF0\x9D\x9Ca", 3),
        ];
        for &(input, want_size) in cases {
            let (ch, size) = utf8::decode(input);
            assert_eq!(None, ch, "input: {:?}", input.as_bstr());
            assert_eq!(want_size, size, "input: {:?}", input.as_bstr());
        }
    }

    /// For malformed (or empty) input, `decode_lossy` returns U+FFFD and the
    /// listed number of bytes.
    #[test]
    fn decode_lossy() {
        // (input, expected size)
        let cases: &[(&[u8], usize)] = &[
            (b"", 0),
            (b"\xFF", 1),
            (b"\xCE\xF0", 1),
            (b"\xE2\x98\xF0", 2),
            (b"\xF0\x9D\x9D\xF0", 3),
            (b"\xF0\x82\x82\xAC", 1),
            (b"\xED\xA0\x80", 1),
            (b"\xCEa", 1),
            (b"\xE2\x98a", 2),
            (b"\xF0\x9D\x9Ca", 3),
        ];
        for &(input, want_size) in cases {
            let (ch, size) = utf8::decode_lossy(input);
            assert_eq!('\u{FFFD}', ch, "input: {:?}", input.as_bstr());
            assert_eq!(want_size, size, "input: {:?}", input.as_bstr());
        }
    }

    /// Decoding from the back, one codepoint at a time, yields the
    /// characters of a valid string in reverse order.
    #[test]
    fn decode_last_valid() {
        /// Repeatedly decodes the last codepoint of what is left of `text`.
        fn decode_all_rev(text: &str) -> Vec<char> {
            let mut decoded = Vec::new();
            let mut rest = text;
            while !rest.is_empty() {
                let (ch, size) = utf8::decode_last(rest.as_bytes());
                // Like slicing, `split_at` panics if the cut does not land
                // on a char boundary.
                rest = rest.split_at(rest.len() - size).0;
                decoded.push(ch.unwrap());
            }
            decoded
        }

        let cases: &[(&[char], &str)] = &[
            (&['☃'], "☃"),
            (&['☃', '☃'], "☃☃"),
            (&['ε', 'δ', 'γ', 'β', 'α'], "αβγδε"),
            (&['⛇', '⛄', '☃'], "☃⛄⛇"),
            (&['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], "𝗮𝗯𝗰𝗱𝗲"),
        ];
        for &(expected, text) in cases {
            assert_eq!(
                expected,
                decode_all_rev(text).as_slice(),
                "input: {:?}",
                text,
            );
        }
    }

    /// For input with a malformed (or empty) tail, `decode_last` returns no
    /// character and the listed number of bytes.
    #[test]
    fn decode_last_invalid() {
        // (input, expected size)
        let cases: &[(&[u8], usize)] = &[
            (b"", 0),
            (b"\xFF", 1),
            (b"\xCE\xF0", 1),
            (b"\xCE", 1),
            (b"\xE2\x98\xF0", 1),
            (b"\xE2\x98", 2),
            (b"\xF0\x9D\x9D\xF0", 1),
            (b"\xF0\x9D\x9D", 3),
            (b"\xF0\x82\x82\xAC", 1),
            (b"\xED\xA0\x80", 1),
            (b"\xED\xA0", 1),
            (b"\xED", 1),
            (b"a\xCE", 1),
            (b"a\xE2\x98", 2),
            (b"a\xF0\x9D\x9C", 3),
        ];
        for &(input, want_size) in cases {
            let (ch, size) = utf8::decode_last(input);
            assert_eq!(None, ch, "input: {:?}", input.as_bstr());
            assert_eq!(want_size, size, "input: {:?}", input.as_bstr());
        }
    }

    /// For input with a malformed (or empty) tail, `decode_last_lossy`
    /// returns U+FFFD and the listed number of bytes.
    #[test]
    fn decode_last_lossy() {
        // (input, expected size)
        let cases: &[(&[u8], usize)] = &[
            (b"", 0),
            (b"\xFF", 1),
            (b"\xCE\xF0", 1),
            (b"\xCE", 1),
            (b"\xE2\x98\xF0", 1),
            (b"\xE2\x98", 2),
            (b"\xF0\x9D\x9D\xF0", 1),
            (b"\xF0\x9D\x9D", 3),
            (b"\xF0\x82\x82\xAC", 1),
            (b"\xED\xA0\x80", 1),
            (b"\xED\xA0", 1),
            (b"\xED", 1),
            (b"a\xCE", 1),
            (b"a\xE2\x98", 2),
            (b"a\xF0\x9D\x9C", 3),
        ];
        for &(input, want_size) in cases {
            let (ch, size) = utf8::decode_last_lossy(input);
            assert_eq!('\u{FFFD}', ch, "input: {:?}", input.as_bstr());
            assert_eq!(want_size, size, "input: {:?}", input.as_bstr());
        }
    }

    /// Runs every `LOSSY_TESTS` entry through `chars` and `char_indices`,
    /// forwards and reversed.
    #[test]
    fn chars() {
        for (idx, &(lossy, input)) in LOSSY_TESTS.iter().enumerate() {
            let bytes = B(input);

            let forward = bytes.chars().collect::<String>();
            assert_eq!(
                lossy, forward,
                "chars(ith: {:?}, given: {:?})",
                idx, input,
            );
            let forward = bytes
                .char_indices()
                .map(|(_, _, ch)| ch)
                .collect::<String>();
            assert_eq!(
                lossy, forward,
                "char_indices(ith: {:?}, given: {:?})",
                idx, input,
            );

            // Reverse iteration must produce the same characters, backwards.
            let lossy_rev = lossy.chars().rev().collect::<String>();

            let backward = bytes.chars().rev().collect::<String>();
            assert_eq!(
                lossy_rev, backward,
                "chars.rev(ith: {:?}, given: {:?})",
                idx, input,
            );
            let backward = bytes
                .char_indices()
                .rev()
                .map(|(_, _, ch)| ch)
                .collect::<String>();
            assert_eq!(
                lossy_rev, backward,
                "char_indices.rev(ith: {:?}, given: {:?})",
                idx, input,
            );
        }
    }

    /// Checks the chunks produced for several inputs, including whether the
    /// invalid part of each chunk is flagged as incomplete.
    #[test]
    fn utf8_chunks() {
        // Shorthand constructors for the iterator and for an expected chunk.
        let chunks_of = |bytes: &'static [u8]| utf8::Utf8Chunks { bytes };
        let chunk =
            |valid: &'static str, invalid: &'static [u8], incomplete: bool| {
                utf8::Utf8Chunk { valid, invalid: invalid.as_bstr(), incomplete }
            };

        // Trailing \xC0: one chunk, not flagged incomplete.
        let mut it = chunks_of(b"123\xC0");
        assert_eq!(Some(chunk("123", b"\xC0", false)), it.next());
        assert_eq!(None, it.next());

        // Two \xFF bytes: each ends a chunk of its own.
        let mut it = chunks_of(b"123\xFF\xFF");
        assert_eq!(Some(chunk("123", b"\xFF", false)), it.next());
        assert_eq!(Some(chunk("", b"\xFF", false)), it.next());
        assert_eq!(None, it.next());

        // \xD0 at the very end: flagged incomplete.
        let mut it = chunks_of(b"123\xD0");
        assert_eq!(Some(chunk("123", b"\xD0", true)), it.next());
        assert_eq!(None, it.next());

        // \xD0 followed by more text: not flagged incomplete, and the text
        // after it forms a second chunk with an empty invalid part.
        let mut it = chunks_of(b"123\xD0456");
        assert_eq!(Some(chunk("123", b"\xD0", false)), it.next());
        assert_eq!(Some(chunk("456", b"", false)), it.next());
        assert_eq!(None, it.next());

        // Two bytes of a 3-byte sequence at the end: flagged incomplete.
        let mut it = chunks_of(b"123\xE2\x98");
        assert_eq!(Some(chunk("123", b"\xE2\x98", true)), it.next());
        assert_eq!(None, it.next());

        // Three bytes of a 4-byte sequence at the end: flagged incomplete.
        let mut it = chunks_of(b"123\xF4\x8F\xBF");
        assert_eq!(Some(chunk("123", b"\xF4\x8F\xBF", true)), it.next());
        assert_eq!(None, it.next());
    }
}
1370