lossy.rs source code [crates/core/src/str/lossy.rs]

1	use super::from_utf8_unchecked;
2	use super::validations::utf8_char_width;
3	use crate::fmt;
4	use crate::fmt::{Formatter, Write};
5	use crate::iter::FusedIterator;
6
7	impl [u8] {
8	/// Creates an iterator over the contiguous valid UTF-8 ranges of this
9	/// slice, and the non-UTF-8 fragments in between.
10	///
11	/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12	///
13	/// # Examples
14	///
15	/// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16	/// code in the form of a C-string literal (`c"..."`).
17	///
18	/// ```
19	/// use std::fmt::Write as _;
20	///
21	/// pub fn cstr_literal(bytes: &[u8]) -> String {
22	/// let mut repr = String::new();
23	/// repr.push_str("c`\"`");
24	/// for chunk in bytes.utf8_chunks() {
25	/// for ch in chunk.valid().chars() {
26	/// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27	/// write!(repr, "{}", ch.escape_debug()).unwrap();
28	/// }
29	/// for byte in chunk.invalid() {
30	/// write!(repr, "`\\`x{:02X}", byte).unwrap();
31	/// }
32	/// }
33	/// repr.push('"');
34	/// repr
35	/// }
36	///
37	/// fn main() {
38	/// let lit = cstr_literal(b"`\xfe`rris the `\xf0\x9f\xa6\x80\x07`");
39	/// let expected = stringify!(c"`\xFE`rris the 🦀`\u{7}`");
40	/// assert_eq!(lit, expected);
41	/// }
42	/// ```
43	#[stable(feature = "utf8_chunks", since = "1.79.0")]
44	pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45	Utf8Chunks { source: self }
46	}
47	}
48
49	/// An item returned by the [`Utf8Chunks`] iterator.
50	///
51	/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52	/// when decoding a UTF-8 string.
53	///
54	/// # Examples
55	///
56	/// ```
57	/// // An invalid UTF-8 string
58	/// let bytes = b"foo`\xF1\x80`bar";
59	///
60	/// // Decode the first `Utf8Chunk`
61	/// let chunk = bytes.utf8_chunks().next().unwrap();
62	///
63	/// // The first three characters are valid UTF-8
64	/// assert_eq!("foo", chunk.valid());
65	///
66	/// // The fourth character is broken
67	/// assert_eq!(b"`\xF1\x80`", chunk.invalid());
68	/// ```
69	#[stable(feature = "utf8_chunks", since = "1.79.0")]
70	#[derive(Clone, Debug, PartialEq, Eq)]
71	pub struct Utf8Chunk<'a> {
72	valid: &'a str,
73	invalid: &'a [u8],
74	}
75
76	impl<'a> Utf8Chunk<'a> {
77	/// Returns the next validated UTF-8 substring.
78	///
79	/// This substring can be empty at the start of the string or between
80	/// broken UTF-8 characters.
81	#[must_use]
82	#[stable(feature = "utf8_chunks", since = "1.79.0")]
83	pub fn valid(&self) -> &'a str {
84	self.valid
85	}
86
87	/// Returns the invalid sequence that caused a failure.
88	///
89	/// The returned slice will have a maximum length of 3 and starts after the
90	/// substring given by [`valid`]. Decoding will resume after this sequence.
91	///
92	/// If empty, this is the last chunk in the string. If non-empty, an
93	/// unexpected byte was encountered or the end of the input was reached
94	/// unexpectedly.
95	///
96	/// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97	/// CHARACTER`].
98	///
99	/// [`valid`]: Self::valid
100	/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101	#[must_use]
102	#[stable(feature = "utf8_chunks", since = "1.79.0")]
103	pub fn invalid(&self) -> &'a [u8] {
104	self.invalid
105	}
106	}
107
108	#[must_use]
109	#[unstable(feature = "str_internals", issue = "none")]
110	pub struct Debug<'a>(&'a [u8]);
111
112	#[unstable(feature = "str_internals", issue = "none")]
113	impl fmt::Debug for Debug<'_> {
114	fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115	f.write_char('"')?;
116
117	for chunk in self.0.utf8_chunks() {
118	// Valid part.
119	// Here we partially parse UTF-8 again which is suboptimal.
120	{
121	let valid = chunk.valid();
122	let mut from = `0`;
123	for (i, c) in valid.char_indices() {
124	let esc = c.escape_debug();
125	// If char needs escaping, flush backlog so far and write, else skip
126	if esc.len() != `1` {
127	f.write_str(&valid[from..i])?;
128	for c in esc {
129	f.write_char(c)?;
130	}
131	from = i + c.len_utf8();
132	}
133	}
134	f.write_str(&valid[from..])?;
135	}
136
137	// Broken parts of string as hex escape.
138	for &b in chunk.invalid() {
139	write!(f, "`\\`x{:`02`X}", b)?;
140	}
141	}
142
143	f.write_char('"')
144	}
145	}
146
147	/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148	/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149	///
150	/// If you want a simple conversion from UTF-8 byte slices to string slices,
151	/// [`from_utf8`] is easier to use.
152	///
153	/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
154	///
155	/// [byteslice]: slice
156	/// [`from_utf8`]: super::from_utf8
157	///
158	/// # Examples
159	///
160	/// This can be used to create functionality similar to
161	/// [`String::from_utf8_lossy`] without allocating heap memory:
162	///
163	/// ```
164	/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
165	/// for chunk in input.utf8_chunks() {
166	/// push(chunk.valid());
167	///
168	/// if !chunk.invalid().is_empty() {
169	/// push("`\u{FFFD}`");
170	/// }
171	/// }
172	/// }
173	/// ```
174	///
175	/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
176	#[must_use = "iterators are lazy and do nothing unless consumed"]
177	#[stable(feature = "utf8_chunks", since = "1.79.0")]
178	#[derive(Clone)]
179	pub struct Utf8Chunks<'a> {
180	source: &'a [u8],
181	}
182
183	impl<'a> Utf8Chunks<'a> {
184	#[doc(hidden)]
185	#[unstable(feature = "str_internals", issue = "none")]
186	pub fn debug(&self) -> Debug<'_> {
187	Debug(self.source)
188	}
189	}
190
191	#[stable(feature = "utf8_chunks", since = "1.79.0")]
192	impl<'a> Iterator for Utf8Chunks<'a> {
193	type Item = Utf8Chunk<'a>;
194
195	fn next(&mut self) -> Option<Utf8Chunk<'a>> {
196	if self.source.is_empty() {
197	return None;
198	}
199
200	const TAG_CONT_U8: u8 = `128`;
201	fn safe_get(xs: &[u8], i: usize) -> u8 {
202	*xs.get(i).unwrap_or(&`0`)
203	}
204
205	let mut i = `0`;
206	let mut valid_up_to = `0`;
207	while i < self.source.len() {
208	// SAFETY: `i < self.source.len()` per previous line.
209	// For some reason the following are both significantly slower:
210	// while let Some(&byte) = self.source.get(i) {
211	// while let Some(byte) = self.source.get(i).copied() {
212	let byte = unsafe { *self.source.get_unchecked(i) };
213	i += `1`;
214
215	if byte < `128` {
216	// This could be a `1 => ...` case in the match below, but for
217	// the common case of all-ASCII inputs, we bypass loading the
218	// sizeable UTF8_CHAR_WIDTH table into cache.
219	} else {
220	let w = utf8_char_width(byte);
221
222	match w {
223	`2` => {
224	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
225	break;
226	}
227	i += `1`;
228	}
229	`3` => {
230	match (byte, safe_get(self.source, i)) {
231	(`0xE0`, `0xA0`..=`0xBF`) => (),
232	(`0xE1`..=`0xEC`, `0x80`..=`0xBF`) => (),
233	(`0xED`, `0x80`..=`0x9F`) => (),
234	(`0xEE`..=`0xEF`, `0x80`..=`0xBF`) => (),
235	_ => break,
236	}
237	i += `1`;
238	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
239	break;
240	}
241	i += `1`;
242	}
243	`4` => {
244	match (byte, safe_get(self.source, i)) {
245	(`0xF0`, `0x90`..=`0xBF`) => (),
246	(`0xF1`..=`0xF3`, `0x80`..=`0xBF`) => (),
247	(`0xF4`, `0x80`..=`0x8F`) => (),
248	_ => break,
249	}
250	i += `1`;
251	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
252	break;
253	}
254	i += `1`;
255	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
256	break;
257	}
258	i += `1`;
259	}
260	_ => break,
261	}
262	}
263
264	valid_up_to = i;
265	}
266
267	// SAFETY: `i <= self.source.len()` because it is only ever incremented
268	// via `i += 1` and in between every single one of those increments, `i`
269	// is compared against `self.source.len()`. That happens either
270	// literally by `i < self.source.len()` in the while-loop's condition,
271	// or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
272	// loop is terminated as soon as the latest `i += 1` has made `i` no
273	// longer less than `self.source.len()`, which means it'll be at most
274	// equal to `self.source.len()`.
275	let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
276	self.source = remaining;
277
278	// SAFETY: `valid_up_to <= i` because it is only ever assigned via
279	// `valid_up_to = i` and `i` only increases.
280	let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
281
282	Some(Utf8Chunk {
283	// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
284	valid: unsafe { from_utf8_unchecked(valid) },
285	invalid,
286	})
287	}
288	}
289
290	#[stable(feature = "utf8_chunks", since = "1.79.0")]
291	impl FusedIterator for Utf8Chunks<'_> {}
292
293	#[stable(feature = "utf8_chunks", since = "1.79.0")]
294	impl fmt::Debug for Utf8Chunks<'_> {
295	fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
296	f.debug_struct("Utf8Chunks").field(name:"source", &self.debug()).finish()
297	}
298	}
299

Provided by KDAB

Definitions