lossy.rs source code [crates/core/src/str/lossy.rs]

1	use super::from_utf8_unchecked;
2	use super::validations::utf8_char_width;
3	use crate::fmt;
4	use crate::fmt::{Formatter, Write};
5	use crate::iter::FusedIterator;
6
7	impl [u8] {
8	/// Creates an iterator over the contiguous valid UTF-8 ranges of this
9	/// slice, and the non-UTF-8 fragments in between.
10	///
11	/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12	///
13	/// # Examples
14	///
15	/// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16	/// code in the form of a C-string literal (`c"..."`).
17	///
18	/// ```
19	/// use std::fmt::Write as _;
20	///
21	/// pub fn cstr_literal(bytes: &[u8]) -> String {
22	/// let mut repr = String::new();
23	/// repr.push_str("c`\"`");
24	/// for chunk in bytes.utf8_chunks() {
25	/// for ch in chunk.valid().chars() {
26	/// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27	/// write!(repr, "{}", ch.escape_debug()).unwrap();
28	/// }
29	/// for byte in chunk.invalid() {
30	/// write!(repr, "`\\`x{:02X}", byte).unwrap();
31	/// }
32	/// }
33	/// repr.push('"');
34	/// repr
35	/// }
36	///
37	/// fn main() {
38	/// let lit = cstr_literal(b"`\xfe`rris the `\xf0\x9f\xa6\x80\x07`");
39	/// let expected = stringify!(c"`\xFE`rris the 🦀`\u{7}`");
40	/// assert_eq!(lit, expected);
41	/// }
42	/// ```
43	#[stable(feature = "utf8_chunks", since = "1.79.0")]
44	pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45	Utf8Chunks { source: self }
46	}
47	}
48
49	/// An item returned by the [`Utf8Chunks`] iterator.
50	///
51	/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52	/// when decoding a UTF-8 string.
53	///
54	/// # Examples
55	///
56	/// ```
57	/// // An invalid UTF-8 string
58	/// let bytes = b"foo`\xF1\x80`bar";
59	///
60	/// // Decode the first `Utf8Chunk`
61	/// let chunk = bytes.utf8_chunks().next().unwrap();
62	///
63	/// // The first three characters are valid UTF-8
64	/// assert_eq!("foo", chunk.valid());
65	///
66	/// // The fourth character is broken
67	/// assert_eq!(b"`\xF1\x80`", chunk.invalid());
68	/// ```
69	#[stable(feature = "utf8_chunks", since = "1.79.0")]
70	#[derive(Clone, Debug, PartialEq, Eq)]
71	pub struct Utf8Chunk<'a> {
72	valid: &'a str,
73	invalid: &'a [u8],
74	}
75
76	impl<'a> Utf8Chunk<'a> {
77	/// Returns the next validated UTF-8 substring.
78	///
79	/// This substring can be empty at the start of the string or between
80	/// broken UTF-8 characters.
81	#[must_use]
82	#[stable(feature = "utf8_chunks", since = "1.79.0")]
83	pub fn valid(&self) -> &'a str {
84	self.valid
85	}
86
87	/// Returns the invalid sequence that caused a failure.
88	///
89	/// The returned slice will have a maximum length of 3 and starts after the
90	/// substring given by [`valid`]. Decoding will resume after this sequence.
91	///
92	/// If empty, this is the last chunk in the string. If non-empty, an
93	/// unexpected byte was encountered or the end of the input was reached
94	/// unexpectedly.
95	///
96	/// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97	/// CHARACTER`].
98	///
99	/// [`valid`]: Self::valid
100	/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101	#[must_use]
102	#[stable(feature = "utf8_chunks", since = "1.79.0")]
103	pub fn invalid(&self) -> &'a [u8] {
104	self.invalid
105	}
106	}
107
108	#[must_use]
109	#[unstable(feature = "str_internals", issue = "none")]
110	pub struct Debug<'a>(&'a [u8]);
111
112	#[unstable(feature = "str_internals", issue = "none")]
113	impl fmt::Debug for Debug<'_> {
114	fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115	f.write_char('"')?;
116
117	for chunk in self.0.utf8_chunks() {
118	// Valid part.
119	// Here we partially parse UTF-8 again which is suboptimal.
120	{
121	let valid = chunk.valid();
122	let mut from = `0`;
123	for (i, c) in valid.char_indices() {
124	let esc = c.escape_debug();
125	// If char needs escaping, flush backlog so far and write, else skip
126	if esc.len() != `1` {
127	f.write_str(&valid[from..i])?;
128	for c in esc {
129	f.write_char(c)?;
130	}
131	from = i + c.len_utf8();
132	}
133	}
134	f.write_str(&valid[from..])?;
135	}
136
137	// Broken parts of string as hex escape.
138	for &b in chunk.invalid() {
139	write!(f, "`\\`x{:`02`X}", b)?;
140	}
141	}
142
143	f.write_char('"')
144	}
145	}
146
147	/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148	/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149	///
150	/// This struct is created by the [`utf8_chunks`] method on bytes slices.
151	/// If you want a simple conversion from UTF-8 byte slices to string slices,
152	/// [`from_utf8`] is easier to use.
153	///
154	/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
155	///
156	/// [byteslice]: slice
157	/// [`utf8_chunks`]: slice::utf8_chunks
158	/// [`from_utf8`]: super::from_utf8
159	///
160	/// # Examples
161	///
162	/// This can be used to create functionality similar to
163	/// [`String::from_utf8_lossy`] without allocating heap memory:
164	///
165	/// ```
166	/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
167	/// for chunk in input.utf8_chunks() {
168	/// push(chunk.valid());
169	///
170	/// if !chunk.invalid().is_empty() {
171	/// push("`\u{FFFD}`");
172	/// }
173	/// }
174	/// }
175	/// ```
176	///
177	/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
178	#[must_use = "iterators are lazy and do nothing unless consumed"]
179	#[stable(feature = "utf8_chunks", since = "1.79.0")]
180	#[derive(Clone)]
181	pub struct Utf8Chunks<'a> {
182	source: &'a [u8],
183	}
184
185	impl<'a> Utf8Chunks<'a> {
186	#[doc(hidden)]
187	#[unstable(feature = "str_internals", issue = "none")]
188	pub fn debug(&self) -> Debug<'_> {
189	Debug(self.source)
190	}
191	}
192
193	#[stable(feature = "utf8_chunks", since = "1.79.0")]
194	impl<'a> Iterator for Utf8Chunks<'a> {
195	type Item = Utf8Chunk<'a>;
196
197	fn next(&mut self) -> Option<Utf8Chunk<'a>> {
198	if self.source.is_empty() {
199	return None;
200	}
201
202	const TAG_CONT_U8: u8 = `128`;
203	fn safe_get(xs: &[u8], i: usize) -> u8 {
204	*xs.get(i).unwrap_or(&`0`)
205	}
206
207	let mut i = `0`;
208	let mut valid_up_to = `0`;
209	while i < self.source.len() {
210	// SAFETY: `i < self.source.len()` per previous line.
211	// For some reason the following are both significantly slower:
212	// while let Some(&byte) = self.source.get(i) {
213	// while let Some(byte) = self.source.get(i).copied() {
214	let byte = unsafe { *self.source.get_unchecked(i) };
215	i += `1`;
216
217	if byte < `128` {
218	// This could be a `1 => ...` case in the match below, but for
219	// the common case of all-ASCII inputs, we bypass loading the
220	// sizeable UTF8_CHAR_WIDTH table into cache.
221	} else {
222	let w = utf8_char_width(byte);
223
224	match w {
225	`2` => {
226	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
227	break;
228	}
229	i += `1`;
230	}
231	`3` => {
232	match (byte, safe_get(self.source, i)) {
233	(`0xE0`, `0xA0`..=`0xBF`) => (),
234	(`0xE1`..=`0xEC`, `0x80`..=`0xBF`) => (),
235	(`0xED`, `0x80`..=`0x9F`) => (),
236	(`0xEE`..=`0xEF`, `0x80`..=`0xBF`) => (),
237	_ => break,
238	}
239	i += `1`;
240	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
241	break;
242	}
243	i += `1`;
244	}
245	`4` => {
246	match (byte, safe_get(self.source, i)) {
247	(`0xF0`, `0x90`..=`0xBF`) => (),
248	(`0xF1`..=`0xF3`, `0x80`..=`0xBF`) => (),
249	(`0xF4`, `0x80`..=`0x8F`) => (),
250	_ => break,
251	}
252	i += `1`;
253	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
254	break;
255	}
256	i += `1`;
257	if safe_get(self.source, i) & `192` != TAG_CONT_U8 {
258	break;
259	}
260	i += `1`;
261	}
262	_ => break,
263	}
264	}
265
266	valid_up_to = i;
267	}
268
269	// SAFETY: `i <= self.source.len()` because it is only ever incremented
270	// via `i += 1` and in between every single one of those increments, `i`
271	// is compared against `self.source.len()`. That happens either
272	// literally by `i < self.source.len()` in the while-loop's condition,
273	// or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
274	// loop is terminated as soon as the latest `i += 1` has made `i` no
275	// longer less than `self.source.len()`, which means it'll be at most
276	// equal to `self.source.len()`.
277	let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
278	self.source = remaining;
279
280	// SAFETY: `valid_up_to <= i` because it is only ever assigned via
281	// `valid_up_to = i` and `i` only increases.
282	let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
283
284	Some(Utf8Chunk {
285	// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
286	valid: unsafe { from_utf8_unchecked(valid) },
287	invalid,
288	})
289	}
290	}
291
292	#[stable(feature = "utf8_chunks", since = "1.79.0")]
293	impl FusedIterator for Utf8Chunks<'_> {}
294
295	#[stable(feature = "utf8_chunks", since = "1.79.0")]
296	impl fmt::Debug for Utf8Chunks<'_> {
297	fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
298	f.debug_struct("Utf8Chunks").field(name:"source", &self.debug()).finish()
299	}
300	}
301

Provided by KDAB

Definitions

Learn Rust with the experts

Find out more