1 | use super::from_utf8_unchecked; |
2 | use super::validations::utf8_char_width; |
3 | use crate::fmt; |
4 | use crate::fmt::{Formatter, Write}; |
5 | use crate::iter::FusedIterator; |
6 | |
7 | impl [u8] { |
8 | /// Creates an iterator over the contiguous valid UTF-8 ranges of this |
9 | /// slice, and the non-UTF-8 fragments in between. |
10 | /// |
11 | /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator. |
12 | /// |
13 | /// # Examples |
14 | /// |
15 | /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source |
16 | /// code in the form of a C-string literal (`c"..."`). |
17 | /// |
18 | /// ``` |
19 | /// use std::fmt::Write as _; |
20 | /// |
21 | /// pub fn cstr_literal(bytes: &[u8]) -> String { |
22 | /// let mut repr = String::new(); |
23 | /// repr.push_str("c \"" ); |
24 | /// for chunk in bytes.utf8_chunks() { |
25 | /// for ch in chunk.valid().chars() { |
26 | /// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters. |
27 | /// write!(repr, "{}" , ch.escape_debug()).unwrap(); |
28 | /// } |
29 | /// for byte in chunk.invalid() { |
30 | /// write!(repr, " \\x{:02X}" , byte).unwrap(); |
31 | /// } |
32 | /// } |
33 | /// repr.push('"' ); |
34 | /// repr |
35 | /// } |
36 | /// |
37 | /// fn main() { |
38 | /// let lit = cstr_literal(b" \xferris the \xf0\x9f\xa6\x80\x07" ); |
39 | /// let expected = stringify!(c" \xFErris the 🦀 \u{7}" ); |
40 | /// assert_eq!(lit, expected); |
41 | /// } |
42 | /// ``` |
43 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
44 | pub fn utf8_chunks(&self) -> Utf8Chunks<'_> { |
45 | Utf8Chunks { source: self } |
46 | } |
47 | } |
48 | |
/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
/// when decoding a UTF-8 string.
///
/// # Examples
///
/// ```
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = bytes.utf8_chunks().next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
///
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[stable(feature = "utf8_chunks", since = "1.79.0")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
    // Well-formed UTF-8 prefix of the bytes inspected for this chunk.
    valid: &'a str,
    // Bytes of the malformed sequence that directly follows `valid`; may be empty.
    invalid: &'a [u8],
}
75 | |
76 | impl<'a> Utf8Chunk<'a> { |
77 | /// Returns the next validated UTF-8 substring. |
78 | /// |
79 | /// This substring can be empty at the start of the string or between |
80 | /// broken UTF-8 characters. |
81 | #[must_use ] |
82 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
83 | pub fn valid(&self) -> &'a str { |
84 | self.valid |
85 | } |
86 | |
87 | /// Returns the invalid sequence that caused a failure. |
88 | /// |
89 | /// The returned slice will have a maximum length of 3 and starts after the |
90 | /// substring given by [`valid`]. Decoding will resume after this sequence. |
91 | /// |
92 | /// If empty, this is the last chunk in the string. If non-empty, an |
93 | /// unexpected byte was encountered or the end of the input was reached |
94 | /// unexpectedly. |
95 | /// |
96 | /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT |
97 | /// CHARACTER`]. |
98 | /// |
99 | /// [`valid`]: Self::valid |
100 | /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER |
101 | #[must_use ] |
102 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
103 | pub fn invalid(&self) -> &'a [u8] { |
104 | self.invalid |
105 | } |
106 | } |
107 | |
/// Adapter around a borrowed byte slice that exists only for its
/// [`fmt::Debug`] implementation, which prints the bytes as a double-quoted
/// string: valid UTF-8 runs are shown as escaped text and the bytes of
/// invalid sequences as hexadecimal escapes.
#[must_use]
#[unstable(feature = "str_internals", issue = "none")]
pub struct Debug<'a>(&'a [u8]);
111 | |
112 | #[unstable (feature = "str_internals" , issue = "none" )] |
113 | impl fmt::Debug for Debug<'_> { |
114 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
115 | f.write_char('"' )?; |
116 | |
117 | for chunk in self.0.utf8_chunks() { |
118 | // Valid part. |
119 | // Here we partially parse UTF-8 again which is suboptimal. |
120 | { |
121 | let valid = chunk.valid(); |
122 | let mut from = 0; |
123 | for (i, c) in valid.char_indices() { |
124 | let esc = c.escape_debug(); |
125 | // If char needs escaping, flush backlog so far and write, else skip |
126 | if esc.len() != 1 { |
127 | f.write_str(&valid[from..i])?; |
128 | for c in esc { |
129 | f.write_char(c)?; |
130 | } |
131 | from = i + c.len_utf8(); |
132 | } |
133 | } |
134 | f.write_str(&valid[from..])?; |
135 | } |
136 | |
137 | // Broken parts of string as hex escape. |
138 | for &b in chunk.invalid() { |
139 | write!(f, " \\x {:02X}" , b)?; |
140 | } |
141 | } |
142 | |
143 | f.write_char('"' ) |
144 | } |
145 | } |
146 | |
/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
///
/// If you want a simple conversion from UTF-8 byte slices to string slices,
/// [`from_utf8`] is easier to use.
///
/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
///
/// [byteslice]: slice
/// [`from_utf8`]: super::from_utf8
///
/// # Examples
///
/// This can be used to create functionality similar to
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
///     for chunk in input.utf8_chunks() {
///         push(chunk.valid());
///
///         if !chunk.invalid().is_empty() {
///             push("\u{FFFD}");
///         }
///     }
/// }
/// ```
///
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[stable(feature = "utf8_chunks", since = "1.79.0")]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
    // Bytes that have not been yielded yet; shrinks from the front on every `next` call.
    source: &'a [u8],
}
182 | |
183 | impl<'a> Utf8Chunks<'a> { |
184 | #[doc (hidden)] |
185 | #[unstable (feature = "str_internals" , issue = "none" )] |
186 | pub fn debug(&self) -> Debug<'_> { |
187 | Debug(self.source) |
188 | } |
189 | } |
190 | |
#[stable(feature = "utf8_chunks", since = "1.79.0")]
impl<'a> Iterator for Utf8Chunks<'a> {
    type Item = Utf8Chunk<'a>;

    /// Yields the next chunk: the longest valid UTF-8 prefix of the remaining
    /// bytes, followed by the bytes of the first malformed sequence (if any).
    ///
    /// Returns `None` once no bytes remain. Each call removes the inspected
    /// bytes from the front of `self.source`.
    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
        if self.source.is_empty() {
            return None;
        }

        // A continuation byte has the form `10xx_xxxx`: masking with 192
        // (`0b1100_0000`) must leave exactly 128 (`0b1000_0000`).
        const TAG_CONT_U8: u8 = 128;
        // Returns the byte at `i`, or 0 when `i` is out of bounds. 0 fails every
        // continuation-byte check below, so running off the end of the input is
        // handled the same way as meeting an unexpected byte.
        fn safe_get(xs: &[u8], i: usize) -> u8 {
            *xs.get(i).unwrap_or(&0)
        }

        // `i` is the scan position; `valid_up_to` trails it and only catches up
        // after a complete, well-formed character has been accepted.
        let mut i = 0;
        let mut valid_up_to = 0;
        while i < self.source.len() {
            // SAFETY: `i < self.source.len()` per previous line.
            // For some reason the following are both significantly slower:
            // while let Some(&byte) = self.source.get(i) {
            // while let Some(byte) = self.source.get(i).copied() {
            let byte = unsafe { *self.source.get_unchecked(i) };
            i += 1;

            if byte < 128 {
                // This could be a `1 => ...` case in the match below, but for
                // the common case of all-ASCII inputs, we bypass loading the
                // sizeable UTF8_CHAR_WIDTH table into cache.
            } else {
                let w = utf8_char_width(byte);

                // Every `break` below leaves `valid_up_to` at the start of the
                // broken sequence while `i` sits just past the bytes accepted
                // so far; that gap becomes the chunk's invalid part.
                match w {
                    2 => {
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                    }
                    3 => {
                        // The allowed range of the second byte depends on the
                        // lead byte: this rejects overlong forms (0xE0 followed
                        // by less than 0xA0) and surrogates (0xED followed by
                        // more than 0x9F).
                        match (byte, safe_get(self.source, i)) {
                            (0xE0, 0xA0..=0xBF) => (),
                            (0xE1..=0xEC, 0x80..=0xBF) => (),
                            (0xED, 0x80..=0x9F) => (),
                            (0xEE..=0xEF, 0x80..=0xBF) => (),
                            _ => break,
                        }
                        i += 1;
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                    }
                    4 => {
                        // Likewise for four-byte sequences: rejects overlong
                        // forms (0xF0 followed by less than 0x90) and values
                        // above U+10FFFF (0xF4 followed by more than 0x8F).
                        match (byte, safe_get(self.source, i)) {
                            (0xF0, 0x90..=0xBF) => (),
                            (0xF1..=0xF3, 0x80..=0xBF) => (),
                            (0xF4, 0x80..=0x8F) => (),
                            _ => break,
                        }
                        i += 1;
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                    }
                    // Any other width reported for a non-ASCII byte is treated
                    // as an invalid leading byte.
                    _ => break,
                }
            }

            // A whole character was accepted; extend the valid prefix.
            valid_up_to = i;
        }

        // SAFETY: `i <= self.source.len()` because it is only ever incremented
        // via `i += 1` and in between every single one of those increments, `i`
        // is compared against `self.source.len()`. That happens either
        // literally by `i < self.source.len()` in the while-loop's condition,
        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
        // loop is terminated as soon as the latest `i += 1` has made `i` no
        // longer less than `self.source.len()`, which means it'll be at most
        // equal to `self.source.len()`.
        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
        self.source = remaining;

        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
        // `valid_up_to = i` and `i` only increases.
        let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };

        Some(Utf8Chunk {
            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
            valid: unsafe { from_utf8_unchecked(valid) },
            invalid,
        })
    }
}
289 | |
// `next` returns `None` whenever `source` is empty, and `source` is only ever
// replaced by a suffix of itself, so the iterator keeps returning `None` once
// it has been exhausted.
#[stable(feature = "utf8_chunks", since = "1.79.0")]
impl FusedIterator for Utf8Chunks<'_> {}
292 | |
293 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
294 | impl fmt::Debug for Utf8Chunks<'_> { |
295 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
296 | f.debug_struct("Utf8Chunks" ).field(name:"source" , &self.debug()).finish() |
297 | } |
298 | } |
299 | |