1 | use super::from_utf8_unchecked; |
2 | use super::validations::utf8_char_width; |
3 | use crate::fmt; |
4 | use crate::fmt::{Formatter, Write}; |
5 | use crate::iter::FusedIterator; |
6 | |
7 | impl [u8] { |
8 | /// Creates an iterator over the contiguous valid UTF-8 ranges of this |
9 | /// slice, and the non-UTF-8 fragments in between. |
10 | /// |
11 | /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator. |
12 | /// |
13 | /// # Examples |
14 | /// |
15 | /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source |
16 | /// code in the form of a C-string literal (`c"..."`). |
17 | /// |
18 | /// ``` |
19 | /// use std::fmt::Write as _; |
20 | /// |
21 | /// pub fn cstr_literal(bytes: &[u8]) -> String { |
22 | /// let mut repr = String::new(); |
23 | /// repr.push_str("c \"" ); |
24 | /// for chunk in bytes.utf8_chunks() { |
25 | /// for ch in chunk.valid().chars() { |
26 | /// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters. |
27 | /// write!(repr, "{}" , ch.escape_debug()).unwrap(); |
28 | /// } |
29 | /// for byte in chunk.invalid() { |
30 | /// write!(repr, " \\x{:02X}" , byte).unwrap(); |
31 | /// } |
32 | /// } |
33 | /// repr.push('"' ); |
34 | /// repr |
35 | /// } |
36 | /// |
37 | /// fn main() { |
38 | /// let lit = cstr_literal(b" \xferris the \xf0\x9f\xa6\x80\x07" ); |
39 | /// let expected = stringify!(c" \xFErris the 🦀 \u{7}" ); |
40 | /// assert_eq!(lit, expected); |
41 | /// } |
42 | /// ``` |
43 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
44 | pub fn utf8_chunks(&self) -> Utf8Chunks<'_> { |
45 | Utf8Chunks { source: self } |
46 | } |
47 | } |
48 | |
49 | /// An item returned by the [`Utf8Chunks`] iterator. |
50 | /// |
51 | /// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character |
52 | /// when decoding a UTF-8 string. |
53 | /// |
54 | /// # Examples |
55 | /// |
56 | /// ``` |
57 | /// // An invalid UTF-8 string |
58 | /// let bytes = b"foo \xF1\x80bar" ; |
59 | /// |
60 | /// // Decode the first `Utf8Chunk` |
61 | /// let chunk = bytes.utf8_chunks().next().unwrap(); |
62 | /// |
63 | /// // The first three characters are valid UTF-8 |
64 | /// assert_eq!("foo" , chunk.valid()); |
65 | /// |
66 | /// // The fourth character is broken |
67 | /// assert_eq!(b" \xF1\x80" , chunk.invalid()); |
68 | /// ``` |
69 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
70 | #[derive (Clone, Debug, PartialEq, Eq)] |
71 | pub struct Utf8Chunk<'a> { |
72 | valid: &'a str, |
73 | invalid: &'a [u8], |
74 | } |
75 | |
76 | impl<'a> Utf8Chunk<'a> { |
77 | /// Returns the next validated UTF-8 substring. |
78 | /// |
79 | /// This substring can be empty at the start of the string or between |
80 | /// broken UTF-8 characters. |
81 | #[must_use ] |
82 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
83 | pub fn valid(&self) -> &'a str { |
84 | self.valid |
85 | } |
86 | |
87 | /// Returns the invalid sequence that caused a failure. |
88 | /// |
89 | /// The returned slice will have a maximum length of 3 and starts after the |
90 | /// substring given by [`valid`]. Decoding will resume after this sequence. |
91 | /// |
92 | /// If empty, this is the last chunk in the string. If non-empty, an |
93 | /// unexpected byte was encountered or the end of the input was reached |
94 | /// unexpectedly. |
95 | /// |
96 | /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT |
97 | /// CHARACTER`]. |
98 | /// |
99 | /// [`valid`]: Self::valid |
100 | /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER |
101 | #[must_use ] |
102 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
103 | pub fn invalid(&self) -> &'a [u8] { |
104 | self.invalid |
105 | } |
106 | } |
107 | |
108 | #[must_use ] |
109 | #[unstable (feature = "str_internals" , issue = "none" )] |
110 | pub struct Debug<'a>(&'a [u8]); |
111 | |
112 | #[unstable (feature = "str_internals" , issue = "none" )] |
113 | impl fmt::Debug for Debug<'_> { |
114 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
115 | f.write_char('"' )?; |
116 | |
117 | for chunk in self.0.utf8_chunks() { |
118 | // Valid part. |
119 | // Here we partially parse UTF-8 again which is suboptimal. |
120 | { |
121 | let valid = chunk.valid(); |
122 | let mut from = 0; |
123 | for (i, c) in valid.char_indices() { |
124 | let esc = c.escape_debug(); |
125 | // If char needs escaping, flush backlog so far and write, else skip |
126 | if esc.len() != 1 { |
127 | f.write_str(&valid[from..i])?; |
128 | for c in esc { |
129 | f.write_char(c)?; |
130 | } |
131 | from = i + c.len_utf8(); |
132 | } |
133 | } |
134 | f.write_str(&valid[from..])?; |
135 | } |
136 | |
137 | // Broken parts of string as hex escape. |
138 | for &b in chunk.invalid() { |
139 | write!(f, " \\x {:02X}" , b)?; |
140 | } |
141 | } |
142 | |
143 | f.write_char('"' ) |
144 | } |
145 | } |
146 | |
147 | /// An iterator used to decode a slice of mostly UTF-8 bytes to string slices |
148 | /// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). |
149 | /// |
150 | /// This struct is created by the [`utf8_chunks`] method on bytes slices. |
151 | /// If you want a simple conversion from UTF-8 byte slices to string slices, |
152 | /// [`from_utf8`] is easier to use. |
153 | /// |
154 | /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator. |
155 | /// |
156 | /// [byteslice]: slice |
157 | /// [`utf8_chunks`]: slice::utf8_chunks |
158 | /// [`from_utf8`]: super::from_utf8 |
159 | /// |
160 | /// # Examples |
161 | /// |
162 | /// This can be used to create functionality similar to |
163 | /// [`String::from_utf8_lossy`] without allocating heap memory: |
164 | /// |
165 | /// ``` |
166 | /// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { |
167 | /// for chunk in input.utf8_chunks() { |
168 | /// push(chunk.valid()); |
169 | /// |
170 | /// if !chunk.invalid().is_empty() { |
171 | /// push(" \u{FFFD}" ); |
172 | /// } |
173 | /// } |
174 | /// } |
175 | /// ``` |
176 | /// |
177 | /// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy |
178 | #[must_use = "iterators are lazy and do nothing unless consumed" ] |
179 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
180 | #[derive (Clone)] |
181 | pub struct Utf8Chunks<'a> { |
182 | source: &'a [u8], |
183 | } |
184 | |
185 | impl<'a> Utf8Chunks<'a> { |
186 | #[doc (hidden)] |
187 | #[unstable (feature = "str_internals" , issue = "none" )] |
188 | pub fn debug(&self) -> Debug<'_> { |
189 | Debug(self.source) |
190 | } |
191 | } |
192 | |
193 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
194 | impl<'a> Iterator for Utf8Chunks<'a> { |
195 | type Item = Utf8Chunk<'a>; |
196 | |
197 | fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
198 | if self.source.is_empty() { |
199 | return None; |
200 | } |
201 | |
202 | const TAG_CONT_U8: u8 = 128; |
203 | fn safe_get(xs: &[u8], i: usize) -> u8 { |
204 | *xs.get(i).unwrap_or(&0) |
205 | } |
206 | |
207 | let mut i = 0; |
208 | let mut valid_up_to = 0; |
209 | while i < self.source.len() { |
210 | // SAFETY: `i < self.source.len()` per previous line. |
211 | // For some reason the following are both significantly slower: |
212 | // while let Some(&byte) = self.source.get(i) { |
213 | // while let Some(byte) = self.source.get(i).copied() { |
214 | let byte = unsafe { *self.source.get_unchecked(i) }; |
215 | i += 1; |
216 | |
217 | if byte < 128 { |
218 | // This could be a `1 => ...` case in the match below, but for |
219 | // the common case of all-ASCII inputs, we bypass loading the |
220 | // sizeable UTF8_CHAR_WIDTH table into cache. |
221 | } else { |
222 | let w = utf8_char_width(byte); |
223 | |
224 | match w { |
225 | 2 => { |
226 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
227 | break; |
228 | } |
229 | i += 1; |
230 | } |
231 | 3 => { |
232 | match (byte, safe_get(self.source, i)) { |
233 | (0xE0, 0xA0..=0xBF) => (), |
234 | (0xE1..=0xEC, 0x80..=0xBF) => (), |
235 | (0xED, 0x80..=0x9F) => (), |
236 | (0xEE..=0xEF, 0x80..=0xBF) => (), |
237 | _ => break, |
238 | } |
239 | i += 1; |
240 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
241 | break; |
242 | } |
243 | i += 1; |
244 | } |
245 | 4 => { |
246 | match (byte, safe_get(self.source, i)) { |
247 | (0xF0, 0x90..=0xBF) => (), |
248 | (0xF1..=0xF3, 0x80..=0xBF) => (), |
249 | (0xF4, 0x80..=0x8F) => (), |
250 | _ => break, |
251 | } |
252 | i += 1; |
253 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
254 | break; |
255 | } |
256 | i += 1; |
257 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
258 | break; |
259 | } |
260 | i += 1; |
261 | } |
262 | _ => break, |
263 | } |
264 | } |
265 | |
266 | valid_up_to = i; |
267 | } |
268 | |
269 | // SAFETY: `i <= self.source.len()` because it is only ever incremented |
270 | // via `i += 1` and in between every single one of those increments, `i` |
271 | // is compared against `self.source.len()`. That happens either |
272 | // literally by `i < self.source.len()` in the while-loop's condition, |
273 | // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The |
274 | // loop is terminated as soon as the latest `i += 1` has made `i` no |
275 | // longer less than `self.source.len()`, which means it'll be at most |
276 | // equal to `self.source.len()`. |
277 | let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; |
278 | self.source = remaining; |
279 | |
280 | // SAFETY: `valid_up_to <= i` because it is only ever assigned via |
281 | // `valid_up_to = i` and `i` only increases. |
282 | let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
283 | |
284 | Some(Utf8Chunk { |
285 | // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
286 | valid: unsafe { from_utf8_unchecked(valid) }, |
287 | invalid, |
288 | }) |
289 | } |
290 | } |
291 | |
292 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
293 | impl FusedIterator for Utf8Chunks<'_> {} |
294 | |
295 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
296 | impl fmt::Debug for Utf8Chunks<'_> { |
297 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
298 | f.debug_struct("Utf8Chunks" ).field(name:"source" , &self.debug()).finish() |
299 | } |
300 | } |
301 | |