1 | use crate::fmt; |
2 | use crate::fmt::Formatter; |
3 | use crate::fmt::Write; |
4 | use crate::iter::FusedIterator; |
5 | |
6 | use super::from_utf8_unchecked; |
7 | use super::validations::utf8_char_width; |
8 | |
9 | /// An item returned by the [`Utf8Chunks`] iterator. |
10 | /// |
11 | /// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character |
12 | /// when decoding a UTF-8 string. |
13 | /// |
14 | /// # Examples |
15 | /// |
16 | /// ``` |
17 | /// #![feature(utf8_chunks)] |
18 | /// |
19 | /// use std::str::Utf8Chunks; |
20 | /// |
21 | /// // An invalid UTF-8 string |
22 | /// let bytes = b"foo \xF1\x80bar" ; |
23 | /// |
24 | /// // Decode the first `Utf8Chunk` |
25 | /// let chunk = Utf8Chunks::new(bytes).next().unwrap(); |
26 | /// |
27 | /// // The first three characters are valid UTF-8 |
28 | /// assert_eq!("foo" , chunk.valid()); |
29 | /// |
30 | /// // The fourth character is broken |
31 | /// assert_eq!(b" \xF1\x80" , chunk.invalid()); |
32 | /// ``` |
33 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
34 | #[derive (Clone, Debug, PartialEq, Eq)] |
35 | pub struct Utf8Chunk<'a> { |
36 | valid: &'a str, |
37 | invalid: &'a [u8], |
38 | } |
39 | |
40 | impl<'a> Utf8Chunk<'a> { |
41 | /// Returns the next validated UTF-8 substring. |
42 | /// |
43 | /// This substring can be empty at the start of the string or between |
44 | /// broken UTF-8 characters. |
45 | #[must_use ] |
46 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
47 | pub fn valid(&self) -> &'a str { |
48 | self.valid |
49 | } |
50 | |
51 | /// Returns the invalid sequence that caused a failure. |
52 | /// |
53 | /// The returned slice will have a maximum length of 3 and starts after the |
54 | /// substring given by [`valid`]. Decoding will resume after this sequence. |
55 | /// |
56 | /// If empty, this is the last chunk in the string. If non-empty, an |
57 | /// unexpected byte was encountered or the end of the input was reached |
58 | /// unexpectedly. |
59 | /// |
60 | /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT |
61 | /// CHARACTER`]. |
62 | /// |
63 | /// [`valid`]: Self::valid |
64 | /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER |
65 | #[must_use ] |
66 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
67 | pub fn invalid(&self) -> &'a [u8] { |
68 | self.invalid |
69 | } |
70 | } |
71 | |
72 | #[must_use ] |
73 | #[unstable (feature = "str_internals" , issue = "none" )] |
74 | pub struct Debug<'a>(&'a [u8]); |
75 | |
76 | #[unstable (feature = "str_internals" , issue = "none" )] |
77 | impl fmt::Debug for Debug<'_> { |
78 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
79 | f.write_char('"' )?; |
80 | |
81 | for chunk in Utf8Chunks::new(self.0) { |
82 | // Valid part. |
83 | // Here we partially parse UTF-8 again which is suboptimal. |
84 | { |
85 | let valid = chunk.valid(); |
86 | let mut from = 0; |
87 | for (i, c) in valid.char_indices() { |
88 | let esc = c.escape_debug(); |
89 | // If char needs escaping, flush backlog so far and write, else skip |
90 | if esc.len() != 1 { |
91 | f.write_str(&valid[from..i])?; |
92 | for c in esc { |
93 | f.write_char(c)?; |
94 | } |
95 | from = i + c.len_utf8(); |
96 | } |
97 | } |
98 | f.write_str(&valid[from..])?; |
99 | } |
100 | |
101 | // Broken parts of string as hex escape. |
102 | for &b in chunk.invalid() { |
103 | write!(f, " \\x {:02X}" , b)?; |
104 | } |
105 | } |
106 | |
107 | f.write_char('"' ) |
108 | } |
109 | } |
110 | |
111 | /// An iterator used to decode a slice of mostly UTF-8 bytes to string slices |
112 | /// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). |
113 | /// |
114 | /// If you want a simple conversion from UTF-8 byte slices to string slices, |
115 | /// [`from_utf8`] is easier to use. |
116 | /// |
117 | /// [byteslice]: slice |
118 | /// [`from_utf8`]: super::from_utf8 |
119 | /// |
120 | /// # Examples |
121 | /// |
122 | /// This can be used to create functionality similar to |
123 | /// [`String::from_utf8_lossy`] without allocating heap memory: |
124 | /// |
125 | /// ``` |
126 | /// #![feature(utf8_chunks)] |
127 | /// |
128 | /// use std::str::Utf8Chunks; |
129 | /// |
130 | /// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { |
131 | /// for chunk in Utf8Chunks::new(input) { |
132 | /// push(chunk.valid()); |
133 | /// |
134 | /// if !chunk.invalid().is_empty() { |
135 | /// push(" \u{FFFD}" ); |
136 | /// } |
137 | /// } |
138 | /// } |
139 | /// ``` |
140 | /// |
141 | /// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy |
142 | #[must_use = "iterators are lazy and do nothing unless consumed" ] |
143 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
144 | #[derive (Clone)] |
145 | pub struct Utf8Chunks<'a> { |
146 | source: &'a [u8], |
147 | } |
148 | |
149 | impl<'a> Utf8Chunks<'a> { |
150 | /// Creates a new iterator to decode the bytes. |
151 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
152 | pub fn new(bytes: &'a [u8]) -> Self { |
153 | Self { source: bytes } |
154 | } |
155 | |
156 | #[doc (hidden)] |
157 | #[unstable (feature = "str_internals" , issue = "none" )] |
158 | pub fn debug(&self) -> Debug<'_> { |
159 | Debug(self.source) |
160 | } |
161 | } |
162 | |
163 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
164 | impl<'a> Iterator for Utf8Chunks<'a> { |
165 | type Item = Utf8Chunk<'a>; |
166 | |
167 | fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
168 | if self.source.is_empty() { |
169 | return None; |
170 | } |
171 | |
172 | const TAG_CONT_U8: u8 = 128; |
173 | fn safe_get(xs: &[u8], i: usize) -> u8 { |
174 | *xs.get(i).unwrap_or(&0) |
175 | } |
176 | |
177 | let mut i = 0; |
178 | let mut valid_up_to = 0; |
179 | while i < self.source.len() { |
180 | // SAFETY: `i < self.source.len()` per previous line. |
181 | // For some reason the following are both significantly slower: |
182 | // while let Some(&byte) = self.source.get(i) { |
183 | // while let Some(byte) = self.source.get(i).copied() { |
184 | let byte = unsafe { *self.source.get_unchecked(i) }; |
185 | i += 1; |
186 | |
187 | if byte < 128 { |
188 | // This could be a `1 => ...` case in the match below, but for |
189 | // the common case of all-ASCII inputs, we bypass loading the |
190 | // sizeable UTF8_CHAR_WIDTH table into cache. |
191 | } else { |
192 | let w = utf8_char_width(byte); |
193 | |
194 | match w { |
195 | 2 => { |
196 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
197 | break; |
198 | } |
199 | i += 1; |
200 | } |
201 | 3 => { |
202 | match (byte, safe_get(self.source, i)) { |
203 | (0xE0, 0xA0..=0xBF) => (), |
204 | (0xE1..=0xEC, 0x80..=0xBF) => (), |
205 | (0xED, 0x80..=0x9F) => (), |
206 | (0xEE..=0xEF, 0x80..=0xBF) => (), |
207 | _ => break, |
208 | } |
209 | i += 1; |
210 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
211 | break; |
212 | } |
213 | i += 1; |
214 | } |
215 | 4 => { |
216 | match (byte, safe_get(self.source, i)) { |
217 | (0xF0, 0x90..=0xBF) => (), |
218 | (0xF1..=0xF3, 0x80..=0xBF) => (), |
219 | (0xF4, 0x80..=0x8F) => (), |
220 | _ => break, |
221 | } |
222 | i += 1; |
223 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
224 | break; |
225 | } |
226 | i += 1; |
227 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
228 | break; |
229 | } |
230 | i += 1; |
231 | } |
232 | _ => break, |
233 | } |
234 | } |
235 | |
236 | valid_up_to = i; |
237 | } |
238 | |
239 | // SAFETY: `i <= self.source.len()` because it is only ever incremented |
240 | // via `i += 1` and in between every single one of those increments, `i` |
241 | // is compared against `self.source.len()`. That happens either |
242 | // literally by `i < self.source.len()` in the while-loop's condition, |
243 | // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The |
244 | // loop is terminated as soon as the latest `i += 1` has made `i` no |
245 | // longer less than `self.source.len()`, which means it'll be at most |
246 | // equal to `self.source.len()`. |
247 | let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; |
248 | self.source = remaining; |
249 | |
250 | // SAFETY: `valid_up_to <= i` because it is only ever assigned via |
251 | // `valid_up_to = i` and `i` only increases. |
252 | let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
253 | |
254 | Some(Utf8Chunk { |
255 | // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
256 | valid: unsafe { from_utf8_unchecked(valid) }, |
257 | invalid, |
258 | }) |
259 | } |
260 | } |
261 | |
262 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
263 | impl FusedIterator for Utf8Chunks<'_> {} |
264 | |
265 | #[unstable (feature = "utf8_chunks" , issue = "99543" )] |
266 | impl fmt::Debug for Utf8Chunks<'_> { |
267 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
268 | f.debug_struct("Utf8Chunks" ).field(name:"source" , &self.debug()).finish() |
269 | } |
270 | } |
271 | |