| 1 | use super::from_utf8_unchecked; |
| 2 | use super::validations::utf8_char_width; |
| 3 | use crate::fmt; |
| 4 | use crate::fmt::{Formatter, Write}; |
| 5 | use crate::iter::FusedIterator; |
| 6 | |
| 7 | impl [u8] { |
| 8 | /// Creates an iterator over the contiguous valid UTF-8 ranges of this |
| 9 | /// slice, and the non-UTF-8 fragments in between. |
| 10 | /// |
| 11 | /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator. |
| 12 | /// |
| 13 | /// # Examples |
| 14 | /// |
| 15 | /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source |
| 16 | /// code in the form of a C-string literal (`c"..."`). |
| 17 | /// |
| 18 | /// ``` |
| 19 | /// use std::fmt::Write as _; |
| 20 | /// |
| 21 | /// pub fn cstr_literal(bytes: &[u8]) -> String { |
| 22 | /// let mut repr = String::new(); |
| 23 | /// repr.push_str("c \"" ); |
| 24 | /// for chunk in bytes.utf8_chunks() { |
| 25 | /// for ch in chunk.valid().chars() { |
| 26 | /// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters. |
| 27 | /// write!(repr, "{}" , ch.escape_debug()).unwrap(); |
| 28 | /// } |
| 29 | /// for byte in chunk.invalid() { |
| 30 | /// write!(repr, " \\x{:02X}" , byte).unwrap(); |
| 31 | /// } |
| 32 | /// } |
| 33 | /// repr.push('"' ); |
| 34 | /// repr |
| 35 | /// } |
| 36 | /// |
| 37 | /// fn main() { |
| 38 | /// let lit = cstr_literal(b" \xferris the \xf0\x9f\xa6\x80\x07" ); |
| 39 | /// let expected = stringify!(c" \xFErris the 🦀 \u{7}" ); |
| 40 | /// assert_eq!(lit, expected); |
| 41 | /// } |
| 42 | /// ``` |
| 43 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 44 | pub fn utf8_chunks(&self) -> Utf8Chunks<'_> { |
| 45 | Utf8Chunks { source: self } |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | /// An item returned by the [`Utf8Chunks`] iterator. |
| 50 | /// |
| 51 | /// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character |
| 52 | /// when decoding a UTF-8 string. |
| 53 | /// |
| 54 | /// # Examples |
| 55 | /// |
| 56 | /// ``` |
| 57 | /// // An invalid UTF-8 string |
| 58 | /// let bytes = b"foo \xF1\x80bar" ; |
| 59 | /// |
| 60 | /// // Decode the first `Utf8Chunk` |
| 61 | /// let chunk = bytes.utf8_chunks().next().unwrap(); |
| 62 | /// |
| 63 | /// // The first three characters are valid UTF-8 |
| 64 | /// assert_eq!("foo" , chunk.valid()); |
| 65 | /// |
| 66 | /// // The fourth character is broken |
| 67 | /// assert_eq!(b" \xF1\x80" , chunk.invalid()); |
| 68 | /// ``` |
| 69 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 70 | #[derive (Clone, Debug, PartialEq, Eq)] |
| 71 | pub struct Utf8Chunk<'a> { |
| 72 | valid: &'a str, |
| 73 | invalid: &'a [u8], |
| 74 | } |
| 75 | |
| 76 | impl<'a> Utf8Chunk<'a> { |
| 77 | /// Returns the next validated UTF-8 substring. |
| 78 | /// |
| 79 | /// This substring can be empty at the start of the string or between |
| 80 | /// broken UTF-8 characters. |
| 81 | #[must_use ] |
| 82 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 83 | pub fn valid(&self) -> &'a str { |
| 84 | self.valid |
| 85 | } |
| 86 | |
| 87 | /// Returns the invalid sequence that caused a failure. |
| 88 | /// |
| 89 | /// The returned slice will have a maximum length of 3 and starts after the |
| 90 | /// substring given by [`valid`]. Decoding will resume after this sequence. |
| 91 | /// |
| 92 | /// If empty, this is the last chunk in the string. If non-empty, an |
| 93 | /// unexpected byte was encountered or the end of the input was reached |
| 94 | /// unexpectedly. |
| 95 | /// |
| 96 | /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT |
| 97 | /// CHARACTER`]. |
| 98 | /// |
| 99 | /// [`valid`]: Self::valid |
| 100 | /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER |
| 101 | #[must_use ] |
| 102 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 103 | pub fn invalid(&self) -> &'a [u8] { |
| 104 | self.invalid |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | #[must_use ] |
| 109 | #[unstable (feature = "str_internals" , issue = "none" )] |
| 110 | pub struct Debug<'a>(&'a [u8]); |
| 111 | |
| 112 | #[unstable (feature = "str_internals" , issue = "none" )] |
| 113 | impl fmt::Debug for Debug<'_> { |
| 114 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 115 | f.write_char('"' )?; |
| 116 | |
| 117 | for chunk in self.0.utf8_chunks() { |
| 118 | // Valid part. |
| 119 | // Here we partially parse UTF-8 again which is suboptimal. |
| 120 | { |
| 121 | let valid = chunk.valid(); |
| 122 | let mut from = 0; |
| 123 | for (i, c) in valid.char_indices() { |
| 124 | let esc = c.escape_debug(); |
| 125 | // If char needs escaping, flush backlog so far and write, else skip |
| 126 | if esc.len() != 1 { |
| 127 | f.write_str(&valid[from..i])?; |
| 128 | for c in esc { |
| 129 | f.write_char(c)?; |
| 130 | } |
| 131 | from = i + c.len_utf8(); |
| 132 | } |
| 133 | } |
| 134 | f.write_str(&valid[from..])?; |
| 135 | } |
| 136 | |
| 137 | // Broken parts of string as hex escape. |
| 138 | for &b in chunk.invalid() { |
| 139 | write!(f, " \\x {:02X}" , b)?; |
| 140 | } |
| 141 | } |
| 142 | |
| 143 | f.write_char('"' ) |
| 144 | } |
| 145 | } |
| 146 | |
| 147 | /// An iterator used to decode a slice of mostly UTF-8 bytes to string slices |
| 148 | /// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). |
| 149 | /// |
| 150 | /// This struct is created by the [`utf8_chunks`] method on bytes slices. |
| 151 | /// If you want a simple conversion from UTF-8 byte slices to string slices, |
| 152 | /// [`from_utf8`] is easier to use. |
| 153 | /// |
| 154 | /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator. |
| 155 | /// |
| 156 | /// [byteslice]: slice |
| 157 | /// [`utf8_chunks`]: slice::utf8_chunks |
| 158 | /// [`from_utf8`]: super::from_utf8 |
| 159 | /// |
| 160 | /// # Examples |
| 161 | /// |
| 162 | /// This can be used to create functionality similar to |
| 163 | /// [`String::from_utf8_lossy`] without allocating heap memory: |
| 164 | /// |
| 165 | /// ``` |
| 166 | /// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { |
| 167 | /// for chunk in input.utf8_chunks() { |
| 168 | /// push(chunk.valid()); |
| 169 | /// |
| 170 | /// if !chunk.invalid().is_empty() { |
| 171 | /// push(" \u{FFFD}" ); |
| 172 | /// } |
| 173 | /// } |
| 174 | /// } |
| 175 | /// ``` |
| 176 | /// |
| 177 | /// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy |
| 178 | #[must_use = "iterators are lazy and do nothing unless consumed" ] |
| 179 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 180 | #[derive (Clone)] |
| 181 | pub struct Utf8Chunks<'a> { |
| 182 | source: &'a [u8], |
| 183 | } |
| 184 | |
| 185 | impl<'a> Utf8Chunks<'a> { |
| 186 | #[doc (hidden)] |
| 187 | #[unstable (feature = "str_internals" , issue = "none" )] |
| 188 | pub fn debug(&self) -> Debug<'_> { |
| 189 | Debug(self.source) |
| 190 | } |
| 191 | } |
| 192 | |
| 193 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 194 | impl<'a> Iterator for Utf8Chunks<'a> { |
| 195 | type Item = Utf8Chunk<'a>; |
| 196 | |
| 197 | fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
| 198 | if self.source.is_empty() { |
| 199 | return None; |
| 200 | } |
| 201 | |
| 202 | const TAG_CONT_U8: u8 = 128; |
| 203 | fn safe_get(xs: &[u8], i: usize) -> u8 { |
| 204 | *xs.get(i).unwrap_or(&0) |
| 205 | } |
| 206 | |
| 207 | let mut i = 0; |
| 208 | let mut valid_up_to = 0; |
| 209 | while i < self.source.len() { |
| 210 | // SAFETY: `i < self.source.len()` per previous line. |
| 211 | // For some reason the following are both significantly slower: |
| 212 | // while let Some(&byte) = self.source.get(i) { |
| 213 | // while let Some(byte) = self.source.get(i).copied() { |
| 214 | let byte = unsafe { *self.source.get_unchecked(i) }; |
| 215 | i += 1; |
| 216 | |
| 217 | if byte < 128 { |
| 218 | // This could be a `1 => ...` case in the match below, but for |
| 219 | // the common case of all-ASCII inputs, we bypass loading the |
| 220 | // sizeable UTF8_CHAR_WIDTH table into cache. |
| 221 | } else { |
| 222 | let w = utf8_char_width(byte); |
| 223 | |
| 224 | match w { |
| 225 | 2 => { |
| 226 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
| 227 | break; |
| 228 | } |
| 229 | i += 1; |
| 230 | } |
| 231 | 3 => { |
| 232 | match (byte, safe_get(self.source, i)) { |
| 233 | (0xE0, 0xA0..=0xBF) => (), |
| 234 | (0xE1..=0xEC, 0x80..=0xBF) => (), |
| 235 | (0xED, 0x80..=0x9F) => (), |
| 236 | (0xEE..=0xEF, 0x80..=0xBF) => (), |
| 237 | _ => break, |
| 238 | } |
| 239 | i += 1; |
| 240 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
| 241 | break; |
| 242 | } |
| 243 | i += 1; |
| 244 | } |
| 245 | 4 => { |
| 246 | match (byte, safe_get(self.source, i)) { |
| 247 | (0xF0, 0x90..=0xBF) => (), |
| 248 | (0xF1..=0xF3, 0x80..=0xBF) => (), |
| 249 | (0xF4, 0x80..=0x8F) => (), |
| 250 | _ => break, |
| 251 | } |
| 252 | i += 1; |
| 253 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
| 254 | break; |
| 255 | } |
| 256 | i += 1; |
| 257 | if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
| 258 | break; |
| 259 | } |
| 260 | i += 1; |
| 261 | } |
| 262 | _ => break, |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | valid_up_to = i; |
| 267 | } |
| 268 | |
| 269 | // SAFETY: `i <= self.source.len()` because it is only ever incremented |
| 270 | // via `i += 1` and in between every single one of those increments, `i` |
| 271 | // is compared against `self.source.len()`. That happens either |
| 272 | // literally by `i < self.source.len()` in the while-loop's condition, |
| 273 | // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The |
| 274 | // loop is terminated as soon as the latest `i += 1` has made `i` no |
| 275 | // longer less than `self.source.len()`, which means it'll be at most |
| 276 | // equal to `self.source.len()`. |
| 277 | let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; |
| 278 | self.source = remaining; |
| 279 | |
| 280 | // SAFETY: `valid_up_to <= i` because it is only ever assigned via |
| 281 | // `valid_up_to = i` and `i` only increases. |
| 282 | let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
| 283 | |
| 284 | Some(Utf8Chunk { |
| 285 | // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
| 286 | valid: unsafe { from_utf8_unchecked(valid) }, |
| 287 | invalid, |
| 288 | }) |
| 289 | } |
| 290 | } |
| 291 | |
| 292 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 293 | impl FusedIterator for Utf8Chunks<'_> {} |
| 294 | |
| 295 | #[stable (feature = "utf8_chunks" , since = "1.79.0" )] |
| 296 | impl fmt::Debug for Utf8Chunks<'_> { |
| 297 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 298 | f.debug_struct("Utf8Chunks" ).field(name:"source" , &self.debug()).finish() |
| 299 | } |
| 300 | } |
| 301 | |