| 1 | /// An iterator of `char` values that represent an escaping of arbitrary bytes. |
| 2 | /// |
| 3 | /// The lifetime parameter `'a` refers to the lifetime of the bytes being |
| 4 | /// escaped. |
| 5 | /// |
| 6 | /// This iterator is created by the |
| 7 | /// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method. |
| 8 | #[derive (Clone, Debug)] |
| 9 | pub struct EscapeBytes<'a> { |
| 10 | remaining: &'a [u8], |
| 11 | state: EscapeState, |
| 12 | } |
| 13 | |
| 14 | impl<'a> EscapeBytes<'a> { |
| 15 | pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes<'a> { |
| 16 | EscapeBytes { remaining: bytes, state: EscapeState::Start } |
| 17 | } |
| 18 | } |
| 19 | |
| 20 | impl<'a> Iterator for EscapeBytes<'a> { |
| 21 | type Item = char; |
| 22 | |
| 23 | #[inline ] |
| 24 | fn next(&mut self) -> Option<char> { |
| 25 | use self::EscapeState::*; |
| 26 | |
| 27 | match self.state { |
| 28 | Start => { |
| 29 | let byte = match crate::decode_utf8(self.remaining) { |
| 30 | (None, 0) => return None, |
| 31 | // If we see invalid UTF-8 or ASCII, then we always just |
| 32 | // peel one byte off. If it's printable ASCII, we'll pass |
| 33 | // it through as-is below. Otherwise, below, it will get |
| 34 | // escaped in some way. |
| 35 | (None, _) | (Some(_), 1) => { |
| 36 | let byte = self.remaining[0]; |
| 37 | self.remaining = &self.remaining[1..]; |
| 38 | byte |
| 39 | } |
| 40 | // For any valid UTF-8 that is not ASCII, we pass it |
| 41 | // through as-is. We don't do any Unicode escaping. |
| 42 | (Some(ch), size) => { |
| 43 | self.remaining = &self.remaining[size..]; |
| 44 | return Some(ch); |
| 45 | } |
| 46 | }; |
| 47 | self.state = match byte { |
| 48 | 0x21..=0x5B | 0x5D..=0x7E => { |
| 49 | return Some(char::from(byte)) |
| 50 | } |
| 51 | b' \0' => SpecialEscape('0' ), |
| 52 | b' \n' => SpecialEscape('n' ), |
| 53 | b' \r' => SpecialEscape('r' ), |
| 54 | b' \t' => SpecialEscape('t' ), |
| 55 | b' \\' => SpecialEscape(' \\' ), |
| 56 | _ => HexEscapeX(byte), |
| 57 | }; |
| 58 | Some(' \\' ) |
| 59 | } |
| 60 | SpecialEscape(ch) => { |
| 61 | self.state = Start; |
| 62 | Some(ch) |
| 63 | } |
| 64 | HexEscapeX(byte) => { |
| 65 | self.state = HexEscapeHighNybble(byte); |
| 66 | Some('x' ) |
| 67 | } |
| 68 | HexEscapeHighNybble(byte) => { |
| 69 | self.state = HexEscapeLowNybble(byte); |
| 70 | let nybble = byte >> 4; |
| 71 | Some(hexdigit_to_char(nybble)) |
| 72 | } |
| 73 | HexEscapeLowNybble(byte) => { |
| 74 | self.state = Start; |
| 75 | let nybble = byte & 0xF; |
| 76 | Some(hexdigit_to_char(nybble)) |
| 77 | } |
| 78 | } |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | impl<'a> core::fmt::Display for EscapeBytes<'a> { |
| 83 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 84 | use core::fmt::Write; |
| 85 | for ch: char in self.clone() { |
| 86 | f.write_char(ch)?; |
| 87 | } |
| 88 | Ok(()) |
| 89 | } |
| 90 | } |
| 91 | |
/// The state of the finite state machine that drives the escaping iterator.
#[derive(Clone, Debug)]
enum EscapeState {
    /// Read and remove the next item from 'remaining'. If 'remaining' is
    /// empty, then return None. Valid non-ASCII UTF-8 is emitted as-is.
    /// Otherwise a single byte is removed and either emitted as-is or
    /// escaped according to the following rules.
    ///
    /// If it's \n, \r, \t, \\ or \0, then emit a '\' and set the current
    /// state to 'SpecialEscape(n | r | t | \ | 0)'. Otherwise, if the 'byte'
    /// is not in [\x21-\x5B\x5D-\x7E], then emit a '\' and set the state
    /// to 'HexEscapeX(byte)'.
    Start,
    /// Emit the given codepoint as-is. This assumes '\' has just been
    /// emitted. Then set the state to 'Start'.
    SpecialEscape(char),
    /// Emit the 'x' part of a hex escape. This assumes '\' has just been
    /// emitted. Then set the state to 'HexEscapeHighNybble(byte)'.
    HexEscapeX(u8),
    /// Emit the high nybble of the byte as a hexadecimal digit. This
    /// assumes '\x' has just been emitted. Then set the state to
    /// 'HexEscapeLowNybble(byte)'.
    HexEscapeHighNybble(u8),
    /// Emit the low nybble of the byte as a hexadecimal digit. This assumes
    /// '\xZ' has just been emitted, where 'Z' is the high nybble of this
    /// byte. Then set the state to 'Start'.
    HexEscapeLowNybble(u8),
}
| 119 | |
/// An iterator that yields the `u8` values obtained by unescaping a sequence
/// of codepoints.
///
/// The type parameter `I` is the iterator of codepoints being unescaped.
///
/// This iterator is currently not part of the crate's public API. Only a
/// `ByteVec::unescape` method is exposed, which requires `alloc`. That is
/// the most convenient form, but in theory this could be exposed for
/// core-only use cases too; it is just not clear yet what that API should
/// look like.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
pub(crate) struct UnescapeBytes<I> {
    /// The source of codepoints that still have to be unescaped.
    it: I,
    /// What the state machine expects or emits next.
    state: UnescapeState,
}
| 137 | |
#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> UnescapeBytes<I> {
    /// Creates an unescaping iterator over the codepoints produced by `t`,
    /// starting with no escape sequence in progress.
    pub(crate) fn new<T: IntoIterator<IntoIter = I>>(
        t: T,
    ) -> UnescapeBytes<I> {
        let it = t.into_iter();
        Self { it, state: UnescapeState::Start }
    }
}
| 146 | |
#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
    type Item = u8;

    /// Returns the next unescaped byte, or `None` when the underlying
    /// iterator is exhausted and nothing is buffered.
    ///
    /// Recognised escapes are `\0`, `\\`, `\r`, `\n`, `\t` and `\xNN` (two
    /// hexadecimal digits, either case). Any other codepoint is emitted as
    /// its UTF-8 encoding. Invalid or incomplete escape sequences unescape
    /// as themselves, i.e. the codepoints read so far are emitted verbatim.
    fn next(&mut self) -> Option<u8> {
        use self::UnescapeState::*;

        // Some transitions produce no output (e.g. after reading the
        // leading '\'), so keep stepping until a byte is available.
        loop {
            match self.state {
                Start => {
                    let ch = self.it.next()?;
                    self.state = if ch == '\\' {
                        Escape
                    } else {
                        UnescapeState::bytes(&[], ch)
                    };
                }
                Bytes { buf, cur, len } => {
                    let byte = buf[cur];
                    let next = cur + 1;
                    // Never leave a `Bytes` state behind with `cur >= len`.
                    self.state = if next >= len {
                        Start
                    } else {
                        Bytes { buf, cur: next, len }
                    };
                    return Some(byte);
                }
                Escape => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            self.state = Start;
                            // Incomplete escape sequences unescape as
                            // themselves.
                            return Some(b'\\');
                        }
                    };
                    let byte = match ch {
                        '0' => b'\x00',
                        '\\' => b'\\',
                        'r' => b'\r',
                        'n' => b'\n',
                        't' => b'\t',
                        'x' => {
                            self.state = HexFirst;
                            continue;
                        }
                        ch => {
                            // An invalid escape sequence unescapes as
                            // itself.
                            self.state = UnescapeState::bytes(&[b'\\'], ch);
                            continue;
                        }
                    };
                    self.state = Start;
                    return Some(byte);
                }
                HexFirst => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // An incomplete escape sequence unescapes as
                            // itself.
                            self.state = UnescapeState::bytes_raw(&[b'x']);
                            return Some(b'\\');
                        }
                    };
                    if ch.is_ascii_hexdigit() {
                        self.state = HexSecond(ch);
                    } else {
                        // An invalid escape sequence unescapes as itself.
                        self.state = UnescapeState::bytes(&[b'x'], ch);
                        return Some(b'\\');
                    }
                }
                HexSecond(first) => {
                    let second = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // An incomplete escape sequence unescapes as
                            // itself.
                            self.state = UnescapeState::bytes(&[b'x'], first);
                            return Some(b'\\');
                        }
                    };
                    if second.is_ascii_hexdigit() {
                        self.state = Start;
                        let hinybble = char_to_hexdigit(first);
                        let lonybble = char_to_hexdigit(second);
                        return Some(hinybble << 4 | lonybble);
                    }
                    // An invalid escape sequence unescapes as itself.
                    self.state =
                        UnescapeState::bytes2(&[b'x'], first, second);
                    return Some(b'\\');
                }
            }
        }
    }
}
| 268 | |
/// The state of the finite state machine that drives the unescaping
/// iterator.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
enum UnescapeState {
    /// The initial state: look for the start of an escape sequence,
    /// otherwise emit the next codepoint as-is.
    Start,
    /// Emit the byte at `buf[cur]`.
    ///
    /// A value of this variant must never be created with `cur >= len`;
    /// whenever this state is visited, `cur < len` is assumed to hold.
    Bytes { buf: [u8; 11], cur: usize, len: usize },
    /// Entered once a `\` has been seen.
    Escape,
    /// Entered once a `\x` has been seen.
    HexFirst,
    /// Entered once a `\xN` has been seen, where `N` is in `[0-9A-Fa-f]`.
    /// The carried codepoint is that `N`.
    HexSecond(char),
}
| 289 | |
#[cfg(feature = "alloc")]
impl UnescapeState {
    /// Builds a `Bytes` state that emits exactly the bytes of `bytes`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len() > 11`.
    fn bytes_raw(bytes: &[u8]) -> UnescapeState {
        let len = bytes.len();
        // The limit is the capacity of 'buf' in the 'Bytes' state; raising
        // it requires growing that array as well.
        assert!(len <= 11, "no more than 11 bytes allowed");
        let mut buf = [0u8; 11];
        buf[..len].copy_from_slice(bytes);
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Builds a `Bytes` state that emits `prefix` followed by the UTF-8
    /// encoding of `ch`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes(prefix: &[u8], ch: char) -> UnescapeState {
        // The limit keeps prefix + encoded char within 'buf' of the 'Bytes'
        // state; raising it requires growing that array as well.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        let (head, tail) = buf.split_at_mut(prefix.len());
        head.copy_from_slice(prefix);
        let len = prefix.len() + ch.encode_utf8(tail).len();
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Builds a `Bytes` state that emits `prefix` followed by the UTF-8
    /// encoding of `ch1` and then of `ch2`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState {
        // The limit keeps prefix + both encoded chars within 'buf' of the
        // 'Bytes' state; raising it requires growing that array as well.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        // 'len' is the write offset while filling and the total at the end.
        let mut len = prefix.len();
        buf[..len].copy_from_slice(prefix);
        len += ch1.encode_utf8(&mut buf[len..]).len();
        len += ch2.encode_utf8(&mut buf[len..]).len();
        UnescapeState::Bytes { buf, cur: 0, len }
    }
}
| 339 | |
/// Convert the given codepoint to the value of the hexadecimal digit it
/// denotes (`0..=15`). Both uppercase and lowercase digits are accepted.
///
/// # Panics
///
/// This panics if `ch` is not in `[0-9A-Fa-f]`.
#[cfg(feature = "alloc")]
fn char_to_hexdigit(ch: char) -> u8 {
    let digit = ch.to_digit(16).expect("codepoint must be in [0-9A-Fa-f]");
    // A base-16 digit is at most 15, so the narrowing cannot fail.
    u8::try_from(digit).expect("a hexadecimal digit always fits in a u8")
}
| 349 | |
/// Convert the given hexadecimal digit value (`0..=15`) to its corresponding
/// codepoint. Digits above 9 are returned as uppercase letters (`A`-`F`).
///
/// # Panics
///
/// This panics when `digit > 15`.
fn hexdigit_to_char(digit: u8) -> char {
    char::from_digit(u32::from(digit), 16)
        .expect("hexadecimal digit must be at most 15")
        .to_ascii_uppercase()
}
| 358 | |
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::string::{String, ToString};

    use crate::BString;

    use super::*;

    /// Shorthand for building the expected `BString` from anything
    /// byte-like.
    #[allow(non_snake_case)]
    fn B<B: AsRef<[u8]>>(bytes: B) -> BString {
        BString::from(bytes.as_ref())
    }

    /// Escapes `bytes` and collects the result into a `String`.
    fn e<B: AsRef<[u8]>>(bytes: B) -> String {
        EscapeBytes::new(bytes.as_ref()).to_string()
    }

    /// Unescapes `string` and collects the result into a `BString`.
    fn u(string: &str) -> BString {
        UnescapeBytes::new(string.chars()).collect()
    }

    #[test]
    fn escape() {
        assert_eq!(r"a", e(br"a"));
        assert_eq!(r"\\x61", e(br"\x61"));
        assert_eq!(r"a", e(b"\x61"));
        assert_eq!(r"~", e(b"\x7E"));
        assert_eq!(r"\x7F", e(b"\x7F"));
        // A space is outside the pass-through range and is hex-escaped.
        assert_eq!(r"\x20", e(b" "));

        assert_eq!(r"\n", e(b"\n"));
        assert_eq!(r"\r", e(b"\r"));
        assert_eq!(r"\t", e(b"\t"));
        assert_eq!(r"\\", e(b"\\"));
        assert_eq!(r"\0", e(b"\0"));
        assert_eq!(r"\0", e(b"\x00"));

        assert_eq!(r"\x88", e(b"\x88"));
        assert_eq!(r"\x8F", e(b"\x8F"));
        assert_eq!(r"\xF8", e(b"\xF8"));
        assert_eq!(r"\xFF", e(b"\xFF"));

        assert_eq!(r"\xE2", e(b"\xE2"));
        assert_eq!(r"\xE2\x98", e(b"\xE2\x98"));
        assert_eq!(r"☃", e(b"\xE2\x98\x83"));

        assert_eq!(r"\xF0", e(b"\xF0"));
        assert_eq!(r"\xF0\x9F", e(b"\xF0\x9F"));
        assert_eq!(r"\xF0\x9F\x92", e(b"\xF0\x9F\x92"));
        assert_eq!(r"💩", e(b"\xF0\x9F\x92\xA9"));
    }

    #[test]
    fn unescape() {
        assert_eq!(B(r"a"), u(r"a"));
        assert_eq!(B(r"\x61"), u(r"\\x61"));
        assert_eq!(B(r"a"), u(r"\x61"));
        assert_eq!(B(r"~"), u(r"\x7E"));
        assert_eq!(B(b"\x7F"), u(r"\x7F"));

        assert_eq!(B(b"\n"), u(r"\n"));
        assert_eq!(B(b"\r"), u(r"\r"));
        assert_eq!(B(b"\t"), u(r"\t"));
        assert_eq!(B(b"\\"), u(r"\\"));
        assert_eq!(B(b"\0"), u(r"\0"));
        assert_eq!(B(b"\0"), u(r"\x00"));

        assert_eq!(B(b"\x88"), u(r"\x88"));
        assert_eq!(B(b"\x8F"), u(r"\x8F"));
        assert_eq!(B(b"\xF8"), u(r"\xF8"));
        assert_eq!(B(b"\xFF"), u(r"\xFF"));

        assert_eq!(B(b"\xE2"), u(r"\xE2"));
        assert_eq!(B(b"\xE2\x98"), u(r"\xE2\x98"));
        assert_eq!(B("☃"), u(r"\xE2\x98\x83"));

        assert_eq!(B(b"\xF0"), u(r"\xf0"));
        assert_eq!(B(b"\xF0\x9F"), u(r"\xf0\x9f"));
        assert_eq!(B(b"\xF0\x9F\x92"), u(r"\xf0\x9f\x92"));
        assert_eq!(B("💩"), u(r"\xf0\x9f\x92\xa9"));
    }

    #[test]
    fn unescape_weird() {
        // Incomplete escape sequences unescape as themselves.
        assert_eq!(B(b"\\"), u(r"\"));
        assert_eq!(B(b"\\"), u(r"\\"));
        assert_eq!(B(b"\\x"), u(r"\x"));
        assert_eq!(B(b"\\xA"), u(r"\xA"));

        // Invalid escape sequences unescape as themselves too.
        assert_eq!(B(b"\\xZ"), u(r"\xZ"));
        assert_eq!(B(b"\\xZZ"), u(r"\xZZ"));
        assert_eq!(B(b"\\i"), u(r"\i"));
        assert_eq!(B(b"\\u"), u(r"\u"));
        assert_eq!(B(b"\\u{2603}"), u(r"\u{2603}"));
    }

    #[test]
    fn roundtrip_single_bytes() {
        // Escaping any single byte and unescaping the result must give the
        // original byte back.
        for byte in 0..=255u8 {
            let escaped = e([byte]);
            assert_eq!(B([byte]), u(&escaped), "byte {:#04X}", byte);
        }
    }
}
| 454 | |