1 | /// An iterator of `char` values that represent an escaping of arbitrary bytes. |
2 | /// |
3 | /// The lifetime parameter `'a` refers to the lifetime of the bytes being |
4 | /// escaped. |
5 | /// |
6 | /// This iterator is created by the |
7 | /// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method. |
8 | #[derive (Clone, Debug)] |
9 | pub struct EscapeBytes<'a> { |
10 | remaining: &'a [u8], |
11 | state: EscapeState, |
12 | } |
13 | |
14 | impl<'a> EscapeBytes<'a> { |
15 | pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes { |
16 | EscapeBytes { remaining: bytes, state: EscapeState::Start } |
17 | } |
18 | } |
19 | |
20 | impl<'a> Iterator for EscapeBytes<'a> { |
21 | type Item = char; |
22 | |
23 | #[inline ] |
24 | fn next(&mut self) -> Option<char> { |
25 | use self::EscapeState::*; |
26 | |
27 | match self.state { |
28 | Start => { |
29 | let byte = match crate::decode_utf8(self.remaining) { |
30 | (None, 0) => return None, |
31 | // If we see invalid UTF-8 or ASCII, then we always just |
32 | // peel one byte off. If it's printable ASCII, we'll pass |
33 | // it through as-is below. Otherwise, below, it will get |
34 | // escaped in some way. |
35 | (None, _) | (Some(_), 1) => { |
36 | let byte = self.remaining[0]; |
37 | self.remaining = &self.remaining[1..]; |
38 | byte |
39 | } |
40 | // For any valid UTF-8 that is not ASCII, we pass it |
41 | // through as-is. We don't do any Unicode escaping. |
42 | (Some(ch), size) => { |
43 | self.remaining = &self.remaining[size..]; |
44 | return Some(ch); |
45 | } |
46 | }; |
47 | self.state = match byte { |
48 | 0x21..=0x5B | 0x5D..=0x7E => { |
49 | return Some(char::from(byte)) |
50 | } |
51 | b' \0' => SpecialEscape('0' ), |
52 | b' \n' => SpecialEscape('n' ), |
53 | b' \r' => SpecialEscape('r' ), |
54 | b' \t' => SpecialEscape('t' ), |
55 | b' \\' => SpecialEscape(' \\' ), |
56 | _ => HexEscapeX(byte), |
57 | }; |
58 | Some(' \\' ) |
59 | } |
60 | SpecialEscape(ch) => { |
61 | self.state = Start; |
62 | Some(ch) |
63 | } |
64 | HexEscapeX(byte) => { |
65 | self.state = HexEscapeHighNybble(byte); |
66 | Some('x' ) |
67 | } |
68 | HexEscapeHighNybble(byte) => { |
69 | self.state = HexEscapeLowNybble(byte); |
70 | let nybble = byte >> 4; |
71 | Some(hexdigit_to_char(nybble)) |
72 | } |
73 | HexEscapeLowNybble(byte) => { |
74 | self.state = Start; |
75 | let nybble = byte & 0xF; |
76 | Some(hexdigit_to_char(nybble)) |
77 | } |
78 | } |
79 | } |
80 | } |
81 | |
82 | impl<'a> core::fmt::Display for EscapeBytes<'a> { |
83 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
84 | use core::fmt::Write; |
85 | for ch: char in self.clone() { |
86 | f.write_char(ch)?; |
87 | } |
88 | Ok(()) |
89 | } |
90 | } |
91 | |
/// The states of the finite state machine that drives the escaping iterator.
///
/// Each variant describes what the next call to `next` must emit.
#[derive(Clone, Debug)]
enum EscapeState {
    /// Consume input from the front of `remaining`. If `remaining` is empty,
    /// yield `None`. Otherwise decide how to emit what was consumed:
    ///
    /// * `\n`, `\r`, `\t`, `\\` and `\0` emit a `\` and move to
    ///   `SpecialEscape` carrying `n`, `r`, `t`, `\` or `0` respectively.
    /// * Any other byte outside of `[\x21-\x5B\x5D-\x7E]` emits a `\` and
    ///   moves to `HexEscapeX(byte)`.
    /// * Everything else is emitted as-is and the state stays `Start`.
    Start,
    /// A `\` has just been emitted. Emit the carried codepoint unchanged and
    /// go back to `Start`.
    SpecialEscape(char),
    /// A `\` has just been emitted. Emit the `x` of a hex escape and move to
    /// `HexEscapeHighNybble(byte)`.
    HexEscapeX(u8),
    /// `\x` has just been emitted. Emit the high nybble of the byte as a
    /// hexadecimal digit and move to `HexEscapeLowNybble(byte)`.
    HexEscapeHighNybble(u8),
    /// `\xZ` has just been emitted, where `Z` is the high nybble of the
    /// byte. Emit the low nybble as a hexadecimal digit and go back to
    /// `Start`.
    HexEscapeLowNybble(u8),
}
119 | |
/// An iterator that turns a sequence of codepoints containing escape
/// sequences back into the `u8` values they stand for.
///
/// `I` is the type of the underlying iterator of codepoints being unescaped.
///
/// This iterator is currently not part of the crate's public API. The only
/// thing exposed is the `ByteVec::unescape` method, which needs an
/// allocator. That is the most convenient form, but in principle this type
/// could be exposed for core-only use cases as well; the right shape for
/// such an API has not been settled.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
pub(crate) struct UnescapeBytes<I> {
    /// The source of codepoints that still have to be unescaped.
    it: I,
    /// The current position in the unescaping state machine.
    state: UnescapeState,
}
137 | |
#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> UnescapeBytes<I> {
    /// Creates an unescaping iterator over the codepoints produced by `t`.
    ///
    /// `t` is converted into its iterator immediately, and the state machine
    /// begins in the `Start` state.
    pub(crate) fn new<T>(t: T) -> UnescapeBytes<I>
    where
        T: IntoIterator<IntoIter = I>,
    {
        let it = t.into_iter();
        UnescapeBytes { it, state: UnescapeState::Start }
    }
}
146 | |
#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
    type Item = u8;

    /// Produces the next unescaped byte.
    ///
    /// The recognised escapes are `\0`, `\\`, `\r`, `\n`, `\t` and `\xNN`
    /// (two hexadecimal digits, either case). Any other codepoint is emitted
    /// as its UTF-8 encoding. Invalid or incomplete escape sequences
    /// unescape as themselves.
    fn next(&mut self) -> Option<u8> {
        use self::UnescapeState::*;

        // Arms that only change state fall through to the next loop
        // iteration; arms that have a byte ready return it directly.
        loop {
            match self.state {
                Start => {
                    self.state = match self.it.next()? {
                        '\\' => Escape,
                        ch => UnescapeState::bytes(&[], ch),
                    };
                }
                Bytes { buf, mut cur, len } => {
                    let byte = buf[cur];
                    cur += 1;
                    // Never store a `Bytes` state with `cur >= len`.
                    self.state =
                        if cur >= len { Start } else { Bytes { buf, cur, len } };
                    return Some(byte);
                }
                Escape => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            self.state = Start;
                            // Incomplete escape sequences unescape as
                            // themselves.
                            return Some(b'\\');
                        }
                    };
                    let byte = match ch {
                        '0' => b'\x00',
                        '\\' => b'\\',
                        'r' => b'\r',
                        'n' => b'\n',
                        't' => b'\t',
                        'x' => {
                            self.state = HexFirst;
                            continue;
                        }
                        ch => {
                            // An invalid escape sequence unescapes as itself.
                            self.state = UnescapeState::bytes(&[b'\\'], ch);
                            continue;
                        }
                    };
                    self.state = Start;
                    return Some(byte);
                }
                HexFirst => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // An incomplete escape sequence unescapes as
                            // itself.
                            self.state = UnescapeState::bytes_raw(&[b'x']);
                            return Some(b'\\');
                        }
                    };
                    match ch {
                        '0'..='9' | 'A'..='F' | 'a'..='f' => {
                            self.state = HexSecond(ch);
                        }
                        ch => {
                            // An invalid escape sequence unescapes as itself.
                            self.state = UnescapeState::bytes(&[b'x'], ch);
                            return Some(b'\\');
                        }
                    }
                }
                HexSecond(first) => {
                    let second = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // An incomplete escape sequence unescapes as
                            // itself.
                            self.state = UnescapeState::bytes(&[b'x'], first);
                            return Some(b'\\');
                        }
                    };
                    match second {
                        '0'..='9' | 'A'..='F' | 'a'..='f' => {
                            self.state = Start;
                            let hinybble = char_to_hexdigit(first);
                            let lonybble = char_to_hexdigit(second);
                            let byte = hinybble << 4 | lonybble;
                            return Some(byte);
                        }
                        ch => {
                            // An invalid escape sequence unescapes as itself.
                            self.state =
                                UnescapeState::bytes2(&[b'x'], first, ch);
                            return Some(b'\\');
                        }
                    }
                }
            }
        }
    }
}
268 | |
/// The states of the finite state machine that drives the unescaping
/// iterator.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
enum UnescapeState {
    /// The initial state: check whether the next codepoint begins an escape
    /// sequence, and otherwise emit that codepoint as-is.
    Start,
    /// Emit the byte stored at `buf[cur]`.
    ///
    /// A value of this variant must never be constructed with `cur >= len`;
    /// whenever this state is visited, `cur < len` is taken for granted.
    Bytes { buf: [u8; 11], cur: usize, len: usize },
    /// Entered once a `\` has been seen.
    Escape,
    /// Entered once a `\x` has been seen.
    HexFirst,
    /// Entered once a `\xN` has been seen, with `N` in `[0-9A-Fa-f]`. The
    /// carried codepoint is that `N`.
    HexSecond(char),
}
289 | |
#[cfg(feature = "alloc")]
impl UnescapeState {
    /// Builds a `Bytes` state that emits exactly the given bytes.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len() > 11`.
    fn bytes_raw(bytes: &[u8]) -> UnescapeState {
        // The limit mirrors the capacity of 'buf' in the 'Bytes' state. It
        // can be raised as long as 'buf' is enlarged to match.
        let len = bytes.len();
        assert!(len <= 11, "no more than 11 bytes allowed");
        let mut buf = [0u8; 11];
        buf[..len].copy_from_slice(bytes);
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Builds a `Bytes` state that emits `prefix` followed by the UTF-8
    /// encoding of `ch`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes(prefix: &[u8], ch: char) -> UnescapeState {
        // The limit can be raised as long as 'buf' in the 'Bytes' state
        // still has room for the prefix plus the encoded codepoint.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        let (head, tail) = buf.split_at_mut(prefix.len());
        head.copy_from_slice(prefix);
        let len = prefix.len() + ch.encode_utf8(tail).len();
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Builds a `Bytes` state that emits `prefix` followed by the UTF-8
    /// encoding of `ch1` and then of `ch2`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState {
        // The limit can be raised as long as 'buf' in the 'Bytes' state
        // still has room for the prefix plus both encoded codepoints.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        // 'end' tracks how much of 'buf' has been filled so far.
        let mut end = prefix.len();
        buf[..end].copy_from_slice(prefix);
        end += ch1.encode_utf8(&mut buf[end..]).len();
        end += ch2.encode_utf8(&mut buf[end..]).len();
        UnescapeState::Bytes { buf, cur: 0, len: end }
    }
}
339 | |
/// Convert the given codepoint to the value (`0..=15`) of the hexadecimal
/// digit it represents. Both upper and lower case digits are accepted.
///
/// # Panics
///
/// This panics if `ch` is not in `[0-9A-Fa-f]`.
#[cfg(feature = "alloc")]
fn char_to_hexdigit(ch: char) -> u8 {
    let digit = ch.to_digit(16).expect("codepoint must be in [0-9A-Fa-f]");
    // A base-16 digit is at most 15, so the narrowing cannot fail.
    u8::try_from(digit).expect("hexadecimal digit always fits in a u8")
}
349 | |
/// Convert the given hexadecimal digit value (`0..=15`) to its corresponding
/// codepoint. Digits above nine are returned in upper case (`A` to `F`).
///
/// # Panics
///
/// This panics when `digit > 15`.
fn hexdigit_to_char(digit: u8) -> char {
    char::from_digit(u32::from(digit), 16)
        .expect("hexadecimal digit must be in 0..=15")
        .to_ascii_uppercase()
}
358 | |
/// Tests for escaping, unescaping and the handling of malformed escapes.
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::BString;

    use super::*;

    /// Builds a `BString` from anything that can be viewed as bytes.
    #[allow(non_snake_case)]
    fn B<B: AsRef<[u8]>>(bytes: B) -> BString {
        BString::from(bytes.as_ref())
    }

    /// Escapes `bytes` and collects the result into a `String`.
    fn e<B: AsRef<[u8]>>(bytes: B) -> String {
        EscapeBytes::new(bytes.as_ref()).to_string()
    }

    /// Unescapes `string` and collects the result into a `BString`.
    fn u(string: &str) -> BString {
        UnescapeBytes::new(string.chars()).collect()
    }

    #[test]
    fn escape() {
        assert_eq!(r"a", e(br"a"));
        assert_eq!(r"\\x61", e(br"\x61"));
        assert_eq!(r"a", e(b"\x61"));
        assert_eq!(r"~", e(b"\x7E"));
        assert_eq!(r"\x7F", e(b"\x7F"));
        // Space is outside of the pass-through range and is hex escaped.
        assert_eq!(r"\x20", e(b" "));

        assert_eq!(r"\n", e(b"\n"));
        assert_eq!(r"\r", e(b"\r"));
        assert_eq!(r"\t", e(b"\t"));
        assert_eq!(r"\\", e(b"\\"));
        assert_eq!(r"\0", e(b"\0"));
        assert_eq!(r"\0", e(b"\x00"));

        assert_eq!(r"\x88", e(b"\x88"));
        assert_eq!(r"\x8F", e(b"\x8F"));
        assert_eq!(r"\xF8", e(b"\xF8"));
        assert_eq!(r"\xFF", e(b"\xFF"));

        assert_eq!(r"\xE2", e(b"\xE2"));
        assert_eq!(r"\xE2\x98", e(b"\xE2\x98"));
        assert_eq!(r"☃", e(b"\xE2\x98\x83"));

        assert_eq!(r"\xF0", e(b"\xF0"));
        assert_eq!(r"\xF0\x9F", e(b"\xF0\x9F"));
        assert_eq!(r"\xF0\x9F\x92", e(b"\xF0\x9F\x92"));
        assert_eq!(r"💩", e(b"\xF0\x9F\x92\xA9"));
    }

    #[test]
    fn unescape() {
        assert_eq!(B(r"a"), u(r"a"));
        assert_eq!(B(r"\x61"), u(r"\\x61"));
        assert_eq!(B(r"a"), u(r"\x61"));
        assert_eq!(B(r"~"), u(r"\x7E"));
        assert_eq!(B(b"\x7F"), u(r"\x7F"));
        assert_eq!(B(b" "), u(r"\x20"));

        assert_eq!(B(b"\n"), u(r"\n"));
        assert_eq!(B(b"\r"), u(r"\r"));
        assert_eq!(B(b"\t"), u(r"\t"));
        assert_eq!(B(b"\\"), u(r"\\"));
        assert_eq!(B(b"\0"), u(r"\0"));
        assert_eq!(B(b"\0"), u(r"\x00"));

        assert_eq!(B(b"\x88"), u(r"\x88"));
        assert_eq!(B(b"\x8F"), u(r"\x8F"));
        assert_eq!(B(b"\xF8"), u(r"\xF8"));
        assert_eq!(B(b"\xFF"), u(r"\xFF"));

        assert_eq!(B(b"\xE2"), u(r"\xE2"));
        assert_eq!(B(b"\xE2\x98"), u(r"\xE2\x98"));
        assert_eq!(B("☃"), u(r"\xE2\x98\x83"));

        // Lower case hexadecimal digits are accepted as well.
        assert_eq!(B(b"\xF0"), u(r"\xf0"));
        assert_eq!(B(b"\xF0\x9F"), u(r"\xf0\x9f"));
        assert_eq!(B(b"\xF0\x9F\x92"), u(r"\xf0\x9f\x92"));
        assert_eq!(B("💩"), u(r"\xf0\x9f\x92\xa9"));
    }

    #[test]
    fn unescape_weird() {
        // Incomplete escape sequences unescape as themselves.
        assert_eq!(B(b"\\"), u(r"\"));
        assert_eq!(B(b"\\"), u(r"\\"));
        assert_eq!(B(b"\\x"), u(r"\x"));
        assert_eq!(B(b"\\xA"), u(r"\xA"));

        // Invalid escape sequences unescape as themselves.
        assert_eq!(B(b"\\xZ"), u(r"\xZ"));
        assert_eq!(B(b"\\xZZ"), u(r"\xZZ"));
        assert_eq!(B(b"\\i"), u(r"\i"));
        assert_eq!(B(b"\\u"), u(r"\u"));
        assert_eq!(B(b"\\u{2603}"), u(r"\u{2603}"));
    }
}
452 | |