escape_bytes.rs source code [crates/bstr/src/escape_bytes.rs]

1	/// An iterator of `char` values that represent an escaping of arbitrary bytes.
2	///
3	/// The lifetime parameter `'a` refers to the lifetime of the bytes being
4	/// escaped.
5	///
6	/// This iterator is created by the
7	/// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method.
8	#[derive(Clone, Debug)]
9	pub struct EscapeBytes<'a> {
10	remaining: &'a [u8],
11	state: EscapeState,
12	}
13
14	impl<'a> EscapeBytes<'a> {
15	pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes<'a> {
16	EscapeBytes { remaining: bytes, state: EscapeState::Start }
17	}
18	}
19
20	impl<'a> Iterator for EscapeBytes<'a> {
21	type Item = char;
22
23	#[inline]
24	fn next(&mut self) -> Option<char> {
25	use self::EscapeState::*;
26
27	match self.state {
28	Start => {
29	let byte = match crate::decode_utf8(self.remaining) {
30	(None, `0`) => return None,
31	// If we see invalid UTF-8 or ASCII, then we always just
32	// peel one byte off. If it's printable ASCII, we'll pass
33	// it through as-is below. Otherwise, below, it will get
34	// escaped in some way.
35	(None, _) \| (Some(_), `1`) => {
36	let byte = self.remaining[`0`];
37	self.remaining = &self.remaining[`1`..];
38	byte
39	}
40	// For any valid UTF-8 that is not ASCII, we pass it
41	// through as-is. We don't do any Unicode escaping.
42	(Some(ch), size) => {
43	self.remaining = &self.remaining[size..];
44	return Some(ch);
45	}
46	};
47	self.state = match byte {
48	`0x21`..=`0x5B` \| `0x5D`..=`0x7E` => {
49	return Some(char::from(byte))
50	}
51	b'`\0`' => SpecialEscape('0'),
52	b'`\n`' => SpecialEscape('n'),
53	b'`\r`' => SpecialEscape('r'),
54	b'`\t`' => SpecialEscape('t'),
55	b'`\\`' => SpecialEscape('`\\`'),
56	_ => HexEscapeX(byte),
57	};
58	Some('`\\`')
59	}
60	SpecialEscape(ch) => {
61	self.state = Start;
62	Some(ch)
63	}
64	HexEscapeX(byte) => {
65	self.state = HexEscapeHighNybble(byte);
66	Some('x')
67	}
68	HexEscapeHighNybble(byte) => {
69	self.state = HexEscapeLowNybble(byte);
70	let nybble = byte >> `4`;
71	Some(hexdigit_to_char(nybble))
72	}
73	HexEscapeLowNybble(byte) => {
74	self.state = Start;
75	let nybble = byte & `0xF`;
76	Some(hexdigit_to_char(nybble))
77	}
78	}
79	}
80	}
81
82	impl<'a> core::fmt::Display for EscapeBytes<'a> {
83	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
84	use core::fmt::Write;
85	for ch: char in self.clone() {
86	f.write_char(ch)?;
87	}
88	Ok(())
89	}
90	}
91
/// The states of the finite state machine that drives the escaping iterator.
#[derive(Debug, Clone)]
enum EscapeState {
    /// Consume the next byte from the front of 'remaining', or return None
    /// when 'remaining' is empty. The byte is then either emitted unchanged
    /// or escaped according to the rules below.
    ///
    /// For \n, \r, \t, \\ and \0, a '\' is emitted and the state becomes
    /// 'SpecialEscape(n | r | t | \ | 0)'. For any other byte that is not in
    /// [\x21-\x5B\x5D-\x7E], a '\' is emitted and the state becomes
    /// 'HexEscapeX(byte)'.
    Start,
    /// A '\' was just emitted. Emit the given codepoint unchanged and go
    /// back to 'Start'.
    SpecialEscape(char),
    /// A '\' was just emitted. Emit the 'x' of a hex escape and move on to
    /// 'HexEscapeHighNybble(byte)'.
    HexEscapeX(u8),
    /// '\x' was just emitted. Emit the high nybble of the byte as a
    /// hexadecimal digit and move on to 'HexEscapeLowNybble(byte)'.
    HexEscapeHighNybble(u8),
    /// '\xZ' was just emitted, where 'Z' is the high nybble of this byte.
    /// Emit the low nybble of the byte as a hexadecimal digit and go back
    /// to 'Start'.
    HexEscapeLowNybble(u8),
}
119
/// A `u8` iterator that yields the unescaped form of a sequence of
/// codepoints.
///
/// The type parameter `I` is the iterator of codepoints being unescaped.
///
/// This iterator is currently not part of the public crate API. The only
/// thing exposed is a `ByteVec::unescape` method, which of course needs an
/// alloc. That is the most convenient form, but in theory this could be
/// exposed for core-only use cases as well. It is just not clear yet what
/// that API should look like.
#[cfg(feature = "alloc")]
#[derive(Debug, Clone)]
pub(crate) struct UnescapeBytes<I> {
    /// The codepoints that still have to be unescaped.
    it: I,
    /// What the unescaping state machine does on the next step.
    state: UnescapeState,
}
137
#[cfg(feature = "alloc")]
impl<I> UnescapeBytes<I>
where
    I: Iterator<Item = char>,
{
    /// Builds an unescaping iterator that reads its codepoints from `t` and
    /// begins in the start state.
    pub(crate) fn new<T>(t: T) -> UnescapeBytes<I>
    where
        T: IntoIterator<IntoIter = I>,
    {
        let it = t.into_iter();
        UnescapeBytes { it, state: UnescapeState::Start }
    }
}
146
#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
    type Item = u8;

    /// Yields the next unescaped byte, or `None` once the underlying
    /// codepoint iterator is exhausted and no buffered bytes are pending.
    ///
    /// The recognized escapes are `\0`, `\\`, `\r`, `\n`, `\t` and `\xHH`
    /// (two hexadecimal digits, either case). Every other codepoint is
    /// emitted as its UTF-8 encoding. Incomplete or invalid escape sequences
    /// unescape as themselves.
    fn next(&mut self) -> Option<u8> {
        use self::UnescapeState::*;

        // States that produce no output update `self.state` and go around
        // the loop again. States that produce a byte return it.
        loop {
            match self.state {
                Start => {
                    let ch = self.it.next()?;
                    self.state = match ch {
                        '\\' => Escape,
                        // Not an escape: queue the UTF-8 encoding of the
                        // codepoint for emission.
                        ch => UnescapeState::bytes(&[], ch),
                    };
                }
                Bytes { buf, cur, len } => {
                    let byte = buf[cur];
                    let next = cur + 1;
                    // Leave the buffered state as soon as the last byte has
                    // been handed out, so `cur < len` holds on every visit.
                    self.state = if next >= len {
                        Start
                    } else {
                        Bytes { buf, cur: next, len }
                    };
                    return Some(byte);
                }
                Escape => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            self.state = Start;
                            // Incomplete escape sequences unescape as
                            // themselves.
                            return Some(b'\\');
                        }
                    };
                    let byte = match ch {
                        '0' => b'\x00',
                        '\\' => b'\\',
                        'r' => b'\r',
                        'n' => b'\n',
                        't' => b'\t',
                        'x' => {
                            self.state = HexFirst;
                            continue;
                        }
                        ch => {
                            // An invalid escape sequence unescapes as itself.
                            self.state = UnescapeState::bytes(&[b'\\'], ch);
                            continue;
                        }
                    };
                    self.state = Start;
                    return Some(byte);
                }
                HexFirst => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // An incomplete escape sequence unescapes as
                            // itself.
                            self.state = UnescapeState::bytes_raw(&[b'x']);
                            return Some(b'\\');
                        }
                    };
                    match ch {
                        '0'..='9' | 'A'..='F' | 'a'..='f' => {
                            self.state = HexSecond(ch);
                        }
                        ch => {
                            // An invalid escape sequence unescapes as itself.
                            self.state = UnescapeState::bytes(&[b'x'], ch);
                            return Some(b'\\');
                        }
                    }
                }
                HexSecond(first) => {
                    let second = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // An incomplete escape sequence unescapes as
                            // itself.
                            self.state =
                                UnescapeState::bytes(&[b'x'], first);
                            return Some(b'\\');
                        }
                    };
                    match second {
                        '0'..='9' | 'A'..='F' | 'a'..='f' => {
                            self.state = Start;
                            let hinybble = char_to_hexdigit(first);
                            let lonybble = char_to_hexdigit(second);
                            return Some((hinybble << 4) | lonybble);
                        }
                        ch => {
                            // An invalid escape sequence unescapes as itself.
                            self.state =
                                UnescapeState::bytes2(&[b'x'], first, ch);
                            return Some(b'\\');
                        }
                    }
                }
            }
        }
    }
}
268
/// The state used by the FSM in the unescaping iterator.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
enum UnescapeState {
    /// The start state. Look for an escape sequence, otherwise emit the next
    /// codepoint as-is.
    Start,
    /// Emit the byte at `buf[cur]`.
    ///
    /// This state should never be created when `cur >= len`. That is, when
    /// this state is visited, it is assumed that `cur < len`.
    ///
    /// The buffer holds 11 bytes: up to 3 prefix bytes followed by the UTF-8
    /// encodings of at most two codepoints, each at most 4 bytes long.
    Bytes { buf: [u8; 11], cur: usize, len: usize },
    /// This state is entered after a `\` is seen.
    Escape,
    /// This state is entered after a `\x` is seen.
    HexFirst,
    /// This state is entered after a `\xN` is seen, where `N` is in
    /// `[0-9A-Fa-f]`. The given codepoint corresponds to `N`.
    HexSecond(char),
}
289
#[cfg(feature = "alloc")]
impl UnescapeState {
    /// Create a new `Bytes` variant with the given slice.
    ///
    /// `bytes` is expected to be non-empty: the returned state starts at
    /// `cur == 0`, and the `Bytes` state assumes `cur < len` when visited.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len() > 11`.
    fn bytes_raw(bytes: &[u8]) -> UnescapeState {
        // This can be increased, you just need to make sure 'buf' in the
        // 'Bytes' state has enough room.
        assert!(bytes.len() <= 11, "no more than 11 bytes allowed");
        let mut buf = [0; 11];
        buf[..bytes.len()].copy_from_slice(bytes);
        UnescapeState::Bytes { buf, cur: 0, len: bytes.len() }
    }

    /// Create a new `Bytes` variant with the prefix byte slice, followed by
    /// the UTF-8 encoding of the given char.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes(prefix: &[u8], ch: char) -> UnescapeState {
        // This can be increased, you just need to make sure 'buf' in the
        // 'Bytes' state has enough room.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0; 11];
        buf[..prefix.len()].copy_from_slice(prefix);
        // Encode directly into the tail of the buffer, after the prefix.
        let chlen = ch.encode_utf8(&mut buf[prefix.len()..]).len();
        UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + chlen }
    }

    /// Create a new `Bytes` variant with the prefix byte slice, followed by
    /// the UTF-8 encoding of `ch1` and then `ch2`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState {
        // This can be increased, you just need to make sure 'buf' in the
        // 'Bytes' state has enough room.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0; 11];
        buf[..prefix.len()].copy_from_slice(prefix);
        // `ch1` goes right after the prefix and `ch2` right after `ch1`.
        let len1 = ch1.encode_utf8(&mut buf[prefix.len()..]).len();
        let len2 = ch2.encode_utf8(&mut buf[prefix.len() + len1..]).len();
        UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + len1 + len2 }
    }
}
339
/// Convert the given codepoint to its corresponding hexadecimal digit.
///
/// Both uppercase and lowercase hexadecimal digits are accepted.
///
/// # Panics
///
/// This panics if `ch` is not in `[0-9A-Fa-f]`.
#[cfg(feature = "alloc")]
fn char_to_hexdigit(ch: char) -> u8 {
    // `to_digit(16)` only ever yields values in `0..=15`, so the narrowing
    // conversion to `u8` cannot fail once the first `unwrap` has passed.
    u8::try_from(ch.to_digit(16).unwrap()).unwrap()
}
349
/// Convert the given hexadecimal digit to its corresponding codepoint.
///
/// Digits above 9 are returned as uppercase letters (`'A'` through `'F'`).
///
/// # Panics
///
/// This panics when `digit > 15`.
fn hexdigit_to_char(digit: u8) -> char {
    // `from_digit` produces lowercase letters, so uppercase the result.
    char::from_digit(u32::from(digit), 16).unwrap().to_ascii_uppercase()
}
358
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::string::{String, ToString};

    use crate::BString;

    use super::*;

    /// Builds a `BString` from anything that can be viewed as bytes.
    #[allow(non_snake_case)]
    fn B<B: AsRef<[u8]>>(bytes: B) -> BString {
        BString::from(bytes.as_ref())
    }

    /// Escapes `bytes` and collects the result into a `String`.
    fn e<B: AsRef<[u8]>>(bytes: B) -> String {
        EscapeBytes::new(bytes.as_ref()).to_string()
    }

    /// Unescapes `string` and collects the result into a `BString`.
    fn u(string: &str) -> BString {
        UnescapeBytes::new(string.chars()).collect()
    }

    #[test]
    fn escape() {
        assert_eq!(r"a", e(br"a"));
        assert_eq!(r"\\x61", e(br"\x61"));
        assert_eq!(r"a", e(b"\x61"));
        assert_eq!(r"~", e(b"\x7E"));
        assert_eq!(r"\x7F", e(b"\x7F"));

        assert_eq!(r"\n", e(b"\n"));
        assert_eq!(r"\r", e(b"\r"));
        assert_eq!(r"\t", e(b"\t"));
        assert_eq!(r"\\", e(b"\\"));
        assert_eq!(r"\0", e(b"\0"));
        assert_eq!(r"\0", e(b"\x00"));

        assert_eq!(r"\x88", e(b"\x88"));
        assert_eq!(r"\x8F", e(b"\x8F"));
        assert_eq!(r"\xF8", e(b"\xF8"));
        assert_eq!(r"\xFF", e(b"\xFF"));

        // Truncated UTF-8 is hex-escaped byte by byte; the complete
        // sequence passes through as the codepoint itself.
        assert_eq!(r"\xE2", e(b"\xE2"));
        assert_eq!(r"\xE2\x98", e(b"\xE2\x98"));
        assert_eq!(r"☃", e(b"\xE2\x98\x83"));

        assert_eq!(r"\xF0", e(b"\xF0"));
        assert_eq!(r"\xF0\x9F", e(b"\xF0\x9F"));
        assert_eq!(r"\xF0\x9F\x92", e(b"\xF0\x9F\x92"));
        assert_eq!(r"💩", e(b"\xF0\x9F\x92\xA9"));
    }

    #[test]
    fn unescape() {
        assert_eq!(B(r"a"), u(r"a"));
        assert_eq!(B(r"\x61"), u(r"\\x61"));
        assert_eq!(B(r"a"), u(r"\x61"));
        assert_eq!(B(r"~"), u(r"\x7E"));
        assert_eq!(B(b"\x7F"), u(r"\x7F"));

        assert_eq!(B(b"\n"), u(r"\n"));
        assert_eq!(B(b"\r"), u(r"\r"));
        assert_eq!(B(b"\t"), u(r"\t"));
        assert_eq!(B(b"\\"), u(r"\\"));
        assert_eq!(B(b"\0"), u(r"\0"));
        assert_eq!(B(b"\0"), u(r"\x00"));

        assert_eq!(B(b"\x88"), u(r"\x88"));
        assert_eq!(B(b"\x8F"), u(r"\x8F"));
        assert_eq!(B(b"\xF8"), u(r"\xF8"));
        assert_eq!(B(b"\xFF"), u(r"\xFF"));

        assert_eq!(B(b"\xE2"), u(r"\xE2"));
        assert_eq!(B(b"\xE2\x98"), u(r"\xE2\x98"));
        assert_eq!(B("☃"), u(r"\xE2\x98\x83"));

        // Lowercase hex digits are accepted as well.
        assert_eq!(B(b"\xF0"), u(r"\xf0"));
        assert_eq!(B(b"\xF0\x9F"), u(r"\xf0\x9f"));
        assert_eq!(B(b"\xF0\x9F\x92"), u(r"\xf0\x9f\x92"));
        assert_eq!(B("💩"), u(r"\xf0\x9f\x92\xa9"));
    }

    #[test]
    fn unescape_weird() {
        // Incomplete escape sequences unescape as themselves.
        assert_eq!(B(b"\\"), u(r"\"));
        assert_eq!(B(b"\\"), u(r"\\"));
        assert_eq!(B(b"\\x"), u(r"\x"));
        assert_eq!(B(b"\\xA"), u(r"\xA"));

        // Invalid escape sequences unescape as themselves too.
        assert_eq!(B(b"\\xZ"), u(r"\xZ"));
        assert_eq!(B(b"\\xZZ"), u(r"\xZZ"));
        assert_eq!(B(b"\\i"), u(r"\i"));
        assert_eq!(B(b"\\u"), u(r"\u"));
        assert_eq!(B(b"\\u{2603}"), u(r"\u{2603}"));
    }
}
454