escape.rs source code [crates/litrs/src/escape.rs]

1	use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}};
2
3
4	/// Must start with `\`
5	pub(crate) fn unescape<E: Escapee>(input: &str, offset: usize) -> Result<(E, usize), ParseError> {
6	let first = input.as_bytes().get(`1`)
7	.ok_or(perr(offset, UnterminatedEscape))?;
8	let out = match first {
9	// Quote escapes
10	b'`\'`' => (E::from_byte(b'`\'`'), `2`),
11	b'"' => (E::from_byte(b'"'), `2`),
12
13	// Ascii escapes
14	b'n' => (E::from_byte(b'`\n`'), `2`),
15	b'r' => (E::from_byte(b'`\r`'), `2`),
16	b't' => (E::from_byte(b'`\t`'), `2`),
17	b'`\\`' => (E::from_byte(b'`\\`'), `2`),
18	b'0' => (E::from_byte(b'`\0`'), `2`),
19	b'x' => {
20	let hex_string = input.get(`2`..`4`)
21	.ok_or(perr(offset..offset + input.len(), UnterminatedEscape))?
22	.as_bytes();
23	let first = hex_digit_value(hex_string[`0`])
24	.ok_or(perr(offset..offset + `4`, InvalidXEscape))?;
25	let second = hex_digit_value(hex_string[`1`])
26	.ok_or(perr(offset..offset + `4`, InvalidXEscape))?;
27	let value = second + `16` * first;
28
29	if E::SUPPORTS_UNICODE && value > `0x7F` {
30	return Err(perr(offset..offset + `4`, NonAsciiXEscape));
31	}
32
33	(E::from_byte(value), `4`)
34	},
35
36	// Unicode escape
37	b'u' => {
38	if !E::SUPPORTS_UNICODE {
39	return Err(perr(offset..offset + `2`, UnicodeEscapeInByteLiteral));
40	}
41
42	if input.as_bytes().get(`2`) != Some(&b'{') {
43	return Err(perr(offset..offset + `2`, UnicodeEscapeWithoutBrace));
44	}
45
46	let closing_pos = input.bytes().position(\|b\| b == b'}')
47	.ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?;
48
49	let inner = &input[`3`..closing_pos];
50	if inner.as_bytes().first() == Some(&b'_') {
51	return Err(perr(`4`, InvalidStartOfUnicodeEscape));
52	}
53
54	let mut v: u32 = `0`;
55	let mut digit_count = `0`;
56	for (i, b) in inner.bytes().enumerate() {
57	if b == b'_'{
58	continue;
59	}
60
61	let digit = hex_digit_value(b)
62	.ok_or(perr(offset + `3` + i, NonHexDigitInUnicodeEscape))?;
63
64	if digit_count == `6` {
65	return Err(perr(offset + `3` + i, TooManyDigitInUnicodeEscape));
66	}
67	digit_count += `1`;
68	v = `16` * v + digit as u32;
69	}
70
71	let c = std::char::from_u32(v)
72	.ok_or(perr(offset..closing_pos + `1`, InvalidUnicodeEscapeChar))?;
73
74	(E::from_char(c), closing_pos + `1`)
75	}
76
77	_ => return Err(perr(offset..offset + `2`, UnknownEscape)),
78	};
79
80	Ok(out)
81	}
82
/// A value that an escape sequence can evaluate to.
///
/// Implementors must be convertible into `char` so that unescaped values can
/// be pushed onto a `String`.
pub(crate) trait Escapee: Into<char> {
    /// Whether `\u{...}` escapes and non-ASCII content are permitted for this
    /// kind of value.
    const SUPPORTS_UNICODE: bool;

    /// Builds the value from a single byte (used for quote, ASCII and `\x`
    /// escapes).
    fn from_byte(b: u8) -> Self;

    /// Builds the value from the character denoted by a `\u{...}` escape.
    fn from_char(c: char) -> Self;
}
88
89	impl Escapee for u8 {
90	const SUPPORTS_UNICODE: bool = `false`;
91	fn from_byte(b: u8) -> Self {
92	b
93	}
94	fn from_char(_: char) -> Self {
95	panic!("bug: `<u8 as Escapee>::from_char` was called");
96	}
97	}
98
99	impl Escapee for char {
100	const SUPPORTS_UNICODE: bool = `true`;
101	fn from_byte(b: u8) -> Self {
102	b.into()
103	}
104	fn from_char(c: char) -> Self {
105	c
106	}
107	}
108
/// Returns `true` for the bytes that are skipped after the start of a string
/// continue (an unescaped backslash followed by `\n`): space, tab, line feed
/// and carriage return.
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
114
115	/// Unescapes a whole string or byte string.
116	#[inline(never)]
117	pub(crate) fn unescape_string<E: Escapee>(
118	input: &str,
119	offset: usize,
120	) -> Result<(Option<String>, usize), ParseError> {
121	let mut closing_quote_pos = None;
122	let mut i = offset;
123	let mut end_last_escape = offset;
124	let mut value = String::new();
125	while i < input.len() {
126	match input.as_bytes()[i] {
127	// Handle "string continue".
128	b'`\\`' if input.as_bytes().get(i + `1`) == Some(&b'`\n`') => {
129	value.push_str(&input[end_last_escape..i]);
130
131	// Find the first non-whitespace character.
132	let end_escape = input[i + `2`..].bytes()
133	.position(\|b\| !is_string_continue_skipable_whitespace(b))
134	.ok_or(perr(None, UnterminatedString))?;
135
136	i += `2` + end_escape;
137	end_last_escape = i;
138	}
139	b'`\\`' => {
140	let (c, len) = unescape::<E>(&input[i..input.len() - `1`], i)?;
141	value.push_str(&input[end_last_escape..i]);
142	value.push(c.into());
143	i += len;
144	end_last_escape = i;
145	}
146	b'`\r`' => {
147	if input.as_bytes().get(i + `1`) == Some(&b'`\n`') {
148	value.push_str(&input[end_last_escape..i]);
149	value.push('`\n`');
150	i += `2`;
151	end_last_escape = i;
152	} else {
153	return Err(perr(i, IsolatedCr))
154	}
155	}
156	b'"' => {
157	closing_quote_pos = Some(i);
158	break;
159	},
160	b if !E::SUPPORTS_UNICODE && !b.is_ascii()
161	=> return Err(perr(i, NonAsciiInByteLiteral)),
162	_ => i += `1`,
163	}
164	}
165
166	let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?;
167
168	let start_suffix = closing_quote_pos + `1`;
169	let suffix = &input[start_suffix..];
170	check_suffix(suffix).map_err(\|kind\| perr(start_suffix, kind))?;
171
172	// `value` is only empty if there was no escape in the input string
173	// (with the special case of the input being empty). This means the
174	// string value basically equals the input, so we store `None`.
175	let value = if value.is_empty() {
176	None
177	} else {
178	// There was an escape in the string, so we need to push the
179	// remaining unescaped part of the string still.
180	value.push_str(&input[end_last_escape..closing_quote_pos]);
181	Some(value)
182	};
183
184	Ok((value, start_suffix))
185	}
186
187	/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to
188	/// just `\n` sequences. Returns an optional new string (if the input contained
189	/// any `\r\n`) and the number of hashes used by the literal.
190	#[inline(never)]
191	pub(crate) fn scan_raw_string<E: Escapee>(
192	input: &str,
193	offset: usize,
194	) -> Result<(Option<String>, u32, usize), ParseError> {
195	// Raw string literal
196	let num_hashes = input[offset..].bytes().position(\|b\| b != b'#')
197	.ok_or(perr(None, InvalidLiteral))?;
198
199	if input.as_bytes().get(offset + num_hashes) != Some(&b'"') {
200	return Err(perr(None, InvalidLiteral));
201	}
202	let start_inner = offset + num_hashes + `1`;
203	let hashes = &input[offset..num_hashes + offset];
204
205	let mut closing_quote_pos = None;
206	let mut i = start_inner;
207	let mut end_last_escape = start_inner;
208	let mut value = String::new();
209	while i < input.len() {
210	let b = input.as_bytes()[i];
211	if b == b'"' && input[i + `1`..].starts_with(hashes) {
212	closing_quote_pos = Some(i);
213	break;
214	}
215
216	if b == b'`\r`' {
217	// Convert `\r\n` into `\n`. This is currently not well documented
218	// in the Rust reference, but is done even for raw strings. That's
219	// because rustc simply converts all line endings when reading
220	// source files.
221	if input.as_bytes().get(i + `1`) == Some(&b'`\n`') {
222	value.push_str(&input[end_last_escape..i]);
223	value.push('`\n`');
224	i += `2`;
225	end_last_escape = i;
226	continue;
227	} else if E::SUPPORTS_UNICODE {
228	// If no \n follows the \r and we are scanning a raw string
229	// (not raw byte string), we error.
230	return Err(perr(i, IsolatedCr))
231	}
232	}
233
234	if !E::SUPPORTS_UNICODE {
235	if !b.is_ascii() {
236	return Err(perr(i, NonAsciiInByteLiteral));
237	}
238	}
239
240	i += `1`;
241	}
242
243	let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?;
244
245	let start_suffix = closing_quote_pos + num_hashes + `1`;
246	let suffix = &input[start_suffix..];
247	check_suffix(suffix).map_err(\|kind\| perr(start_suffix, kind))?;
248
249	// `value` is only empty if there was no \r\n in the input string (with the
250	// special case of the input being empty). This means the string value
251	// equals the input, so we store `None`.
252	let value = if value.is_empty() {
253	None
254	} else {
255	// There was an \r\n in the string, so we need to push the remaining
256	// unescaped part of the string still.
257	value.push_str(&input[end_last_escape..closing_quote_pos]);
258	Some(value)
259	};
260
261	Ok((value, num_hashes as u32, start_suffix))
262	}
263