lib.rs source code [crates/rustc_literal_escaper/src/lib.rs]

1	//! Utilities for validating string and char literals and turning them into
2	//! values they represent.
3
4	use std::ops::Range;
5	use std::str::Chars;
6
7	use Mode::*;
8
9	#[cfg(test)]
10	mod tests;
11
/// Errors and warnings that can occur during string unescaping. They mostly
/// relate to malformed escape sequences, but there are a few that are about
/// other problems.
///
/// Use [`EscapeError::is_fatal`] to tell real errors from warnings.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Expected 1 char, but 0 were found.
    ZeroChars,
    /// Expected 1 char, but more than 1 were found.
    MoreThanOneChar,

    /// Escaped '\' character without continuation.
    LoneSlash,
    /// Invalid escape character (e.g. '\z').
    InvalidEscape,
    /// Raw '\r' encountered.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    BareCarriageReturnInRawString,
    /// Unescaped character that was expected to be escaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// Numeric character escape is too short (e.g. '\x1').
    TooShortHexEscape,
    /// Invalid character in numeric escape (e.g. '\xz')
    InvalidCharInHexEscape,
    /// Character code in numeric escape is non-ascii (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal value in '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// '\u{}'
    EmptyUnicodeEscape,
    /// No closing brace in '\u{..}', e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// '\u{_12}'
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
    OverlongUnicodeEscape,
    /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

    /// `\0` in a C string literal.
    NulInCStr,

    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// After a line ending with '\', multiple lines are skipped.
    MultipleSkippedLinesWarning,
}
72
73	impl EscapeError {
74	/// Returns true for actual errors, as opposed to warnings.
75	pub fn is_fatal(&self) -> bool {
76	!matches!(
77	self,
78	EscapeError::UnskippedWhitespaceWarning \| EscapeError::MultipleSkippedLinesWarning
79	)
80	}
81	}
82
83	/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
84	/// quotes) and produces a sequence of escaped characters or errors.
85	///
86	/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
87	/// the callback will be called exactly once.
88	pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
89	where
90	F: FnMut(Range<usize>, Result<char, EscapeError>),
91	{
92	match mode {
93	Char \| Byte => {
94	let mut chars: Chars<'_> = src.chars();
95	let res: Result = unescape_char_or_byte(&mut chars, mode);
96	callback(`0`..(src.len() - chars.as_str().len()), res);
97	}
98	Str \| ByteStr => unescape_non_raw_common(src, mode, callback),
99	RawStr \| RawByteStr => check_raw_common(src, mode, callback),
100	RawCStr => check_raw_common(src, mode, &mut \|r: Range, mut result: Result\| {
101	if let Ok('`\0`') = result {
102	result = Err(EscapeError::NulInCStr);
103	}
104	callback(r, result)
105	}),
106	CStr => unreachable!(),
107	}
108	}
109
/// Used for mixed utf8 string literals, i.e. those that allow both unicode
/// chars and high bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MixedUnit {
    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
    /// and Unicode chars (written directly or via `\u` escapes).
    ///
    /// For example, if '¥' appears in a string it is represented here as
    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
    Char(char),

    /// Used for high bytes (`\x80`..`\xff`).
    ///
    /// For example, if `\xa5` appears in a string it is represented here as
    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
    /// byte string as the single byte `0xa5`.
    HighByte(u8),
}
128
129	impl From<char> for MixedUnit {
130	fn from(c: char) -> Self {
131	MixedUnit::Char(c)
132	}
133	}
134
135	impl From<u8> for MixedUnit {
136	fn from(n: u8) -> Self {
137	if n.is_ascii() {
138	MixedUnit::Char(n as char)
139	} else {
140	MixedUnit::HighByte(n)
141	}
142	}
143	}
144
145	/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
146	/// a sequence of escaped characters or errors.
147	///
148	/// Values are returned by invoking `callback`.
149	pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
150	where
151	F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
152	{
153	match mode {
154	CStr => unescape_non_raw_common(src, mode, &mut \|r: Range, mut result: Result\| {
155	if let Ok(MixedUnit::Char('`\0`')) = result {
156	result = Err(EscapeError::NulInCStr);
157	}
158	callback(r, result)
159	}),
160	Char \| Byte \| Str \| RawStr \| ByteStr \| RawByteStr \| RawCStr => unreachable!(),
161	}
162	}
163
164	/// Takes a contents of a char literal (without quotes), and returns an
165	/// unescaped char or an error.
166	pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
167	unescape_char_or_byte(&mut src.chars(), mode:Char)
168	}
169
170	/// Takes a contents of a byte literal (without quotes), and returns an
171	/// unescaped byte or an error.
172	pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
173	unescape_char_or_byte(&mut src.chars(), Byte).map(op:byte_from_char)
174	}
175
/// What kind of literal do we parse.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// A char literal (no prefix, single quotes).
    Char,

    /// A byte literal (`b` prefix, single quotes).
    Byte,

    /// A string literal (no prefix, double quotes).
    Str,
    /// A raw string literal.
    RawStr,

    /// A byte string literal (`b` prefix, double quotes).
    ByteStr,
    /// A raw byte string literal.
    RawByteStr,

    /// A C string literal (`c` prefix, double quotes).
    CStr,
    /// A raw C string literal.
    RawCStr,
}
192
193	impl Mode {
194	pub fn in_double_quotes(self) -> bool {
195	match self {
196	Str \| RawStr \| ByteStr \| RawByteStr \| CStr \| RawCStr => `true`,
197	Char \| Byte => `false`,
198	}
199	}
200
201	/// Are `\x80`..`\xff` allowed?
202	fn allow_high_bytes(self) -> bool {
203	match self {
204	Char \| Str => `false`,
205	Byte \| ByteStr \| CStr => `true`,
206	RawStr \| RawByteStr \| RawCStr => unreachable!(),
207	}
208	}
209
210	/// Are unicode (non-ASCII) chars allowed?
211	#[inline]
212	fn allow_unicode_chars(self) -> bool {
213	match self {
214	Byte \| ByteStr \| RawByteStr => `false`,
215	Char \| Str \| RawStr \| CStr \| RawCStr => `true`,
216	}
217	}
218
219	/// Are unicode escapes (`\u`) allowed?
220	fn allow_unicode_escapes(self) -> bool {
221	match self {
222	Byte \| ByteStr => `false`,
223	Char \| Str \| CStr => `true`,
224	RawByteStr \| RawStr \| RawCStr => unreachable!(),
225	}
226	}
227
228	pub fn prefix_noraw(self) -> &'static str {
229	match self {
230	Char \| Str \| RawStr => "",
231	Byte \| ByteStr \| RawByteStr => "b",
232	CStr \| RawCStr => "c",
233	}
234	}
235	}
236
237	fn scan_escape<T: From<char> + From<u8>>(
238	chars: &mut Chars<'_>,
239	mode: Mode,
240	) -> Result<T, EscapeError> {
241	// Previous character was '\\', unescape what follows.
242	let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
243	'"' => '"',
244	'n' => '`\n`',
245	'r' => '`\r`',
246	't' => '`\t`',
247	'`\\`' => '`\\`',
248	'`\'`' => '`\'`',
249	'0' => '`\0`',
250	'x' => {
251	// Parse hexadecimal character code.
252
253	let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
254	let hi = hi.to_digit(`16`).ok_or(EscapeError::InvalidCharInHexEscape)?;
255
256	let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
257	let lo = lo.to_digit(`16`).ok_or(EscapeError::InvalidCharInHexEscape)?;
258
259	let value = (hi * `16` + lo) as u8;
260
261	return if !mode.allow_high_bytes() && !value.is_ascii() {
262	Err(EscapeError::OutOfRangeHexEscape)
263	} else {
264	// This may be a high byte, but that will only happen if `T` is
265	// `MixedUnit`, because of the `allow_high_bytes` check above.
266	Ok(T::from(value))
267	};
268	}
269	'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
270	_ => return Err(EscapeError::InvalidEscape),
271	};
272	Ok(T::from(res))
273	}
274
275	fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
276	// We've parsed '\u', now we have to parse '{..}'.
277
278	if chars.next() != Some('{') {
279	return Err(EscapeError::NoBraceInUnicodeEscape);
280	}
281
282	// First character must be a hexadecimal digit.
283	let mut n_digits = `1`;
284	let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
285	'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
286	'}' => return Err(EscapeError::EmptyUnicodeEscape),
287	c => c
288	.to_digit(`16`)
289	.ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
290	};
291
292	// First character is valid, now parse the rest of the number
293	// and closing brace.
294	loop {
295	match chars.next() {
296	None => return Err(EscapeError::UnclosedUnicodeEscape),
297	Some('_') => continue,
298	Some('}') => {
299	if n_digits > `6` {
300	return Err(EscapeError::OverlongUnicodeEscape);
301	}
302
303	// Incorrect syntax has higher priority for error reporting
304	// than unallowed value for a literal.
305	if !allow_unicode_escapes {
306	return Err(EscapeError::UnicodeEscapeInByte);
307	}
308
309	break std::char::from_u32(value).ok_or({
310	if value > `0x10FFFF` {
311	EscapeError::OutOfRangeUnicodeEscape
312	} else {
313	EscapeError::LoneSurrogateUnicodeEscape
314	}
315	});
316	}
317	Some(c) => {
318	let digit: u32 = c
319	.to_digit(`16`)
320	.ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
321	n_digits += `1`;
322	if n_digits > `6` {
323	// Stop updating value since we're sure that it's incorrect already.
324	continue;
325	}
326	value = value * `16` + digit;
327	}
328	};
329	}
330	}
331
332	#[inline]
333	fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
334	if allow_unicode_chars \|\| c.is_ascii() {
335	Ok(c)
336	} else {
337	Err(EscapeError::NonAsciiCharInByte)
338	}
339	}
340
341	fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
342	let c: char = chars.next().ok_or(err:EscapeError::ZeroChars)?;
343	let res: char = match c {
344	'`\\`' => scan_escape(chars, mode),
345	'`\n`' \| '`\t`' \| '`\'`' => Err(EscapeError::EscapeOnlyChar),
346	'`\r`' => Err(EscapeError::BareCarriageReturn),
347	_ => ascii_check(c, mode.allow_unicode_chars()),
348	}?;
349	if chars.next().is_some() {
350	return Err(EscapeError::MoreThanOneChar);
351	}
352	Ok(res)
353	}
354
355	/// Takes a contents of a string literal (without quotes) and produces a
356	/// sequence of escaped characters or errors.
357	fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
358	where
359	F: FnMut(Range<usize>, Result<T, EscapeError>),
360	{
361	let mut chars = src.chars();
362	let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
363
364	// The `start` and `end` computation here is complicated because
365	// `skip_ascii_whitespace` makes us to skip over chars without counting
366	// them in the range computation.
367	while let Some(c) = chars.next() {
368	let start = src.len() - chars.as_str().len() - c.len_utf8();
369	let res = match c {
370	'`\\`' => {
371	match chars.clone().next() {
372	Some('`\n`') => {
373	// Rust language specification requires us to skip whitespaces
374	// if unescaped '\' character is followed by '\n'.
375	// For details see [Rust language reference]
376	// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
377	skip_ascii_whitespace(&mut chars, start, &mut \|range, err\| {
378	callback(range, Err(err))
379	});
380	continue;
381	}
382	_ => scan_escape::<T>(&mut chars, mode),
383	}
384	}
385	'"' => Err(EscapeError::EscapeOnlyChar),
386	'`\r`' => Err(EscapeError::BareCarriageReturn),
387	_ => ascii_check(c, allow_unicode_chars).map(T::from),
388	};
389	let end = src.len() - chars.as_str().len();
390	callback(start..end, res);
391	}
392	}
393
394	fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
395	where
396	F: FnMut(Range<usize>, EscapeError),
397	{
398	let tail: &str = chars.as_str();
399	let first_non_space: usize = tail
400	.bytes()
401	.position(\|b\| b != b' ' && b != b'`\t`' && b != b'`\n`' && b != b'`\r`')
402	.unwrap_or(default:tail.len());
403	if tail[`1`..first_non_space].contains('`\n`') {
404	// The +1 accounts for the escaping slash.
405	let end: usize = start + first_non_space + `1`;
406	callback(start..end, EscapeError::MultipleSkippedLinesWarning);
407	}
408	let tail: &str = &tail[first_non_space..];
409	if let Some(c: char) = tail.chars().next() {
410	if c.is_whitespace() {
411	// For error reporting, we would like the span to contain the character that was not
412	// skipped. The +1 is necessary to account for the leading \ that started the escape.
413	let end: usize = start + first_non_space + c.len_utf8() + `1`;
414	callback(start..end, EscapeError::UnskippedWhitespaceWarning);
415	}
416	}
417	*chars = tail.chars();
418	}
419
420	/// Takes a contents of a string literal (without quotes) and produces a
421	/// sequence of characters or errors.
422	/// NOTE: Raw strings do not perform any explicit character escaping, here we
423	/// only produce errors on bare CR.
424	fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
425	where
426	F: FnMut(Range<usize>, Result<char, EscapeError>),
427	{
428	let mut chars: Chars<'_> = src.chars();
429	let allow_unicode_chars: bool = mode.allow_unicode_chars(); // get this outside the loop
430
431	// The `start` and `end` computation here matches the one in
432	// `unescape_non_raw_common` for consistency, even though this function
433	// doesn't have to worry about skipping any chars.
434	while let Some(c: char) = chars.next() {
435	let start: usize = src.len() - chars.as_str().len() - c.len_utf8();
436	let res: Result = match c {
437	'`\r`' => Err(EscapeError::BareCarriageReturnInRawString),
438	_ => ascii_check(c, allow_unicode_chars),
439	};
440	let end: usize = src.len() - chars.as_str().len();
441	callback(start..end, res);
442	}
443	}
444
/// Converts a `char` into the byte with the same numeric value.
///
/// The code point is expected to fit in a `u8`. This is only checked by a
/// debug assertion; without debug assertions a larger value is truncated by
/// the final cast.
#[inline]
pub fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    debug_assert!(code_point <= u32::from(u8::MAX), "guaranteed because of ByteStr");
    code_point as u8
}
451