1//! Utilities for validating (raw) string, char, and byte literals and
2//! turning escape sequences into the values they represent.
3
4use std::ffi::CStr;
5use std::num::NonZero;
6use std::ops::Range;
7use std::str::Chars;
8
9#[cfg(test)]
10mod tests;
11
/// Problems (errors and warnings) reported while unescaping string, char, and byte literals.
///
/// Most variants describe a malformed escape sequence; a few cover other issues.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// A single char was expected, but the literal is empty.
    ZeroChars,
    /// A single char was expected, but the literal holds more than one.
    MoreThanOneChar,

    /// A '\' with nothing after it.
    LoneSlash,
    /// The character after '\' does not form a known escape (e.g. '\z').
    InvalidEscape,
    /// A raw '\r' was found.
    BareCarriageReturn,
    /// A raw '\r' was found inside a raw string.
    BareCarriageReturnInRawString,
    /// A character that must be written as an escape appeared unescaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// The numeric character escape has too few digits (e.g. '\x1').
    TooShortHexEscape,
    /// The numeric escape contains a non-hexadecimal character (e.g. '\xz').
    InvalidCharInHexEscape,
    /// The numeric escape names a non-ascii character code (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' is not followed by '{'.
    NoBraceInUnicodeEscape,
    /// A non-hexadecimal character inside '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// '\u{}'
    EmptyUnicodeEscape,
    /// '\u{..}' is missing its closing brace, e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// '\u{_12}'
    LeadingUnderscoreUnicodeEscape,
    /// '\u{..}' holds more than 6 hex digits, e.g. '\u{10FFFF_FF}'.
    OverlongUnicodeEscape,
    /// The code is in range but is not a valid character, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// The code lies beyond the unicode range, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// A unicode escape was used in a byte literal.
    UnicodeEscapeInByte,
    /// A non-ascii character appeared in a byte literal, byte string literal,
    /// or raw byte string literal.
    NonAsciiCharInByte,

    /// `\0` in a C string literal.
    NulInCStr,

    /// Warning: after a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// Warning: after a line ending with '\', multiple lines are skipped.
    MultipleSkippedLinesWarning,
}
72
73impl EscapeError {
74 /// Returns true for actual errors, as opposed to warnings.
75 pub fn is_fatal(&self) -> bool {
76 !matches!(
77 self,
78 EscapeError::UnskippedWhitespaceWarning | EscapeError::MultipleSkippedLinesWarning
79 )
80 }
81}
82
83/// Check a raw string literal for validity
84///
85/// Takes the contents of a raw string literal (without quotes)
86/// and produces a sequence of characters or errors,
87/// which are returned by invoking `callback`.
88/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
89pub fn check_raw_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
90 str::check_raw(src, callback);
91}
92
93/// Check a raw byte string literal for validity
94///
95/// Takes the contents of a raw byte string literal (without quotes)
96/// and produces a sequence of bytes or errors,
97/// which are returned by invoking `callback`.
98/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
99pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8, EscapeError>)) {
100 <[u8]>::check_raw(src, callback);
101}
102
103/// Check a raw C string literal for validity
104///
105/// Takes the contents of a raw C string literal (without quotes)
106/// and produces a sequence of characters or errors,
107/// which are returned by invoking `callback`.
108/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
109pub fn check_raw_c_str(
110 src: &str,
111 callback: impl FnMut(Range<usize>, Result<NonZero<char>, EscapeError>),
112) {
113 CStr::check_raw(src, callback);
114}
115
116/// Trait for checking raw string literals for validity
117trait CheckRaw {
118 /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
119 type RawUnit;
120
121 /// Converts chars to the unit type of the literal type
122 fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError>;
123
124 /// Takes the contents of a raw literal (without quotes)
125 /// and produces a sequence of `Result<Self::RawUnit, EscapeError>`
126 /// which are returned via `callback`.
127 ///
128 /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
129 fn check_raw(
130 src: &str,
131 mut callback: impl FnMut(Range<usize>, Result<Self::RawUnit, EscapeError>),
132 ) {
133 let mut chars = src.chars();
134 while let Some(c) = chars.next() {
135 let start = src.len() - chars.as_str().len() - c.len_utf8();
136 let res = match c {
137 '\r' => Err(EscapeError::BareCarriageReturnInRawString),
138 _ => Self::char2raw_unit(c),
139 };
140 let end = src.len() - chars.as_str().len();
141 callback(start..end, res);
142 }
143
144 // Unfortunately, it is a bit unclear whether the following equivalent code is slower or faster: bug 141855
145 // src.char_indices().for_each(|(pos, c)| {
146 // callback(
147 // pos..pos + c.len_utf8(),
148 // if c == '\r' {
149 // Err(EscapeError::BareCarriageReturnInRawString)
150 // } else {
151 // Self::char2raw_unit(c)
152 // },
153 // );
154 // });
155 }
156}
157
158impl CheckRaw for str {
159 type RawUnit = char;
160
161 #[inline]
162 fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
163 Ok(c)
164 }
165}
166
167impl CheckRaw for [u8] {
168 type RawUnit = u8;
169
170 #[inline]
171 fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
172 char2byte(c)
173 }
174}
175
176/// Turn an ascii char into a byte
177#[inline]
178fn char2byte(c: char) -> Result<u8, EscapeError> {
179 // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte)
180 if c.is_ascii() {
181 Ok(c as u8)
182 } else {
183 Err(EscapeError::NonAsciiCharInByte)
184 }
185}
186
187impl CheckRaw for CStr {
188 type RawUnit = NonZero<char>;
189
190 #[inline]
191 fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
192 NonZero::new(c).ok_or(err:EscapeError::NulInCStr)
193 }
194}
195
196/// Unescape a char literal
197///
198/// Takes the contents of a char literal (without quotes),
199/// and returns an unescaped char or an error.
200#[inline]
201pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
202 str::unescape_single(&mut src.chars())
203}
204
205/// Unescape a byte literal
206///
207/// Takes the contents of a byte literal (without quotes),
208/// and returns an unescaped byte or an error.
209#[inline]
210pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
211 <[u8]>::unescape_single(&mut src.chars())
212}
213
214/// Unescape a string literal
215///
216/// Takes the contents of a string literal (without quotes)
217/// and produces a sequence of escaped characters or errors,
218/// which are returned by invoking `callback`.
219pub fn unescape_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
220 str::unescape(src, callback)
221}
222
223/// Unescape a byte string literal
224///
225/// Takes the contents of a byte string literal (without quotes)
226/// and produces a sequence of escaped bytes or errors,
227/// which are returned by invoking `callback`.
228pub fn unescape_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8, EscapeError>)) {
229 <[u8]>::unescape(src, callback)
230}
231
232/// Unescape a C string literal
233///
234/// Takes the contents of a C string literal (without quotes)
235/// and produces a sequence of escaped MixedUnits or errors,
236/// which are returned by invoking `callback`.
237pub fn unescape_c_str(
238 src: &str,
239 callback: impl FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
240) {
241 CStr::unescape(src, callback)
242}
243
244/// Enum representing either a char or a byte
245///
246/// Used for mixed utf8 string literals, i.e. those that allow both unicode
247/// chars and high bytes.
248#[derive(Copy, Clone, Debug, PartialEq, Eq)]
249pub enum MixedUnit {
250 /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
251 /// and Unicode chars (written directly or via `\u` escapes).
252 ///
253 /// For example, if '¥' appears in a string it is represented here as
254 /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
255 /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
256 Char(NonZero<char>),
257
258 /// Used for high bytes (`\x80`..`\xff`).
259 ///
260 /// For example, if `\xa5` appears in a string it is represented here as
261 /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
262 /// byte string as the single byte `0xa5`.
263 HighByte(NonZero<u8>),
264}
265
266impl From<NonZero<char>> for MixedUnit {
267 #[inline]
268 fn from(c: NonZero<char>) -> Self {
269 MixedUnit::Char(c)
270 }
271}
272
273impl From<NonZero<u8>> for MixedUnit {
274 #[inline]
275 fn from(byte: NonZero<u8>) -> Self {
276 if byte.get().is_ascii() {
277 MixedUnit::Char(NonZero::new(byte.get() as char).unwrap())
278 } else {
279 MixedUnit::HighByte(byte)
280 }
281 }
282}
283
284impl TryFrom<char> for MixedUnit {
285 type Error = EscapeError;
286
287 #[inline]
288 fn try_from(c: char) -> Result<Self, EscapeError> {
289 NonZero::new(c)
290 .map(MixedUnit::Char)
291 .ok_or(err:EscapeError::NulInCStr)
292 }
293}
294
295impl TryFrom<u8> for MixedUnit {
296 type Error = EscapeError;
297
298 #[inline]
299 fn try_from(byte: u8) -> Result<Self, EscapeError> {
300 NonZero::new(byte)
301 .map(From::from)
302 .ok_or(err:EscapeError::NulInCStr)
303 }
304}
305
306/// Trait for unescaping escape sequences in strings
307trait Unescape {
308 /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
309 type Unit;
310
311 /// Result of unescaping the zero char ('\0')
312 const ZERO_RESULT: Result<Self::Unit, EscapeError>;
313
314 /// Converts non-zero bytes to the unit type
315 fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit;
316
317 /// Converts chars to the unit type
318 fn char2unit(c: char) -> Result<Self::Unit, EscapeError>;
319
320 /// Converts the byte of a hex escape to the unit type
321 fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError>;
322
323 /// Converts the result of a unicode escape to the unit type
324 fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError>;
325
326 /// Unescape a single unit (single quote syntax)
327 fn unescape_single(chars: &mut Chars<'_>) -> Result<Self::Unit, EscapeError> {
328 let res = match chars.next().ok_or(EscapeError::ZeroChars)? {
329 '\\' => Self::unescape_1(chars),
330 '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
331 '\r' => Err(EscapeError::BareCarriageReturn),
332 c => Self::char2unit(c),
333 }?;
334 if chars.next().is_some() {
335 return Err(EscapeError::MoreThanOneChar);
336 }
337 Ok(res)
338 }
339
340 /// Unescape the first unit of a string (double quoted syntax)
341 fn unescape_1(chars: &mut Chars<'_>) -> Result<Self::Unit, EscapeError> {
342 // Previous character was '\\', unescape what follows.
343 let c = chars.next().ok_or(EscapeError::LoneSlash)?;
344 if c == '0' {
345 Self::ZERO_RESULT
346 } else {
347 simple_escape(c)
348 .map(|b| Self::nonzero_byte2unit(b))
349 .or_else(|c| match c {
350 'x' => Self::hex2unit(hex_escape(chars)?),
351 'u' => Self::unicode2unit({
352 let value = unicode_escape(chars)?;
353 if value > char::MAX as u32 {
354 Err(EscapeError::OutOfRangeUnicodeEscape)
355 } else {
356 char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
357 }
358 }),
359 _ => Err(EscapeError::InvalidEscape),
360 })
361 }
362 }
363
364 /// Unescape a string literal
365 ///
366 /// Takes the contents of a raw string literal (without quotes)
367 /// and produces a sequence of `Result<Self::Unit, EscapeError>`
368 /// which are returned via `callback`.
369 fn unescape(
370 src: &str,
371 mut callback: impl FnMut(Range<usize>, Result<Self::Unit, EscapeError>),
372 ) {
373 let mut chars = src.chars();
374 while let Some(c) = chars.next() {
375 let start = src.len() - chars.as_str().len() - c.len_utf8();
376 let res = match c {
377 '\\' => {
378 if let Some(b'\n') = chars.as_str().as_bytes().first() {
379 let _ = chars.next();
380 // skip whitespace for backslash newline, see [Rust language reference]
381 // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
382 let callback_err = |range, err| callback(range, Err(err));
383 skip_ascii_whitespace(&mut chars, start, callback_err);
384 continue;
385 } else {
386 Self::unescape_1(&mut chars)
387 }
388 }
389 '"' => Err(EscapeError::EscapeOnlyChar),
390 '\r' => Err(EscapeError::BareCarriageReturn),
391 c => Self::char2unit(c),
392 };
393 let end = src.len() - chars.as_str().len();
394 callback(start..end, res);
395 }
396 }
397}
398
/// Interpret a non-nul ASCII escape
///
/// Parses the character of an ASCII escape (except nul) without the leading backslash.
///
/// Returns the escaped byte for `"`, `n`, `r`, `t`, `\` and `'`. Any other
/// character is handed back unchanged as the `Err` value so the caller can
/// try the remaining escape kinds.
#[inline] // single use in Unescape::unescape_1
fn simple_escape(c: char) -> Result<NonZero<u8>, char> {
    // Previous character was '\\', unescape what follows.
    let byte = match c {
        '"' => b'"',
        'n' => b'\n',
        'r' => b'\r',
        't' => b'\t',
        '\\' => b'\\',
        '\'' => b'\'',
        _ => return Err(c),
    };
    // None of the bytes listed above is zero.
    Ok(NonZero::new(byte).expect("simple escapes never produce a NUL byte"))
}
416
417/// Interpret a hexadecimal escape
418///
419/// Parses the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
420#[inline] // single use in Unescape::unescape_1
421fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
422 let hi: char = chars.next().ok_or(err:EscapeError::TooShortHexEscape)?;
423 let hi: u32 = hi.to_digit(16).ok_or(err:EscapeError::InvalidCharInHexEscape)?;
424
425 let lo: char = chars.next().ok_or(err:EscapeError::TooShortHexEscape)?;
426 let lo: u32 = lo.to_digit(16).ok_or(err:EscapeError::InvalidCharInHexEscape)?;
427
428 Ok((hi * 16 + lo) as u8)
429}
430
431/// Interpret a unicode escape
432///
433/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
434/// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
435#[inline] // single use in Unescape::unescape_1
436fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
437 if chars.next() != Some('{') {
438 return Err(EscapeError::NoBraceInUnicodeEscape);
439 }
440
441 // First character must be a hexadecimal digit.
442 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
443 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
444 '}' => return Err(EscapeError::EmptyUnicodeEscape),
445 c => c
446 .to_digit(16)
447 .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
448 };
449
450 // First character is valid, now parse the rest of the number
451 // and closing brace.
452 let mut n_digits = 1;
453 loop {
454 match chars.next() {
455 None => return Err(EscapeError::UnclosedUnicodeEscape),
456 Some('_') => continue,
457 Some('}') => {
458 // Incorrect syntax has higher priority for error reporting
459 // than unallowed value for a literal.
460 return if n_digits > 6 {
461 Err(EscapeError::OverlongUnicodeEscape)
462 } else {
463 Ok(value)
464 };
465 }
466 Some(c) => {
467 let digit: u32 = c
468 .to_digit(16)
469 .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
470 n_digits += 1;
471 if n_digits > 6 {
472 // Stop updating value since we're sure that it's incorrect already.
473 continue;
474 }
475 value = value * 16 + digit;
476 }
477 };
478 }
479}
480
481/// Interpret a string continuation escape (https://doc.rust-lang.org/reference/expressions/literal-expr.html#string-continuation-escapes)
482///
483/// Skip ASCII whitespace, except for the formfeed character
484/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
485/// Warns on unescaped newline and following non-ASCII whitespace.
486#[inline] // single use in Unescape::unescape
487fn skip_ascii_whitespace(
488 chars: &mut Chars<'_>,
489 start: usize,
490 mut callback: impl FnMut(Range<usize>, EscapeError),
491) {
492 let rest: &str = chars.as_str();
493 let first_non_space: usize = rest
494 .bytes()
495 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
496 .unwrap_or(default:rest.len());
497 let (space: &str, rest: &str) = rest.split_at(mid:first_non_space);
498 // backslash newline adds 2 bytes
499 let end: usize = start + 2 + first_non_space;
500 if space.contains('\n') {
501 callback(start..end, EscapeError::MultipleSkippedLinesWarning);
502 }
503 *chars = rest.chars();
504 if let Some(c: char) = chars.clone().next() {
505 if c.is_whitespace() {
506 // for error reporting, include the character that was not skipped in the span
507 callback(
508 start..end + c.len_utf8(),
509 EscapeError::UnskippedWhitespaceWarning,
510 );
511 }
512 }
513}
514
515impl Unescape for str {
516 type Unit = char;
517
518 const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok('\0');
519
520 #[inline]
521 fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
522 b.get().into()
523 }
524
525 #[inline]
526 fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
527 Ok(c)
528 }
529
530 #[inline]
531 fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError> {
532 if b.is_ascii() {
533 Ok(b as char)
534 } else {
535 Err(EscapeError::OutOfRangeHexEscape)
536 }
537 }
538
539 #[inline]
540 fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
541 r
542 }
543}
544
545impl Unescape for [u8] {
546 type Unit = u8;
547
548 const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok(b'\0');
549
550 #[inline]
551 fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
552 b.get()
553 }
554
555 #[inline]
556 fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
557 char2byte(c)
558 }
559
560 #[inline]
561 fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError> {
562 Ok(b)
563 }
564
565 #[inline]
566 fn unicode2unit(_r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
567 Err(EscapeError::UnicodeEscapeInByte)
568 }
569}
570
571impl Unescape for CStr {
572 type Unit = MixedUnit;
573
574 const ZERO_RESULT: Result<Self::Unit, EscapeError> = Err(EscapeError::NulInCStr);
575
576 #[inline]
577 fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
578 b.into()
579 }
580
581 #[inline]
582 fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
583 c.try_into()
584 }
585
586 #[inline]
587 fn hex2unit(byte: u8) -> Result<Self::Unit, EscapeError> {
588 byte.try_into()
589 }
590
591 #[inline]
592 fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
593 Self::char2unit(r?)
594 }
595}
596
/// Enum of the different kinds of literal
///
/// The enum is fieldless, so besides `PartialEq` it also implements `Eq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// `'a'`
    Char,

    /// `b'a'`
    Byte,

    /// `"hello"`
    Str,
    /// `r"hello"`
    RawStr,

    /// `b"hello"`
    ByteStr,
    /// `br"hello"`
    RawByteStr,

    /// `c"hello"`
    CStr,
    /// `cr"hello"`
    RawCStr,
}
621
622impl Mode {
623 pub fn in_double_quotes(self) -> bool {
624 match self {
625 Mode::Str
626 | Mode::RawStr
627 | Mode::ByteStr
628 | Mode::RawByteStr
629 | Mode::CStr
630 | Mode::RawCStr => true,
631 Mode::Char | Mode::Byte => false,
632 }
633 }
634
635 pub fn prefix_noraw(self) -> &'static str {
636 match self {
637 Mode::Char | Mode::Str | Mode::RawStr => "",
638 Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
639 Mode::CStr | Mode::RawCStr => "c",
640 }
641 }
642}
643
644/// Check a literal only for errors
645///
646/// Takes the contents of a literal (without quotes)
647/// and produces a sequence of only errors,
648/// which are returned by invoking `error_callback`.
649///
650/// NB Does not produce any output other than errors
651pub fn check_for_errors(
652 src: &str,
653 mode: Mode,
654 mut error_callback: impl FnMut(Range<usize>, EscapeError),
655) {
656 match mode {
657 Mode::Char => {
658 let mut chars = src.chars();
659 if let Err(e) = str::unescape_single(&mut chars) {
660 error_callback(0..(src.len() - chars.as_str().len()), e);
661 }
662 }
663 Mode::Byte => {
664 let mut chars = src.chars();
665 if let Err(e) = <[u8]>::unescape_single(&mut chars) {
666 error_callback(0..(src.len() - chars.as_str().len()), e);
667 }
668 }
669 Mode::Str => unescape_str(src, |range, res| {
670 if let Err(e) = res {
671 error_callback(range, e);
672 }
673 }),
674 Mode::ByteStr => unescape_byte_str(src, |range, res| {
675 if let Err(e) = res {
676 error_callback(range, e);
677 }
678 }),
679 Mode::CStr => unescape_c_str(src, |range, res| {
680 if let Err(e) = res {
681 error_callback(range, e);
682 }
683 }),
684 Mode::RawStr => check_raw_str(src, |range, res| {
685 if let Err(e) = res {
686 error_callback(range, e);
687 }
688 }),
689 Mode::RawByteStr => check_raw_byte_str(src, |range, res| {
690 if let Err(e) = res {
691 error_callback(range, e);
692 }
693 }),
694 Mode::RawCStr => check_raw_c_str(src, |range, res| {
695 if let Err(e) = res {
696 error_callback(range, e);
697 }
698 }),
699 }
700}
701