1//! Utilities for validating string and char literals and turning them into
2//! values they represent.
3
4use std::ops::Range;
5use std::str::Chars;
6
7use Mode::*;
8
9#[cfg(test)]
10mod tests;
11
/// Everything that can go wrong (or merely look suspicious) while unescaping
/// a literal. Most variants describe a malformed escape sequence; a few
/// describe other problems, and two are warnings rather than errors (see
/// [`EscapeError::is_fatal`]).
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Expected 1 char, but 0 were found.
    ZeroChars,
    /// Expected 1 char, but more than 1 were found.
    MoreThanOneChar,

    /// Escaped '\' character without continuation.
    LoneSlash,
    /// Invalid escape character (e.g. '\z').
    InvalidEscape,
    /// Raw '\r' encountered.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    BareCarriageReturnInRawString,
    /// Unescaped character that was expected to be escaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// Numeric character escape is too short (e.g. '\x1').
    TooShortHexEscape,
    /// Invalid character in numeric escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// Character code in numeric escape is non-ascii (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal value in '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// '\u{}'
    EmptyUnicodeEscape,
    /// No closing brace in '\u{..}', e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// '\u{_12}'
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
    OverlongUnicodeEscape,
    /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

    /// `\0` in a C string literal.
    NulInCStr,

    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// After a line ending with '\', multiple lines are skipped.
    MultipleSkippedLinesWarning,
}

impl EscapeError {
    /// Reports whether this value is a hard error rather than a warning.
    ///
    /// Only `UnskippedWhitespaceWarning` and `MultipleSkippedLinesWarning`
    /// are warnings; every other variant is fatal.
    pub fn is_fatal(&self) -> bool {
        let is_warning = matches!(
            self,
            Self::UnskippedWhitespaceWarning | Self::MultipleSkippedLinesWarning
        );
        !is_warning
    }
}
82
83/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
84/// quotes) and produces a sequence of escaped characters or errors.
85///
86/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
87/// the callback will be called exactly once.
88pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
89where
90 F: FnMut(Range<usize>, Result<char, EscapeError>),
91{
92 match mode {
93 Char | Byte => {
94 let mut chars: Chars<'_> = src.chars();
95 let res: Result = unescape_char_or_byte(&mut chars, mode);
96 callback(0..(src.len() - chars.as_str().len()), res);
97 }
98 Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99 RawStr | RawByteStr => check_raw_common(src, mode, callback),
100 RawCStr => check_raw_common(src, mode, &mut |r: Range, mut result: Result| {
101 if let Ok('\0') = result {
102 result = Err(EscapeError::NulInCStr);
103 }
104 callback(r, result)
105 }),
106 CStr => unreachable!(),
107 }
108}
109
/// One unit of a mixed utf8 string literal, i.e. a literal that allows both
/// unicode chars and high bytes.
pub enum MixedUnit {
    /// An ASCII char (written directly or via a `\x00`..`\x7f` escape) or a
    /// Unicode char (written directly or via a `\u` escape).
    ///
    /// For example, if '¥' appears in a string it is represented here as
    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
    Char(char),

    /// A high byte (`\x80`..`\xff`).
    ///
    /// For example, if `\xa5` appears in a string it is represented here as
    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
    /// byte string as the single byte `0xa5`.
    HighByte(u8),
}

impl From<char> for MixedUnit {
    /// Wraps any char as `MixedUnit::Char`.
    fn from(ch: char) -> Self {
        Self::Char(ch)
    }
}

impl From<u8> for MixedUnit {
    /// Maps ASCII bytes (`0x00..=0x7f`) to `MixedUnit::Char` and all other
    /// bytes to `MixedUnit::HighByte`.
    fn from(byte: u8) -> Self {
        match byte {
            0x00..=0x7f => Self::Char(char::from(byte)),
            _ => Self::HighByte(byte),
        }
    }
}
144
145/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
146/// a sequence of escaped characters or errors.
147///
148/// Values are returned by invoking `callback`.
149pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
150where
151 F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
152{
153 match mode {
154 CStr => unescape_non_raw_common(src, mode, &mut |r: Range, mut result: Result| {
155 if let Ok(MixedUnit::Char('\0')) = result {
156 result = Err(EscapeError::NulInCStr);
157 }
158 callback(r, result)
159 }),
160 Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
161 }
162}
163
164/// Takes a contents of a char literal (without quotes), and returns an
165/// unescaped char or an error.
166pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
167 unescape_char_or_byte(&mut src.chars(), mode:Char)
168}
169
170/// Takes a contents of a byte literal (without quotes), and returns an
171/// unescaped byte or an error.
172pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
173 unescape_char_or_byte(&mut src.chars(), Byte).map(op:byte_from_char)
174}
175
/// The kind of literal being parsed.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Mode {
    Char,

    Byte,

    Str,
    RawStr,

    ByteStr,
    RawByteStr,

    CStr,
    RawCStr,
}

impl Mode {
    /// Returns `true` for every mode except `Char` and `Byte`.
    pub fn in_double_quotes(self) -> bool {
        match self {
            Self::Char | Self::Byte => false,
            Self::Str
            | Self::RawStr
            | Self::ByteStr
            | Self::RawByteStr
            | Self::CStr
            | Self::RawCStr => true,
        }
    }

    /// Are `\x80`..`\xff` allowed?
    ///
    /// # Panics
    ///
    /// Panics for the raw modes (`RawStr`, `RawByteStr`, `RawCStr`).
    fn allow_high_bytes(self) -> bool {
        match self {
            Self::Byte | Self::ByteStr | Self::CStr => true,
            Self::Char | Self::Str => false,
            Self::RawStr | Self::RawByteStr | Self::RawCStr => unreachable!(),
        }
    }

    /// Are unicode (non-ASCII) chars allowed?
    #[inline]
    fn allow_unicode_chars(self) -> bool {
        match self {
            Self::Char | Self::Str | Self::RawStr | Self::CStr | Self::RawCStr => true,
            Self::Byte | Self::ByteStr | Self::RawByteStr => false,
        }
    }

    /// Are unicode escapes (`\u`) allowed?
    ///
    /// # Panics
    ///
    /// Panics for the raw modes (`RawStr`, `RawByteStr`, `RawCStr`).
    fn allow_unicode_escapes(self) -> bool {
        match self {
            Self::Char | Self::Str | Self::CStr => true,
            Self::Byte | Self::ByteStr => false,
            Self::RawByteStr | Self::RawStr | Self::RawCStr => unreachable!(),
        }
    }

    /// Returns the literal's prefix with any raw marker left out: `"b"` for
    /// the byte modes, `"c"` for the C string modes, and `""` otherwise.
    pub fn prefix_noraw(self) -> &'static str {
        match self {
            Self::CStr | Self::RawCStr => "c",
            Self::Byte | Self::ByteStr | Self::RawByteStr => "b",
            Self::Char | Self::Str | Self::RawStr => "",
        }
    }
}
236
237fn scan_escape<T: From<char> + From<u8>>(
238 chars: &mut Chars<'_>,
239 mode: Mode,
240) -> Result<T, EscapeError> {
241 // Previous character was '\\', unescape what follows.
242 let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
243 '"' => '"',
244 'n' => '\n',
245 'r' => '\r',
246 't' => '\t',
247 '\\' => '\\',
248 '\'' => '\'',
249 '0' => '\0',
250 'x' => {
251 // Parse hexadecimal character code.
252
253 let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
254 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
255
256 let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
257 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
258
259 let value = (hi * 16 + lo) as u8;
260
261 return if !mode.allow_high_bytes() && !value.is_ascii() {
262 Err(EscapeError::OutOfRangeHexEscape)
263 } else {
264 // This may be a high byte, but that will only happen if `T` is
265 // `MixedUnit`, because of the `allow_high_bytes` check above.
266 Ok(T::from(value))
267 };
268 }
269 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
270 _ => return Err(EscapeError::InvalidEscape),
271 };
272 Ok(T::from(res))
273}
274
275fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
276 // We've parsed '\u', now we have to parse '{..}'.
277
278 if chars.next() != Some('{') {
279 return Err(EscapeError::NoBraceInUnicodeEscape);
280 }
281
282 // First character must be a hexadecimal digit.
283 let mut n_digits = 1;
284 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
285 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
286 '}' => return Err(EscapeError::EmptyUnicodeEscape),
287 c => c
288 .to_digit(16)
289 .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
290 };
291
292 // First character is valid, now parse the rest of the number
293 // and closing brace.
294 loop {
295 match chars.next() {
296 None => return Err(EscapeError::UnclosedUnicodeEscape),
297 Some('_') => continue,
298 Some('}') => {
299 if n_digits > 6 {
300 return Err(EscapeError::OverlongUnicodeEscape);
301 }
302
303 // Incorrect syntax has higher priority for error reporting
304 // than unallowed value for a literal.
305 if !allow_unicode_escapes {
306 return Err(EscapeError::UnicodeEscapeInByte);
307 }
308
309 break std::char::from_u32(value).ok_or({
310 if value > 0x10FFFF {
311 EscapeError::OutOfRangeUnicodeEscape
312 } else {
313 EscapeError::LoneSurrogateUnicodeEscape
314 }
315 });
316 }
317 Some(c) => {
318 let digit: u32 = c
319 .to_digit(16)
320 .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
321 n_digits += 1;
322 if n_digits > 6 {
323 // Stop updating value since we're sure that it's incorrect already.
324 continue;
325 }
326 value = value * 16 + digit;
327 }
328 };
329 }
330}
331
332#[inline]
333fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
334 if allow_unicode_chars || c.is_ascii() {
335 Ok(c)
336 } else {
337 Err(EscapeError::NonAsciiCharInByte)
338 }
339}
340
341fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
342 let c: char = chars.next().ok_or(err:EscapeError::ZeroChars)?;
343 let res: char = match c {
344 '\\' => scan_escape(chars, mode),
345 '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
346 '\r' => Err(EscapeError::BareCarriageReturn),
347 _ => ascii_check(c, mode.allow_unicode_chars()),
348 }?;
349 if chars.next().is_some() {
350 return Err(EscapeError::MoreThanOneChar);
351 }
352 Ok(res)
353}
354
355/// Takes a contents of a string literal (without quotes) and produces a
356/// sequence of escaped characters or errors.
357fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
358where
359 F: FnMut(Range<usize>, Result<T, EscapeError>),
360{
361 let mut chars = src.chars();
362 let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
363
364 // The `start` and `end` computation here is complicated because
365 // `skip_ascii_whitespace` makes us to skip over chars without counting
366 // them in the range computation.
367 while let Some(c) = chars.next() {
368 let start = src.len() - chars.as_str().len() - c.len_utf8();
369 let res = match c {
370 '\\' => {
371 match chars.clone().next() {
372 Some('\n') => {
373 // Rust language specification requires us to skip whitespaces
374 // if unescaped '\' character is followed by '\n'.
375 // For details see [Rust language reference]
376 // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
377 skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
378 callback(range, Err(err))
379 });
380 continue;
381 }
382 _ => scan_escape::<T>(&mut chars, mode),
383 }
384 }
385 '"' => Err(EscapeError::EscapeOnlyChar),
386 '\r' => Err(EscapeError::BareCarriageReturn),
387 _ => ascii_check(c, allow_unicode_chars).map(T::from),
388 };
389 let end = src.len() - chars.as_str().len();
390 callback(start..end, res);
391 }
392}
393
394fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
395where
396 F: FnMut(Range<usize>, EscapeError),
397{
398 let tail: &str = chars.as_str();
399 let first_non_space: usize = tail
400 .bytes()
401 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
402 .unwrap_or(default:tail.len());
403 if tail[1..first_non_space].contains('\n') {
404 // The +1 accounts for the escaping slash.
405 let end: usize = start + first_non_space + 1;
406 callback(start..end, EscapeError::MultipleSkippedLinesWarning);
407 }
408 let tail: &str = &tail[first_non_space..];
409 if let Some(c: char) = tail.chars().next() {
410 if c.is_whitespace() {
411 // For error reporting, we would like the span to contain the character that was not
412 // skipped. The +1 is necessary to account for the leading \ that started the escape.
413 let end: usize = start + first_non_space + c.len_utf8() + 1;
414 callback(start..end, EscapeError::UnskippedWhitespaceWarning);
415 }
416 }
417 *chars = tail.chars();
418}
419
420/// Takes a contents of a string literal (without quotes) and produces a
421/// sequence of characters or errors.
422/// NOTE: Raw strings do not perform any explicit character escaping, here we
423/// only produce errors on bare CR.
424fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
425where
426 F: FnMut(Range<usize>, Result<char, EscapeError>),
427{
428 let mut chars: Chars<'_> = src.chars();
429 let allow_unicode_chars: bool = mode.allow_unicode_chars(); // get this outside the loop
430
431 // The `start` and `end` computation here matches the one in
432 // `unescape_non_raw_common` for consistency, even though this function
433 // doesn't have to worry about skipping any chars.
434 while let Some(c: char) = chars.next() {
435 let start: usize = src.len() - chars.as_str().len() - c.len_utf8();
436 let res: Result = match c {
437 '\r' => Err(EscapeError::BareCarriageReturnInRawString),
438 _ => ascii_check(c, allow_unicode_chars),
439 };
440 let end: usize = src.len() - chars.as_str().len();
441 callback(start..end, res);
442 }
443}
444
/// Narrows a char to a byte by truncating its code point to `u8`.
///
/// Builds with debug assertions enabled check that the code point fits in a
/// `u8`.
#[inline]
pub fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    debug_assert!(code_point <= u32::from(u8::MAX), "guaranteed because of ByteStr");
    code_point as u8
}
451