1 | //! Utilities for validating string and char literals and turning them into |
2 | //! values they represent. |
3 | |
4 | use std::ops::Range; |
5 | use std::str::Chars; |
6 | |
7 | use Mode::*; |
8 | |
9 | #[cfg (test)] |
10 | mod tests; |
11 | |
/// Errors and warnings that can occur during string unescaping. They mostly
/// relate to malformed escape sequences, but there are a few that are about
/// other problems.
///
/// The two `*Warning` variants are non-fatal; use [`EscapeError::is_fatal`]
/// to tell real errors apart from warnings.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Expected 1 char, but 0 were found.
    ZeroChars,
    /// Expected 1 char, but more than 1 were found.
    MoreThanOneChar,

    /// Escaped '\' character without continuation.
    LoneSlash,
    /// Invalid escape character (e.g. '\z').
    InvalidEscape,
    /// Raw '\r' encountered.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    BareCarriageReturnInRawString,
    /// Unescaped character that was expected to be escaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// Numeric character escape is too short (e.g. '\x1').
    TooShortHexEscape,
    /// Invalid character in numeric escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// Character code in numeric escape is non-ascii (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal value in '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// '\u{}'
    EmptyUnicodeEscape,
    /// No closing brace in '\u{..}', e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// '\u{_12}'
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
    OverlongUnicodeEscape,
    /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

    /// `\0` in a C string literal.
    NulInCStr,

    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// After a line ending with '\', multiple lines are skipped.
    MultipleSkippedLinesWarning,
}
72 | |
73 | impl EscapeError { |
74 | /// Returns true for actual errors, as opposed to warnings. |
75 | pub fn is_fatal(&self) -> bool { |
76 | !matches!( |
77 | self, |
78 | EscapeError::UnskippedWhitespaceWarning | EscapeError::MultipleSkippedLinesWarning |
79 | ) |
80 | } |
81 | } |
82 | |
83 | /// Takes the contents of a unicode-only (non-mixed-utf8) literal (without |
84 | /// quotes) and produces a sequence of escaped characters or errors. |
85 | /// |
86 | /// Values are returned by invoking `callback`. For `Char` and `Byte` modes, |
87 | /// the callback will be called exactly once. |
88 | pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F) |
89 | where |
90 | F: FnMut(Range<usize>, Result<char, EscapeError>), |
91 | { |
92 | match mode { |
93 | Char | Byte => { |
94 | let mut chars: Chars<'_> = src.chars(); |
95 | let res: Result = unescape_char_or_byte(&mut chars, mode); |
96 | callback(0..(src.len() - chars.as_str().len()), res); |
97 | } |
98 | Str | ByteStr => unescape_non_raw_common(src, mode, callback), |
99 | RawStr | RawByteStr => check_raw_common(src, mode, callback), |
100 | RawCStr => check_raw_common(src, mode, &mut |r: Range, mut result: Result| { |
101 | if let Ok(' \0' ) = result { |
102 | result = Err(EscapeError::NulInCStr); |
103 | } |
104 | callback(r, result) |
105 | }), |
106 | CStr => unreachable!(), |
107 | } |
108 | } |
109 | |
/// Used for mixed utf8 string literals, i.e. those that allow both unicode
/// chars and high bytes.
///
/// Values of this type are produced by `unescape_mixed`.
pub enum MixedUnit {
    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
    /// and Unicode chars (written directly or via `\u` escapes).
    ///
    /// For example, if '¥' appears in a string it is represented here as
    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`.
    Char(char),

    /// Used for high bytes (`\x80`..`\xff`).
    ///
    /// For example, if `\xa5` appears in a string it is represented here as
    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
    /// byte string as the single byte `0xa5`.
    HighByte(u8),
}
128 | |
129 | impl From<char> for MixedUnit { |
130 | fn from(c: char) -> Self { |
131 | MixedUnit::Char(c) |
132 | } |
133 | } |
134 | |
135 | impl From<u8> for MixedUnit { |
136 | fn from(n: u8) -> Self { |
137 | if n.is_ascii() { |
138 | MixedUnit::Char(n as char) |
139 | } else { |
140 | MixedUnit::HighByte(n) |
141 | } |
142 | } |
143 | } |
144 | |
145 | /// Takes the contents of a mixed-utf8 literal (without quotes) and produces |
146 | /// a sequence of escaped characters or errors. |
147 | /// |
148 | /// Values are returned by invoking `callback`. |
149 | pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) |
150 | where |
151 | F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>), |
152 | { |
153 | match mode { |
154 | CStr => unescape_non_raw_common(src, mode, &mut |r: Range, mut result: Result| { |
155 | if let Ok(MixedUnit::Char(' \0' )) = result { |
156 | result = Err(EscapeError::NulInCStr); |
157 | } |
158 | callback(r, result) |
159 | }), |
160 | Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), |
161 | } |
162 | } |
163 | |
164 | /// Takes a contents of a char literal (without quotes), and returns an |
165 | /// unescaped char or an error. |
166 | pub fn unescape_char(src: &str) -> Result<char, EscapeError> { |
167 | unescape_char_or_byte(&mut src.chars(), mode:Char) |
168 | } |
169 | |
170 | /// Takes a contents of a byte literal (without quotes), and returns an |
171 | /// unescaped byte or an error. |
172 | pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> { |
173 | unescape_char_or_byte(&mut src.chars(), Byte).map(op:byte_from_char) |
174 | } |
175 | |
/// What kind of literal do we parse.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Mode {
    /// Char literal (no prefix, not in double quotes).
    Char,

    /// Byte literal (`b` prefix, not in double quotes).
    Byte,

    /// String literal (no prefix).
    Str,
    /// Raw string literal; no escapes are processed, the contents are only
    /// validated.
    RawStr,

    /// Byte string literal (`b` prefix).
    ByteStr,
    /// Raw byte string literal (`b` prefix); no escapes are processed, the
    /// contents are only validated.
    RawByteStr,

    /// C string literal (`c` prefix); the only mode accepted by
    /// `unescape_mixed`.
    CStr,
    /// Raw C string literal (`c` prefix); no escapes are processed, the
    /// contents are only validated.
    RawCStr,
}
192 | |
193 | impl Mode { |
194 | pub fn in_double_quotes(self) -> bool { |
195 | match self { |
196 | Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, |
197 | Char | Byte => false, |
198 | } |
199 | } |
200 | |
201 | /// Are `\x80`..`\xff` allowed? |
202 | fn allow_high_bytes(self) -> bool { |
203 | match self { |
204 | Char | Str => false, |
205 | Byte | ByteStr | CStr => true, |
206 | RawStr | RawByteStr | RawCStr => unreachable!(), |
207 | } |
208 | } |
209 | |
210 | /// Are unicode (non-ASCII) chars allowed? |
211 | #[inline ] |
212 | fn allow_unicode_chars(self) -> bool { |
213 | match self { |
214 | Byte | ByteStr | RawByteStr => false, |
215 | Char | Str | RawStr | CStr | RawCStr => true, |
216 | } |
217 | } |
218 | |
219 | /// Are unicode escapes (`\u`) allowed? |
220 | fn allow_unicode_escapes(self) -> bool { |
221 | match self { |
222 | Byte | ByteStr => false, |
223 | Char | Str | CStr => true, |
224 | RawByteStr | RawStr | RawCStr => unreachable!(), |
225 | } |
226 | } |
227 | |
228 | pub fn prefix_noraw(self) -> &'static str { |
229 | match self { |
230 | Char | Str | RawStr => "" , |
231 | Byte | ByteStr | RawByteStr => "b" , |
232 | CStr | RawCStr => "c" , |
233 | } |
234 | } |
235 | } |
236 | |
237 | fn scan_escape<T: From<char> + From<u8>>( |
238 | chars: &mut Chars<'_>, |
239 | mode: Mode, |
240 | ) -> Result<T, EscapeError> { |
241 | // Previous character was '\\', unescape what follows. |
242 | let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { |
243 | '"' => '"' , |
244 | 'n' => ' \n' , |
245 | 'r' => ' \r' , |
246 | 't' => ' \t' , |
247 | ' \\' => ' \\' , |
248 | ' \'' => ' \'' , |
249 | '0' => ' \0' , |
250 | 'x' => { |
251 | // Parse hexadecimal character code. |
252 | |
253 | let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; |
254 | let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; |
255 | |
256 | let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; |
257 | let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; |
258 | |
259 | let value = (hi * 16 + lo) as u8; |
260 | |
261 | return if !mode.allow_high_bytes() && !value.is_ascii() { |
262 | Err(EscapeError::OutOfRangeHexEscape) |
263 | } else { |
264 | // This may be a high byte, but that will only happen if `T` is |
265 | // `MixedUnit`, because of the `allow_high_bytes` check above. |
266 | Ok(T::from(value)) |
267 | }; |
268 | } |
269 | 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), |
270 | _ => return Err(EscapeError::InvalidEscape), |
271 | }; |
272 | Ok(T::from(res)) |
273 | } |
274 | |
275 | fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> { |
276 | // We've parsed '\u', now we have to parse '{..}'. |
277 | |
278 | if chars.next() != Some('{' ) { |
279 | return Err(EscapeError::NoBraceInUnicodeEscape); |
280 | } |
281 | |
282 | // First character must be a hexadecimal digit. |
283 | let mut n_digits = 1; |
284 | let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { |
285 | '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), |
286 | '}' => return Err(EscapeError::EmptyUnicodeEscape), |
287 | c => c |
288 | .to_digit(16) |
289 | .ok_or(EscapeError::InvalidCharInUnicodeEscape)?, |
290 | }; |
291 | |
292 | // First character is valid, now parse the rest of the number |
293 | // and closing brace. |
294 | loop { |
295 | match chars.next() { |
296 | None => return Err(EscapeError::UnclosedUnicodeEscape), |
297 | Some('_' ) => continue, |
298 | Some('}' ) => { |
299 | if n_digits > 6 { |
300 | return Err(EscapeError::OverlongUnicodeEscape); |
301 | } |
302 | |
303 | // Incorrect syntax has higher priority for error reporting |
304 | // than unallowed value for a literal. |
305 | if !allow_unicode_escapes { |
306 | return Err(EscapeError::UnicodeEscapeInByte); |
307 | } |
308 | |
309 | break std::char::from_u32(value).ok_or({ |
310 | if value > 0x10FFFF { |
311 | EscapeError::OutOfRangeUnicodeEscape |
312 | } else { |
313 | EscapeError::LoneSurrogateUnicodeEscape |
314 | } |
315 | }); |
316 | } |
317 | Some(c) => { |
318 | let digit: u32 = c |
319 | .to_digit(16) |
320 | .ok_or(EscapeError::InvalidCharInUnicodeEscape)?; |
321 | n_digits += 1; |
322 | if n_digits > 6 { |
323 | // Stop updating value since we're sure that it's incorrect already. |
324 | continue; |
325 | } |
326 | value = value * 16 + digit; |
327 | } |
328 | }; |
329 | } |
330 | } |
331 | |
332 | #[inline ] |
333 | fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> { |
334 | if allow_unicode_chars || c.is_ascii() { |
335 | Ok(c) |
336 | } else { |
337 | Err(EscapeError::NonAsciiCharInByte) |
338 | } |
339 | } |
340 | |
341 | fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { |
342 | let c: char = chars.next().ok_or(err:EscapeError::ZeroChars)?; |
343 | let res: char = match c { |
344 | ' \\' => scan_escape(chars, mode), |
345 | ' \n' | ' \t' | ' \'' => Err(EscapeError::EscapeOnlyChar), |
346 | ' \r' => Err(EscapeError::BareCarriageReturn), |
347 | _ => ascii_check(c, mode.allow_unicode_chars()), |
348 | }?; |
349 | if chars.next().is_some() { |
350 | return Err(EscapeError::MoreThanOneChar); |
351 | } |
352 | Ok(res) |
353 | } |
354 | |
355 | /// Takes a contents of a string literal (without quotes) and produces a |
356 | /// sequence of escaped characters or errors. |
357 | fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F) |
358 | where |
359 | F: FnMut(Range<usize>, Result<T, EscapeError>), |
360 | { |
361 | let mut chars = src.chars(); |
362 | let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop |
363 | |
364 | // The `start` and `end` computation here is complicated because |
365 | // `skip_ascii_whitespace` makes us to skip over chars without counting |
366 | // them in the range computation. |
367 | while let Some(c) = chars.next() { |
368 | let start = src.len() - chars.as_str().len() - c.len_utf8(); |
369 | let res = match c { |
370 | ' \\' => { |
371 | match chars.clone().next() { |
372 | Some(' \n' ) => { |
373 | // Rust language specification requires us to skip whitespaces |
374 | // if unescaped '\' character is followed by '\n'. |
375 | // For details see [Rust language reference] |
376 | // (https://doc.rust-lang.org/reference/tokens.html#string-literals). |
377 | skip_ascii_whitespace(&mut chars, start, &mut |range, err| { |
378 | callback(range, Err(err)) |
379 | }); |
380 | continue; |
381 | } |
382 | _ => scan_escape::<T>(&mut chars, mode), |
383 | } |
384 | } |
385 | '"' => Err(EscapeError::EscapeOnlyChar), |
386 | ' \r' => Err(EscapeError::BareCarriageReturn), |
387 | _ => ascii_check(c, allow_unicode_chars).map(T::from), |
388 | }; |
389 | let end = src.len() - chars.as_str().len(); |
390 | callback(start..end, res); |
391 | } |
392 | } |
393 | |
394 | fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F) |
395 | where |
396 | F: FnMut(Range<usize>, EscapeError), |
397 | { |
398 | let tail: &str = chars.as_str(); |
399 | let first_non_space: usize = tail |
400 | .bytes() |
401 | .position(|b| b != b' ' && b != b' \t' && b != b' \n' && b != b' \r' ) |
402 | .unwrap_or(default:tail.len()); |
403 | if tail[1..first_non_space].contains(' \n' ) { |
404 | // The +1 accounts for the escaping slash. |
405 | let end: usize = start + first_non_space + 1; |
406 | callback(start..end, EscapeError::MultipleSkippedLinesWarning); |
407 | } |
408 | let tail: &str = &tail[first_non_space..]; |
409 | if let Some(c: char) = tail.chars().next() { |
410 | if c.is_whitespace() { |
411 | // For error reporting, we would like the span to contain the character that was not |
412 | // skipped. The +1 is necessary to account for the leading \ that started the escape. |
413 | let end: usize = start + first_non_space + c.len_utf8() + 1; |
414 | callback(start..end, EscapeError::UnskippedWhitespaceWarning); |
415 | } |
416 | } |
417 | *chars = tail.chars(); |
418 | } |
419 | |
420 | /// Takes a contents of a string literal (without quotes) and produces a |
421 | /// sequence of characters or errors. |
422 | /// NOTE: Raw strings do not perform any explicit character escaping, here we |
423 | /// only produce errors on bare CR. |
424 | fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) |
425 | where |
426 | F: FnMut(Range<usize>, Result<char, EscapeError>), |
427 | { |
428 | let mut chars: Chars<'_> = src.chars(); |
429 | let allow_unicode_chars: bool = mode.allow_unicode_chars(); // get this outside the loop |
430 | |
431 | // The `start` and `end` computation here matches the one in |
432 | // `unescape_non_raw_common` for consistency, even though this function |
433 | // doesn't have to worry about skipping any chars. |
434 | while let Some(c: char) = chars.next() { |
435 | let start: usize = src.len() - chars.as_str().len() - c.len_utf8(); |
436 | let res: Result = match c { |
437 | ' \r' => Err(EscapeError::BareCarriageReturnInRawString), |
438 | _ => ascii_check(c, allow_unicode_chars), |
439 | }; |
440 | let end: usize = src.len() - chars.as_str().len(); |
441 | callback(start..end, res); |
442 | } |
443 | } |
444 | |
/// Converts a `char` that is known to fit in one byte into that byte.
///
/// In debug builds this asserts that the code point does not exceed
/// `u8::MAX`; in release builds a larger value is truncated by the cast.
#[inline]
pub fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    debug_assert!(code_point <= u32::from(u8::MAX), "guaranteed because of ByteStr");
    code_point as u8
}
451 | |