//! Utilities for validating string and char literals and turning them into
//! the values they represent.
| 3 | |
| 4 | use std::ops::Range; |
| 5 | use std::str::Chars; |
| 6 | |
| 7 | use Mode::*; |
| 8 | |
| 9 | #[cfg (test)] |
| 10 | mod tests; |
| 11 | |
/// Errors and warnings that can occur during string unescaping. They mostly
/// relate to malformed escape sequences, but there are a few that are about
/// other problems.
///
/// Use [`EscapeError::is_fatal`] to tell real errors apart from warnings.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Expected 1 char, but 0 were found.
    ZeroChars,
    /// Expected 1 char, but more than 1 were found.
    MoreThanOneChar,

    /// Escaped '\' character without continuation.
    LoneSlash,
    /// Invalid escape character (e.g. '\z').
    InvalidEscape,
    /// Raw '\r' encountered.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    BareCarriageReturnInRawString,
    /// Unescaped character that was expected to be escaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// Numeric character escape is too short (e.g. '\x1').
    TooShortHexEscape,
    /// Invalid character in numeric escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// Character code in numeric escape is non-ascii (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal value in '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// '\u{}'
    EmptyUnicodeEscape,
    /// No closing brace in '\u{..}', e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// '\u{_12}'
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
    OverlongUnicodeEscape,
    /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

    /// `\0` in a C string literal.
    NulInCStr,

    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// After a line ending with '\', multiple lines are skipped.
    MultipleSkippedLinesWarning,
}

impl EscapeError {
    /// Returns true for actual errors, as opposed to warnings.
    ///
    /// Only [`EscapeError::UnskippedWhitespaceWarning`] and
    /// [`EscapeError::MultipleSkippedLinesWarning`] are non-fatal.
    pub fn is_fatal(&self) -> bool {
        !matches!(
            self,
            EscapeError::UnskippedWhitespaceWarning | EscapeError::MultipleSkippedLinesWarning
        )
    }
}
| 82 | |
| 83 | /// Takes the contents of a unicode-only (non-mixed-utf8) literal (without |
| 84 | /// quotes) and produces a sequence of escaped characters or errors. |
| 85 | /// |
| 86 | /// Values are returned by invoking `callback`. For `Char` and `Byte` modes, |
| 87 | /// the callback will be called exactly once. |
| 88 | pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F) |
| 89 | where |
| 90 | F: FnMut(Range<usize>, Result<char, EscapeError>), |
| 91 | { |
| 92 | match mode { |
| 93 | Char | Byte => { |
| 94 | let mut chars: Chars<'_> = src.chars(); |
| 95 | let res: Result = unescape_char_or_byte(&mut chars, mode); |
| 96 | callback(0..(src.len() - chars.as_str().len()), res); |
| 97 | } |
| 98 | Str | ByteStr => unescape_non_raw_common(src, mode, callback), |
| 99 | RawStr | RawByteStr => check_raw_common(src, mode, callback), |
| 100 | RawCStr => check_raw_common(src, mode, &mut |r: Range, mut result: Result| { |
| 101 | if let Ok(' \0' ) = result { |
| 102 | result = Err(EscapeError::NulInCStr); |
| 103 | } |
| 104 | callback(r, result) |
| 105 | }), |
| 106 | CStr => unreachable!(), |
| 107 | } |
| 108 | } |
| 109 | |
/// Used for mixed utf8 string literals, i.e. those that allow both unicode
/// chars and high bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MixedUnit {
    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
    /// and Unicode chars (written directly or via `\u` escapes).
    ///
    /// For example, if '¥' appears in a string it is represented here as
    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
    Char(char),

    /// Used for high bytes (`\x80`..`\xff`).
    ///
    /// For example, if `\xa5` appears in a string it is represented here as
    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
    /// byte string as the single byte `0xa5`.
    HighByte(u8),
}

impl From<char> for MixedUnit {
    /// Wraps any `char` as [`MixedUnit::Char`].
    fn from(c: char) -> Self {
        MixedUnit::Char(c)
    }
}

impl From<u8> for MixedUnit {
    /// Maps ASCII bytes to [`MixedUnit::Char`] and every other byte to
    /// [`MixedUnit::HighByte`].
    fn from(n: u8) -> Self {
        if n.is_ascii() {
            MixedUnit::Char(n as char)
        } else {
            MixedUnit::HighByte(n)
        }
    }
}
| 144 | |
| 145 | /// Takes the contents of a mixed-utf8 literal (without quotes) and produces |
| 146 | /// a sequence of escaped characters or errors. |
| 147 | /// |
| 148 | /// Values are returned by invoking `callback`. |
| 149 | pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) |
| 150 | where |
| 151 | F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>), |
| 152 | { |
| 153 | match mode { |
| 154 | CStr => unescape_non_raw_common(src, mode, &mut |r: Range, mut result: Result| { |
| 155 | if let Ok(MixedUnit::Char(' \0' )) = result { |
| 156 | result = Err(EscapeError::NulInCStr); |
| 157 | } |
| 158 | callback(r, result) |
| 159 | }), |
| 160 | Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), |
| 161 | } |
| 162 | } |
| 163 | |
| 164 | /// Takes a contents of a char literal (without quotes), and returns an |
| 165 | /// unescaped char or an error. |
| 166 | pub fn unescape_char(src: &str) -> Result<char, EscapeError> { |
| 167 | unescape_char_or_byte(&mut src.chars(), mode:Char) |
| 168 | } |
| 169 | |
| 170 | /// Takes a contents of a byte literal (without quotes), and returns an |
| 171 | /// unescaped byte or an error. |
| 172 | pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> { |
| 173 | unescape_char_or_byte(&mut src.chars(), Byte).map(op:byte_from_char) |
| 174 | } |
| 175 | |
/// What kind of literal do we parse.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// Char literal.
    Char,

    /// Byte literal.
    Byte,

    /// Non-raw string literal without a `b`/`c` prefix.
    Str,
    /// Raw string literal without a `b`/`c` prefix.
    RawStr,

    /// Non-raw byte string literal.
    ByteStr,
    /// Raw byte string literal.
    RawByteStr,

    /// Non-raw C string literal.
    CStr,
    /// Raw C string literal.
    RawCStr,
}

impl Mode {
    /// Returns `true` for every string-like mode and `false` for `Char` and
    /// `Byte`.
    pub fn in_double_quotes(self) -> bool {
        match self {
            Self::Str
            | Self::RawStr
            | Self::ByteStr
            | Self::RawByteStr
            | Self::CStr
            | Self::RawCStr => true,
            Self::Char | Self::Byte => false,
        }
    }

    /// Are `\x80`..`\xff` allowed?
    ///
    /// Panics for raw modes, which never process escapes.
    fn allow_high_bytes(self) -> bool {
        match self {
            Self::Char | Self::Str => false,
            Self::Byte | Self::ByteStr | Self::CStr => true,
            Self::RawStr | Self::RawByteStr | Self::RawCStr => unreachable!(),
        }
    }

    /// Are unicode (non-ASCII) chars allowed?
    #[inline]
    fn allow_unicode_chars(self) -> bool {
        match self {
            Self::Byte | Self::ByteStr | Self::RawByteStr => false,
            Self::Char | Self::Str | Self::RawStr | Self::CStr | Self::RawCStr => true,
        }
    }

    /// Are unicode escapes (`\u`) allowed?
    ///
    /// Panics for raw modes, which never process escapes.
    fn allow_unicode_escapes(self) -> bool {
        match self {
            Self::Byte | Self::ByteStr => false,
            Self::Char | Self::Str | Self::CStr => true,
            Self::RawByteStr | Self::RawStr | Self::RawCStr => unreachable!(),
        }
    }

    /// Returns the literal prefix without any raw marker: `""`, `"b"` or
    /// `"c"`.
    pub fn prefix_noraw(self) -> &'static str {
        match self {
            Self::Char | Self::Str | Self::RawStr => "",
            Self::Byte | Self::ByteStr | Self::RawByteStr => "b",
            Self::CStr | Self::RawCStr => "c",
        }
    }
}
| 236 | |
| 237 | fn scan_escape<T: From<char> + From<u8>>( |
| 238 | chars: &mut Chars<'_>, |
| 239 | mode: Mode, |
| 240 | ) -> Result<T, EscapeError> { |
| 241 | // Previous character was '\\', unescape what follows. |
| 242 | let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { |
| 243 | '"' => '"' , |
| 244 | 'n' => ' \n' , |
| 245 | 'r' => ' \r' , |
| 246 | 't' => ' \t' , |
| 247 | ' \\' => ' \\' , |
| 248 | ' \'' => ' \'' , |
| 249 | '0' => ' \0' , |
| 250 | 'x' => { |
| 251 | // Parse hexadecimal character code. |
| 252 | |
| 253 | let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; |
| 254 | let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; |
| 255 | |
| 256 | let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; |
| 257 | let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; |
| 258 | |
| 259 | let value = (hi * 16 + lo) as u8; |
| 260 | |
| 261 | return if !mode.allow_high_bytes() && !value.is_ascii() { |
| 262 | Err(EscapeError::OutOfRangeHexEscape) |
| 263 | } else { |
| 264 | // This may be a high byte, but that will only happen if `T` is |
| 265 | // `MixedUnit`, because of the `allow_high_bytes` check above. |
| 266 | Ok(T::from(value)) |
| 267 | }; |
| 268 | } |
| 269 | 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), |
| 270 | _ => return Err(EscapeError::InvalidEscape), |
| 271 | }; |
| 272 | Ok(T::from(res)) |
| 273 | } |
| 274 | |
| 275 | fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> { |
| 276 | // We've parsed '\u', now we have to parse '{..}'. |
| 277 | |
| 278 | if chars.next() != Some('{' ) { |
| 279 | return Err(EscapeError::NoBraceInUnicodeEscape); |
| 280 | } |
| 281 | |
| 282 | // First character must be a hexadecimal digit. |
| 283 | let mut n_digits = 1; |
| 284 | let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { |
| 285 | '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), |
| 286 | '}' => return Err(EscapeError::EmptyUnicodeEscape), |
| 287 | c => c |
| 288 | .to_digit(16) |
| 289 | .ok_or(EscapeError::InvalidCharInUnicodeEscape)?, |
| 290 | }; |
| 291 | |
| 292 | // First character is valid, now parse the rest of the number |
| 293 | // and closing brace. |
| 294 | loop { |
| 295 | match chars.next() { |
| 296 | None => return Err(EscapeError::UnclosedUnicodeEscape), |
| 297 | Some('_' ) => continue, |
| 298 | Some('}' ) => { |
| 299 | if n_digits > 6 { |
| 300 | return Err(EscapeError::OverlongUnicodeEscape); |
| 301 | } |
| 302 | |
| 303 | // Incorrect syntax has higher priority for error reporting |
| 304 | // than unallowed value for a literal. |
| 305 | if !allow_unicode_escapes { |
| 306 | return Err(EscapeError::UnicodeEscapeInByte); |
| 307 | } |
| 308 | |
| 309 | break std::char::from_u32(value).ok_or({ |
| 310 | if value > 0x10FFFF { |
| 311 | EscapeError::OutOfRangeUnicodeEscape |
| 312 | } else { |
| 313 | EscapeError::LoneSurrogateUnicodeEscape |
| 314 | } |
| 315 | }); |
| 316 | } |
| 317 | Some(c) => { |
| 318 | let digit: u32 = c |
| 319 | .to_digit(16) |
| 320 | .ok_or(EscapeError::InvalidCharInUnicodeEscape)?; |
| 321 | n_digits += 1; |
| 322 | if n_digits > 6 { |
| 323 | // Stop updating value since we're sure that it's incorrect already. |
| 324 | continue; |
| 325 | } |
| 326 | value = value * 16 + digit; |
| 327 | } |
| 328 | }; |
| 329 | } |
| 330 | } |
| 331 | |
| 332 | #[inline ] |
| 333 | fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> { |
| 334 | if allow_unicode_chars || c.is_ascii() { |
| 335 | Ok(c) |
| 336 | } else { |
| 337 | Err(EscapeError::NonAsciiCharInByte) |
| 338 | } |
| 339 | } |
| 340 | |
| 341 | fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { |
| 342 | let c: char = chars.next().ok_or(err:EscapeError::ZeroChars)?; |
| 343 | let res: char = match c { |
| 344 | ' \\' => scan_escape(chars, mode), |
| 345 | ' \n' | ' \t' | ' \'' => Err(EscapeError::EscapeOnlyChar), |
| 346 | ' \r' => Err(EscapeError::BareCarriageReturn), |
| 347 | _ => ascii_check(c, mode.allow_unicode_chars()), |
| 348 | }?; |
| 349 | if chars.next().is_some() { |
| 350 | return Err(EscapeError::MoreThanOneChar); |
| 351 | } |
| 352 | Ok(res) |
| 353 | } |
| 354 | |
| 355 | /// Takes a contents of a string literal (without quotes) and produces a |
| 356 | /// sequence of escaped characters or errors. |
| 357 | fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F) |
| 358 | where |
| 359 | F: FnMut(Range<usize>, Result<T, EscapeError>), |
| 360 | { |
| 361 | let mut chars = src.chars(); |
| 362 | let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop |
| 363 | |
| 364 | // The `start` and `end` computation here is complicated because |
| 365 | // `skip_ascii_whitespace` makes us to skip over chars without counting |
| 366 | // them in the range computation. |
| 367 | while let Some(c) = chars.next() { |
| 368 | let start = src.len() - chars.as_str().len() - c.len_utf8(); |
| 369 | let res = match c { |
| 370 | ' \\' => { |
| 371 | match chars.clone().next() { |
| 372 | Some(' \n' ) => { |
| 373 | // Rust language specification requires us to skip whitespaces |
| 374 | // if unescaped '\' character is followed by '\n'. |
| 375 | // For details see [Rust language reference] |
| 376 | // (https://doc.rust-lang.org/reference/tokens.html#string-literals). |
| 377 | skip_ascii_whitespace(&mut chars, start, &mut |range, err| { |
| 378 | callback(range, Err(err)) |
| 379 | }); |
| 380 | continue; |
| 381 | } |
| 382 | _ => scan_escape::<T>(&mut chars, mode), |
| 383 | } |
| 384 | } |
| 385 | '"' => Err(EscapeError::EscapeOnlyChar), |
| 386 | ' \r' => Err(EscapeError::BareCarriageReturn), |
| 387 | _ => ascii_check(c, allow_unicode_chars).map(T::from), |
| 388 | }; |
| 389 | let end = src.len() - chars.as_str().len(); |
| 390 | callback(start..end, res); |
| 391 | } |
| 392 | } |
| 393 | |
| 394 | fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F) |
| 395 | where |
| 396 | F: FnMut(Range<usize>, EscapeError), |
| 397 | { |
| 398 | let tail: &str = chars.as_str(); |
| 399 | let first_non_space: usize = tail |
| 400 | .bytes() |
| 401 | .position(|b| b != b' ' && b != b' \t' && b != b' \n' && b != b' \r' ) |
| 402 | .unwrap_or(default:tail.len()); |
| 403 | if tail[1..first_non_space].contains(' \n' ) { |
| 404 | // The +1 accounts for the escaping slash. |
| 405 | let end: usize = start + first_non_space + 1; |
| 406 | callback(start..end, EscapeError::MultipleSkippedLinesWarning); |
| 407 | } |
| 408 | let tail: &str = &tail[first_non_space..]; |
| 409 | if let Some(c: char) = tail.chars().next() { |
| 410 | if c.is_whitespace() { |
| 411 | // For error reporting, we would like the span to contain the character that was not |
| 412 | // skipped. The +1 is necessary to account for the leading \ that started the escape. |
| 413 | let end: usize = start + first_non_space + c.len_utf8() + 1; |
| 414 | callback(start..end, EscapeError::UnskippedWhitespaceWarning); |
| 415 | } |
| 416 | } |
| 417 | *chars = tail.chars(); |
| 418 | } |
| 419 | |
| 420 | /// Takes a contents of a string literal (without quotes) and produces a |
| 421 | /// sequence of characters or errors. |
| 422 | /// NOTE: Raw strings do not perform any explicit character escaping, here we |
| 423 | /// only produce errors on bare CR. |
| 424 | fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) |
| 425 | where |
| 426 | F: FnMut(Range<usize>, Result<char, EscapeError>), |
| 427 | { |
| 428 | let mut chars: Chars<'_> = src.chars(); |
| 429 | let allow_unicode_chars: bool = mode.allow_unicode_chars(); // get this outside the loop |
| 430 | |
| 431 | // The `start` and `end` computation here matches the one in |
| 432 | // `unescape_non_raw_common` for consistency, even though this function |
| 433 | // doesn't have to worry about skipping any chars. |
| 434 | while let Some(c: char) = chars.next() { |
| 435 | let start: usize = src.len() - chars.as_str().len() - c.len_utf8(); |
| 436 | let res: Result = match c { |
| 437 | ' \r' => Err(EscapeError::BareCarriageReturnInRawString), |
| 438 | _ => ascii_check(c, allow_unicode_chars), |
| 439 | }; |
| 440 | let end: usize = src.len() - chars.as_str().len(); |
| 441 | callback(start..end, res); |
| 442 | } |
| 443 | } |
| 444 | |
/// Converts a `char` whose code point fits in one byte into that byte.
///
/// In debug builds this asserts that the code point is at most `u8::MAX`;
/// without debug assertions a larger value is truncated by the `as` cast.
#[inline]
pub fn byte_from_char(c: char) -> u8 {
    let res = c as u32;
    debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
    res as u8
}
| 451 | |