| 1 | //! Definition of a lexer for the WebAssembly text format. |
| 2 | //! |
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
| 7 | //! |
| 8 | //! If you'd like to iterate over the tokens in a file you can do so via: |
| 9 | //! |
| 10 | //! ``` |
| 11 | //! # fn foo() -> Result<(), wast::Error> { |
| 12 | //! use wast::lexer::Lexer; |
| 13 | //! |
| 14 | //! let wat = "(module (func $foo))" ; |
| 15 | //! for token in Lexer::new(wat).iter(0) { |
| 16 | //! println!("{:?}" , token?); |
| 17 | //! } |
| 18 | //! # Ok(()) |
| 19 | //! # } |
| 20 | //! ``` |
| 21 | //! |
| 22 | //! Note that you'll typically not use this module but will rather use |
| 23 | //! [`ParseBuffer`](crate::parser::ParseBuffer) instead. |
| 24 | //! |
| 25 | //! [`Lexer`]: crate::lexer::Lexer |
| 26 | |
| 27 | use crate::token::Span; |
| 28 | use crate::Error; |
| 29 | use std::borrow::Cow; |
| 30 | use std::char; |
| 31 | use std::fmt; |
| 32 | use std::slice; |
| 33 | use std::str; |
| 34 | use std::str::Utf8Error; |
| 35 | |
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    /// The entire source text being lexed; tokens borrow from this string.
    input: &'a str,
    /// Whether "confusing" unicode characters (e.g. text-direction-changing
    /// codepoints) are permitted in strings and comments; configured via
    /// [`Lexer::allow_confusing_unicode`], denied by default.
    allow_confusing_unicode: bool,
}
| 46 | |
/// A single token parsed from a `Lexer`.
///
/// The source text of a token is the `offset..offset + len` slice of the
/// input it was lexed from.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The kind of token this represents, such as whether it's whitespace, a
    /// keyword, etc.
    pub kind: TokenKind,
    /// The byte offset within the original source for where this token came
    /// from.
    pub offset: usize,
    /// The byte length of this token as it resides in the original source.
    //
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
    // This does limit a single token to being at most 4G large, but that seems
    // probably ok.
    pub len: u32,
}
| 63 | |
/// `Token` is intended to pack into the space of two pointers on 64-bit
/// targets; this test guards against accidental growth of the type.
#[test]
fn token_is_not_too_big() {
    let limit = std::mem::size_of::<u64>() * 2;
    assert!(std::mem::size_of::<Token>() <= limit);
}
| 68 | |
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
///
/// Note that variants carry no source text themselves; a token's text is
/// recovered from the input via the `offset`/`len` fields of [`Token`].
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A line comment, preceded with `;;`
    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested.
    BlockComment,

    /// A fragment of source that represents whitespace.
    Whitespace,

    /// A left-parenthesis token.
    LParen,
    /// A right-parenthesis token.
    RParen,

    /// A string literal, which is actually a list of bytes.
    String,

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$`.
    Id,

    /// A keyword, or something that starts with an alphabetic character.
    Keyword,

    /// An annotation (like `@foo`).
    ///
    /// All annotations start with `@`.
    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved,

    /// An integer.
    Integer(IntegerKind),

    /// A float.
    Float(FloatKind),
}
| 120 | |
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    /// The leading sign of the literal, if one was written (`+` or `-`).
    sign: Option<SignToken>,
    /// Whether the digit sequence contained `_` separators.
    has_underscores: bool,
    /// Whether the literal was written with a `0x` hexadecimal prefix.
    hex: bool,
}
| 128 | |
/// Description of a parsed float from the source.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    // An `inf` literal, optionally signed.
    #[doc(hidden)]
    Inf { negative: bool },
    // A `nan` literal with no explicit payload, optionally signed.
    #[doc(hidden)]
    Nan { negative: bool },
    // A `nan:0x...` literal with an explicit hex payload.
    #[doc(hidden)]
    NanVal {
        negative: bool,
        has_underscores: bool,
    },
    // An ordinary decimal or (when `hex`) hexadecimal float literal.
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
| 145 | |
| 146 | enum ReservedKind { |
| 147 | /// "..." |
| 148 | String, |
| 149 | /// anything that's just a sequence of `idchars!()` |
| 150 | Idchars, |
| 151 | /// $"..." |
| 152 | IdString, |
| 153 | /// @"..." |
| 154 | AnnotationString, |
| 155 | /// everything else (a conglomeration of strings, idchars, etc) |
| 156 | Reserved, |
| 157 | } |
| 158 | |
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals)
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found
        wanted: char,
        /// The character that was actually found
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as
    /// `$"\ff"`.
    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.
    EmptyId,

    /// An empty annotation was found, or a lone `@`.
    EmptyAnnotation,
}
| 228 | |
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// Plus sign: `+`
    Plus,
    /// Minus sign: `-`
    Minus,
}
| 237 | |
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    /// The sign parsed from the front of the literal, if any.
    sign: Option<SignToken>,
    /// The digit text of the literal; owned when it could not be borrowed
    /// directly from the source (presumably when `_` separators had to be
    /// stripped — construction happens elsewhere, confirm there).
    val: Cow<'a, str>,
    /// Whether `val` should be interpreted as base-16 digits.
    hex: bool,
}
| 246 | |
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A float `NaN` representation
    Nan {
        /// The specific bits to encode for this float, optionally
        val: Option<Cow<'a, str>>,
        /// Whether or not this is a negative `NaN` or not.
        negative: bool,
    },
    /// A float infinity representation,
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value
    Val {
        /// Whether or not the `integral` and `fractional` are specified in hex
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        fractional: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.fractional` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`
        exponent: Option<Cow<'a, str>>,
    },
}
| 276 | |
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
//
// Expands to a byte-level `match` pattern covering every character permitted
// by the `idchar` production: ASCII digits, letters, and the WAT punctuation
// set. Used both for dispatching in `parse_kind` and consuming runs of
// idchars in `parse_reserved`.
macro_rules! idchars {
    () => {
        b'0'..=b'9'
        | b'A'..=b'Z'
        | b'a'..=b'z'
        | b'!'
        | b'#'
        | b'$'
        | b'%'
        | b'&'
        | b'\''
        | b'*'
        | b'+'
        | b'-'
        | b'.'
        | b'/'
        | b':'
        | b'<'
        | b'='
        | b'>'
        | b'?'
        | b'@'
        | b'\\'
        | b'^'
        | b'_'
        | b'`'
        | b'|'
        | b'~'
    }
}
| 308 | |
| 309 | impl<'a> Lexer<'a> { |
| 310 | /// Creates a new lexer which will lex the `input` source string. |
| 311 | pub fn new(input: &str) -> Lexer<'_> { |
| 312 | Lexer { |
| 313 | input, |
| 314 | allow_confusing_unicode: false, |
| 315 | } |
| 316 | } |
| 317 | |
/// Returns the original source input that we're lexing.
///
/// Note that the returned string borrows for the `'a` lifetime of the
/// original input, not the lifetime of `&self`.
pub fn input(&self) -> &'a str {
    self.input
}
| 322 | |
| 323 | /// Configures whether "confusing" unicode characters are allowed while |
| 324 | /// lexing. |
| 325 | /// |
| 326 | /// If allowed then no error will happen if these characters are found, but |
| 327 | /// otherwise if disallowed a lex error will be produced when these |
| 328 | /// characters are found. Confusing characters are denied by default. |
| 329 | /// |
| 330 | /// For now "confusing characters" are primarily related to the "trojan |
| 331 | /// source" problem where it refers to characters which cause humans to read |
| 332 | /// text differently than this lexer, such as characters that alter the |
| 333 | /// left-to-right display of the source code. |
| 334 | pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self { |
| 335 | self.allow_confusing_unicode = allow; |
| 336 | self |
| 337 | } |
| 338 | |
| 339 | /// Lexes the next at the byte position `pos` in the input. |
| 340 | /// |
| 341 | /// Returns `Some` if a token is found or `None` if we're at EOF. |
| 342 | /// |
| 343 | /// The `pos` argument will be updated to point to the next token on a |
| 344 | /// successful parse. |
| 345 | /// |
| 346 | /// # Errors |
| 347 | /// |
| 348 | /// Returns an error if the input is malformed. |
| 349 | pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> { |
| 350 | let offset = *pos; |
| 351 | Ok(match self.parse_kind(pos)? { |
| 352 | Some(kind) => Some(Token { |
| 353 | kind, |
| 354 | offset, |
| 355 | len: (*pos - offset).try_into().unwrap(), |
| 356 | }), |
| 357 | None => None, |
| 358 | }) |
| 359 | } |
| 360 | |
| 361 | fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> { |
| 362 | let start = *pos; |
| 363 | // This `match` generally parses the grammar specified at |
| 364 | // |
| 365 | // https://webassembly.github.io/spec/core/text/lexical.html#text-token |
| 366 | let remaining = &self.input.as_bytes()[start..]; |
| 367 | let byte = match remaining.first() { |
| 368 | Some(b) => b, |
| 369 | None => return Ok(None), |
| 370 | }; |
| 371 | |
| 372 | match byte { |
| 373 | // Open-parens check the next character to see if this is the start |
| 374 | // of a block comment, otherwise it's just a bland left-paren |
| 375 | // token. |
| 376 | b'(' => match remaining.get(1) { |
| 377 | Some(b';' ) => { |
| 378 | let mut level = 1; |
| 379 | // Note that we're doing a byte-level search here for the |
| 380 | // close-delimiter of `;)`. The actual source text is utf-8 |
| 381 | // encode in `remaining` but due to how utf-8 works we |
| 382 | // can safely search for an ASCII byte since it'll never |
| 383 | // otherwise appear in the middle of a codepoint and if we |
| 384 | // find it then it's guaranteed to be the right byte. |
| 385 | // |
| 386 | // Mainly we're avoiding the overhead of decoding utf-8 |
| 387 | // characters into a Rust `char` since it's otherwise |
| 388 | // unnecessary work. |
| 389 | let mut iter = remaining[2..].iter(); |
| 390 | while let Some(ch) = iter.next() { |
| 391 | match ch { |
| 392 | b'(' => { |
| 393 | if let Some(b';' ) = iter.as_slice().first() { |
| 394 | level += 1; |
| 395 | iter.next(); |
| 396 | } |
| 397 | } |
| 398 | b';' => { |
| 399 | if let Some(b')' ) = iter.as_slice().first() { |
| 400 | level -= 1; |
| 401 | iter.next(); |
| 402 | if level == 0 { |
| 403 | let len = remaining.len() - iter.as_slice().len(); |
| 404 | let comment = &self.input[start..][..len]; |
| 405 | *pos += len; |
| 406 | self.check_confusing_comment(*pos, comment)?; |
| 407 | return Ok(Some(TokenKind::BlockComment)); |
| 408 | } |
| 409 | } |
| 410 | } |
| 411 | _ => {} |
| 412 | } |
| 413 | } |
| 414 | Err(self.error(start, LexError::DanglingBlockComment)) |
| 415 | } |
| 416 | _ => { |
| 417 | *pos += 1; |
| 418 | |
| 419 | Ok(Some(TokenKind::LParen)) |
| 420 | } |
| 421 | }, |
| 422 | |
| 423 | b')' => { |
| 424 | *pos += 1; |
| 425 | Ok(Some(TokenKind::RParen)) |
| 426 | } |
| 427 | |
| 428 | // https://webassembly.github.io/spec/core/text/lexical.html#white-space |
| 429 | b' ' | b' \n' | b' \r' | b' \t' => { |
| 430 | self.skip_ws(pos); |
| 431 | Ok(Some(TokenKind::Whitespace)) |
| 432 | } |
| 433 | |
| 434 | c @ (idchars!() | b'"' ) => { |
| 435 | let (kind, src) = self.parse_reserved(pos)?; |
| 436 | match kind { |
| 437 | // If the reserved token was simply a single string then |
| 438 | // that is converted to a standalone string token |
| 439 | ReservedKind::String => return Ok(Some(TokenKind::String)), |
| 440 | |
| 441 | // If only idchars were consumed then this could be a |
| 442 | // specific kind of standalone token we're interested in. |
| 443 | ReservedKind::Idchars => { |
| 444 | // https://webassembly.github.io/spec/core/text/values.html#integers |
| 445 | if let Some(ret) = self.classify_number(src) { |
| 446 | return Ok(Some(ret)); |
| 447 | // https://webassembly.github.io/spec/core/text/values.html#text-id |
| 448 | } else if *c == b'$' { |
| 449 | return Ok(Some(TokenKind::Id)); |
| 450 | // part of the WebAssembly/annotations proposal |
| 451 | // (no online url yet) |
| 452 | } else if *c == b'@' { |
| 453 | return Ok(Some(TokenKind::Annotation)); |
| 454 | // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword |
| 455 | } else if b'a' <= *c && *c <= b'z' { |
| 456 | return Ok(Some(TokenKind::Keyword)); |
| 457 | } |
| 458 | } |
| 459 | |
| 460 | ReservedKind::IdString => return Ok(Some(TokenKind::Id)), |
| 461 | ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)), |
| 462 | |
| 463 | // ... otherwise this was a conglomeration of idchars, |
| 464 | // strings, or just idchars that don't match a prior rule, |
| 465 | // meaning this falls through to the fallback `Reserved` |
| 466 | // token. |
| 467 | ReservedKind::Reserved => {} |
| 468 | } |
| 469 | |
| 470 | Ok(Some(TokenKind::Reserved)) |
| 471 | } |
| 472 | |
| 473 | // This could be a line comment, otherwise `;` is a reserved token. |
| 474 | // The second byte is checked to see if it's a `;;` line comment |
| 475 | // |
| 476 | // Note that this character being considered as part of a |
| 477 | // `reserved` token is part of the annotations proposal. |
| 478 | b';' => match remaining.get(1) { |
| 479 | Some(b';' ) => { |
| 480 | let remaining = &self.input[*pos..]; |
| 481 | let byte_pos = memchr::memchr2(b' \n' , b' \r' , remaining.as_bytes()) |
| 482 | .unwrap_or(remaining.len()); |
| 483 | *pos += byte_pos; |
| 484 | let comment = &remaining[..byte_pos]; |
| 485 | self.check_confusing_comment(*pos, comment)?; |
| 486 | Ok(Some(TokenKind::LineComment)) |
| 487 | } |
| 488 | _ => { |
| 489 | *pos += 1; |
| 490 | Ok(Some(TokenKind::Reserved)) |
| 491 | } |
| 492 | }, |
| 493 | |
| 494 | // Other known reserved tokens other than `;` |
| 495 | // |
| 496 | // Note that these characters being considered as part of a |
| 497 | // `reserved` token is part of the annotations proposal. |
| 498 | b',' | b'[' | b']' | b'{' | b'}' => { |
| 499 | *pos += 1; |
| 500 | Ok(Some(TokenKind::Reserved)) |
| 501 | } |
| 502 | |
| 503 | _ => { |
| 504 | let ch = self.input[start..].chars().next().unwrap(); |
| 505 | Err(self.error(*pos, LexError::Unexpected(ch))) |
| 506 | } |
| 507 | } |
| 508 | } |
| 509 | |
| 510 | fn skip_ws(&self, pos: &mut usize) { |
| 511 | // This table is a byte lookup table to determine whether a byte is a |
| 512 | // whitespace byte. There are only 4 whitespace bytes for the `*.wat` |
| 513 | // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes |
| 514 | // have a '1' in the table below. |
| 515 | // |
| 516 | // Due to how utf-8 works (our input is guaranteed to be utf-8) it is |
| 517 | // known that if these bytes are found they're guaranteed to be the |
| 518 | // whitespace byte, so they can be safely skipped and we don't have to |
| 519 | // do full utf-8 decoding. This means that the goal of this function is |
| 520 | // to find the first non-whitespace byte in `remaining`. |
| 521 | // |
| 522 | // For now this lookup table seems to be the fastest, but projects like |
| 523 | // https://github.com/lemire/despacer show other simd algorithms which |
| 524 | // can possibly accelerate this even more. Note that `*.wat` files often |
| 525 | // have a lot of whitespace so this function is typically quite hot when |
| 526 | // parsing inputs. |
| 527 | #[rustfmt::skip] |
| 528 | const WS: [u8; 256] = [ |
| 529 | // \t \n \r |
| 530 | /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, |
| 531 | /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 532 | // ' ' |
| 533 | /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 534 | /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 535 | /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 536 | /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 537 | /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 538 | /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 539 | /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 540 | /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 541 | /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 542 | /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 543 | /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 544 | /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 545 | /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 546 | /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 547 | ]; |
| 548 | let remaining = &self.input[*pos..]; |
| 549 | let non_ws_pos = remaining |
| 550 | .as_bytes() |
| 551 | .iter() |
| 552 | .position(|b| WS[*b as usize] != 1) |
| 553 | .unwrap_or(remaining.len()); |
| 554 | *pos += non_ws_pos; |
| 555 | } |
| 556 | |
/// Splits off a "reserved" token which is then further processed later on
/// to figure out which kind of token it is, depending on `ReservedKind`.
///
/// For more information on this method see the clarification at
/// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
/// that this is parsing the grammar:
///
/// ```text
/// reserved := (idchar | string)+
/// ```
///
/// which means that it is eating any number of adjacent string/idchar
/// tokens (e.g. `a"b"c`) and returning the classification of what was
/// eaten. The classification assists in determining what the actual token
/// here eaten looks like.
fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
    // Count how many idchar runs vs. string literals we consume; the pair
    // of counts drives the classification at the end.
    let mut idchars = 0u32;
    let mut strings = 0u32;
    let start = *pos;
    while let Some(byte) = self.input.as_bytes().get(*pos) {
        match byte {
            // Normal `idchars` production which appends to the reserved
            // token that's being produced.
            idchars!() => {
                idchars += 1;
                *pos += 1;
            }

            // https://webassembly.github.io/spec/core/text/values.html#text-string
            b'"' => {
                strings += 1;
                *pos += 1;
                let mut it = self.input[*pos..].chars();
                let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                // `parse_str` advanced `it` past the string; recover the
                // absolute byte position from how much of the input remains.
                *pos = self.input.len() - it.as_str().len();
                match result {
                    Ok(_) => {}
                    Err(e) => {
                        // Report the error at the offending character: for
                        // EOF that's the end of input, otherwise the last
                        // character consumed before the failure.
                        let err_pos = match &e {
                            LexError::UnexpectedEof => self.input.len(),
                            _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                        };
                        return Err(self.error(err_pos, e));
                    }
                }
            }

            // Nothing else is considered part of a reserved token
            _ => break,
        }
    }
    let ret = &self.input[start..*pos];
    Ok(match (idchars, strings) {
        // The caller only invokes this when the first byte is an idchar or
        // a quote, so at least one of the counts is nonzero.
        (0, 0) => unreachable!(),
        (0, 1) => (ReservedKind::String, ret),
        (_, 0) => (ReservedKind::Idchars, ret),
        // Pattern match `@"..."` and `$"..."` for string-based
        // identifiers and annotations.
        (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
        (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
        _ => (ReservedKind::Reserved, ret),
    })
}
| 620 | |
| 621 | fn classify_number(&self, src: &str) -> Option<TokenKind> { |
| 622 | let (sign, num) = if let Some(stripped) = src.strip_prefix('+' ) { |
| 623 | (Some(SignToken::Plus), stripped) |
| 624 | } else if let Some(stripped) = src.strip_prefix('-' ) { |
| 625 | (Some(SignToken::Minus), stripped) |
| 626 | } else { |
| 627 | (None, src) |
| 628 | }; |
| 629 | |
| 630 | let negative = sign == Some(SignToken::Minus); |
| 631 | |
| 632 | // Handle `inf` and `nan` which are special numbers here |
| 633 | if num == "inf" { |
| 634 | return Some(TokenKind::Float(FloatKind::Inf { negative })); |
| 635 | } else if num == "nan" { |
| 636 | return Some(TokenKind::Float(FloatKind::Nan { negative })); |
| 637 | } else if let Some(stripped) = num.strip_prefix("nan:0x" ) { |
| 638 | let mut it = stripped.as_bytes().iter(); |
| 639 | let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?; |
| 640 | if it.next().is_some() { |
| 641 | return None; |
| 642 | } |
| 643 | return Some(TokenKind::Float(FloatKind::NanVal { |
| 644 | negative, |
| 645 | has_underscores, |
| 646 | })); |
| 647 | } |
| 648 | |
| 649 | // Figure out if we're a hex number or not |
| 650 | let test_valid: fn(u8) -> bool; |
| 651 | let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x" ) { |
| 652 | test_valid = |x: u8| char::from(x).is_ascii_hexdigit(); |
| 653 | (stripped.as_bytes().iter(), true) |
| 654 | } else { |
| 655 | test_valid = |x: u8| char::from(x).is_ascii_digit(); |
| 656 | (num.as_bytes().iter(), false) |
| 657 | }; |
| 658 | |
| 659 | // Evaluate the first part, moving out all underscores |
| 660 | let mut has_underscores = skip_underscores(&mut it, test_valid)?; |
| 661 | |
| 662 | match it.clone().next() { |
| 663 | // If we're followed by something this may be a float so keep going. |
| 664 | Some(_) => {} |
| 665 | |
| 666 | // Otherwise this is a valid integer literal! |
| 667 | None => { |
| 668 | return Some(TokenKind::Integer(IntegerKind { |
| 669 | has_underscores, |
| 670 | sign, |
| 671 | hex, |
| 672 | })) |
| 673 | } |
| 674 | } |
| 675 | |
| 676 | // A number can optionally be after the dot so only actually try to |
| 677 | // parse one if it's there. |
| 678 | if it.clone().next() == Some(&b'.' ) { |
| 679 | it.next(); |
| 680 | match it.clone().next() { |
| 681 | Some(c) if test_valid(*c) => { |
| 682 | if skip_underscores(&mut it, test_valid)? { |
| 683 | has_underscores = true; |
| 684 | } |
| 685 | } |
| 686 | Some(_) | None => {} |
| 687 | } |
| 688 | }; |
| 689 | |
| 690 | // Figure out if there's an exponential part here to make a float, and |
| 691 | // if so parse it but defer its actual calculation until later. |
| 692 | match (hex, it.next()) { |
| 693 | (true, Some(b'p' )) | (true, Some(b'P' )) | (false, Some(b'e' )) | (false, Some(b'E' )) => { |
| 694 | match it.clone().next() { |
| 695 | Some(b'-' ) => { |
| 696 | it.next(); |
| 697 | } |
| 698 | Some(b'+' ) => { |
| 699 | it.next(); |
| 700 | } |
| 701 | _ => {} |
| 702 | } |
| 703 | if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? { |
| 704 | has_underscores = true; |
| 705 | } |
| 706 | } |
| 707 | (_, None) => {} |
| 708 | _ => return None, |
| 709 | } |
| 710 | |
| 711 | // We should have eaten everything by now, if not then this is surely |
| 712 | // not a float or integer literal. |
| 713 | if it.next().is_some() { |
| 714 | return None; |
| 715 | } |
| 716 | |
| 717 | return Some(TokenKind::Float(FloatKind::Normal { |
| 718 | has_underscores, |
| 719 | hex, |
| 720 | })); |
| 721 | |
| 722 | fn skip_underscores<'a>( |
| 723 | it: &mut slice::Iter<'_, u8>, |
| 724 | good: fn(u8) -> bool, |
| 725 | ) -> Option<bool> { |
| 726 | let mut last_underscore = false; |
| 727 | let mut has_underscores = false; |
| 728 | let first = *it.next()?; |
| 729 | if !good(first) { |
| 730 | return None; |
| 731 | } |
| 732 | while let Some(c) = it.clone().next() { |
| 733 | if *c == b'_' && !last_underscore { |
| 734 | has_underscores = true; |
| 735 | it.next(); |
| 736 | last_underscore = true; |
| 737 | continue; |
| 738 | } |
| 739 | if !good(*c) { |
| 740 | break; |
| 741 | } |
| 742 | last_underscore = false; |
| 743 | it.next(); |
| 744 | } |
| 745 | if last_underscore { |
| 746 | return None; |
| 747 | } |
| 748 | Some(has_underscores) |
| 749 | } |
| 750 | } |
| 751 | |
| 752 | /// Verifies that `comment`, which is about to be returned, has a "confusing |
| 753 | /// unicode character" in it and should instead be transformed into an |
| 754 | /// error. |
| 755 | fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> { |
| 756 | if self.allow_confusing_unicode { |
| 757 | return Ok(()); |
| 758 | } |
| 759 | |
| 760 | // In an effort to avoid utf-8 decoding the entire `comment` the search |
| 761 | // here is a bit more optimized. This checks for the `0xe2` byte because |
| 762 | // in the utf-8 encoding that's the leading encoding byte for all |
| 763 | // "confusing characters". Each instance of 0xe2 is checked to see if it |
| 764 | // starts a confusing character, and if so that's returned. |
| 765 | // |
| 766 | // Also note that 0xe2 will never be found in the middle of a codepoint, |
| 767 | // it's always the start of a codepoint. This means that if our special |
| 768 | // characters show up they're guaranteed to start with 0xe2 bytes. |
| 769 | let bytes = comment.as_bytes(); |
| 770 | for pos in memchr::Memchr::new(0xe2, bytes) { |
| 771 | if let Some(c) = comment[pos..].chars().next() { |
| 772 | if is_confusing_unicode(c) { |
| 773 | // Note that `self.cur()` accounts for already having |
| 774 | // parsed `comment`, so we move backwards to where |
| 775 | // `comment` started and then add the index within |
| 776 | // `comment`. |
| 777 | let pos = end - comment.len() + pos; |
| 778 | return Err(self.error(pos, LexError::ConfusingUnicode(c))); |
| 779 | } |
| 780 | } |
| 781 | } |
| 782 | |
| 783 | Ok(()) |
| 784 | } |
| 785 | |
/// Parses the body of a string literal from `it`, assuming the opening `"`
/// has already been consumed. Consumes up to and including the closing `"`.
///
/// Returns the decoded bytes of the string: borrowed directly from the
/// source when no escapes were present, or an owned buffer once an escape
/// forces a copy.
fn parse_str(
    it: &mut str::Chars<'a>,
    allow_confusing_unicode: bool,
) -> Result<Cow<'a, [u8]>, LexError> {
    // `Start` means no escape has been seen yet, so the result can borrow
    // straight from the input; `String` carries the owned buffer built up
    // once the first escape is encountered.
    enum State {
        Start,
        String(Vec<u8>),
    }
    let orig = it.as_str();
    let mut state = State::Start;
    loop {
        match it.next().ok_or(LexError::UnexpectedEof)? {
            '"' => break,
            '\\' => {
                match state {
                    State::String(_) => {}
                    State::Start => {
                        // First escape: copy everything consumed so far
                        // (minus the backslash itself) into an owned buffer.
                        let pos = orig.len() - it.as_str().len() - 1;
                        state = State::String(orig[..pos].as_bytes().to_vec());
                    }
                }
                let buf = match &mut state {
                    State::String(b) => b,
                    State::Start => unreachable!(),
                };
                match it.next().ok_or(LexError::UnexpectedEof)? {
                    '"' => buf.push(b'"'),
                    '\'' => buf.push(b'\''),
                    't' => buf.push(b'\t'),
                    'n' => buf.push(b'\n'),
                    'r' => buf.push(b'\r'),
                    '\\' => buf.push(b'\\'),
                    'u' => {
                        // `\u{...}` escapes a unicode scalar value,
                        // utf-8-encoded into the output.
                        Lexer::must_eat_char(it, '{')?;
                        let n = Lexer::hexnum(it)?;
                        let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                        buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                        Lexer::must_eat_char(it, '}')?;
                    }
                    c1 if c1.is_ascii_hexdigit() => {
                        // Two-hex-digit escape encoding a raw byte.
                        let c2 = Lexer::hexdigit(it)?;
                        buf.push(to_hex(c1) * 16 + c2);
                    }
                    c => return Err(LexError::InvalidStringEscape(c)),
                }
            }
            // Control characters must be written via escapes.
            c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                return Err(LexError::InvalidStringElement(c))
            }
            c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                return Err(LexError::ConfusingUnicode(c))
            }
            // Ordinary characters only need copying once an escape has
            // switched us into the owned-buffer state.
            c => match &mut state {
                State::Start => {}
                State::String(v) => {
                    v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                }
            },
        }
    }
    match state {
        // No escapes: borrow the contents between the two quotes.
        State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
        State::String(s) => Ok(s.into()),
    }
}
| 851 | |
| 852 | /// Parses an id-or-string-based name from `it`. |
| 853 | /// |
| 854 | /// Note that `it` should already have been lexed and this is just |
| 855 | /// extracting the value. If the token lexed was `@a` then this should point |
| 856 | /// to `a`. |
| 857 | /// |
| 858 | /// This will automatically detect quoted syntax such as `@"..."` and the |
| 859 | /// byte string will be parsed and validated as utf-8. |
| 860 | /// |
| 861 | /// # Errors |
| 862 | /// |
| 863 | /// Returns an error if a quoted byte string is found and contains invalid |
| 864 | /// utf-8. |
| 865 | fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> { |
| 866 | if it.clone().next() == Some('"' ) { |
| 867 | it.next(); |
| 868 | match Lexer::parse_str(it, true)? { |
| 869 | Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) { |
| 870 | Ok(s) => Ok(Cow::Borrowed(s)), |
| 871 | Err(e) => Err(LexError::InvalidUtf8Id(e)), |
| 872 | }, |
| 873 | Cow::Owned(bytes) => match String::from_utf8(bytes) { |
| 874 | Ok(s) => Ok(Cow::Owned(s)), |
| 875 | Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())), |
| 876 | }, |
| 877 | } |
| 878 | } else { |
| 879 | Ok(Cow::Borrowed(it.as_str())) |
| 880 | } |
| 881 | } |
| 882 | |
| 883 | fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> { |
| 884 | let n = Lexer::hexdigit(it)?; |
| 885 | let mut last_underscore = false; |
| 886 | let mut n = n as u32; |
| 887 | while let Some(c) = it.clone().next() { |
| 888 | if c == '_' { |
| 889 | it.next(); |
| 890 | last_underscore = true; |
| 891 | continue; |
| 892 | } |
| 893 | if !c.is_ascii_hexdigit() { |
| 894 | break; |
| 895 | } |
| 896 | last_underscore = false; |
| 897 | it.next(); |
| 898 | n = n |
| 899 | .checked_mul(16) |
| 900 | .and_then(|n| n.checked_add(to_hex(c) as u32)) |
| 901 | .ok_or(LexError::NumberTooBig)?; |
| 902 | } |
| 903 | if last_underscore { |
| 904 | return Err(LexError::LoneUnderscore); |
| 905 | } |
| 906 | Ok(n) |
| 907 | } |
| 908 | |
| 909 | /// Reads a hexidecimal digit from the input stream, returning where it's |
| 910 | /// defined and the hex value. Returns an error on EOF or an invalid hex |
| 911 | /// digit. |
| 912 | fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> { |
| 913 | let ch = Lexer::must_char(it)?; |
| 914 | if ch.is_ascii_hexdigit() { |
| 915 | Ok(to_hex(ch)) |
| 916 | } else { |
| 917 | Err(LexError::InvalidHexDigit(ch)) |
| 918 | } |
| 919 | } |
| 920 | |
| 921 | /// Reads the next character from the input string and where it's located, |
| 922 | /// returning an error if the input stream is empty. |
| 923 | fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> { |
| 924 | it.next().ok_or(LexError::UnexpectedEof) |
| 925 | } |
| 926 | |
| 927 | /// Expects that a specific character must be read next |
| 928 | fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { |
| 929 | let found = Lexer::must_char(it)?; |
| 930 | if wanted == found { |
| 931 | Ok(()) |
| 932 | } else { |
| 933 | Err(LexError::Expected { wanted, found }) |
| 934 | } |
| 935 | } |
| 936 | |
    /// Creates an error at `pos` with the specified `kind`
    fn error(&self, pos: usize, kind: LexError) -> Error {
        // The full input is attached so the error can later render the
        // offending line/column for diagnostics.
        Error::lex(Span { offset: pos }, self.input, kind)
    }
| 941 | |
    /// Returns an iterator over all tokens in the original source string
    /// starting at the `pos` specified.
    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
        // `pos` is captured by the closure and advanced by each `parse` call.
        // `transpose` converts `Result<Option<Token>, _>` into
        // `Option<Result<Token, _>>` so the iterator terminates at end of
        // input (`None`) and surfaces lex errors as `Some(Err(..))`.
        std::iter::from_fn(move || self.parse(&mut pos).transpose())
    }
| 947 | |
| 948 | /// Returns whether an annotation is present at `pos`. If it is present then |
| 949 | /// `Ok(Some(token))` is returned corresponding to the token, otherwise |
| 950 | /// `Ok(None)` is returned. If the next token cannot be parsed then an error |
| 951 | /// is returned. |
| 952 | pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> { |
| 953 | let bytes = self.input.as_bytes(); |
| 954 | // Quickly reject anything that for sure isn't an annotation since this |
| 955 | // method is used every time an lparen is parsed. |
| 956 | if bytes.get(pos) != Some(&b'@' ) { |
| 957 | return Ok(None); |
| 958 | } |
| 959 | match self.parse(&mut pos)? { |
| 960 | Some(token) => match token.kind { |
| 961 | TokenKind::Annotation => Ok(Some(token)), |
| 962 | _ => Ok(None), |
| 963 | }, |
| 964 | None => Ok(None), |
| 965 | } |
| 966 | } |
| 967 | } |
| 968 | |
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // `offset`/`len` were recorded when this token was lexed from `s`, so
        // the slice is in-bounds; `try_into` only widens `len` to `usize`.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the `$` sigil; the lexer guarantees it is the first character.
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based annotation (e.g. `@"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the `@` sigil; the lexer guarantees it is the first character.
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote; the lexer already validated the whole
        // string, so re-parsing the payload here cannot fail.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Drop a leading `+`/`-` sign, if any, so the remaining text
                // starts with `nan:0x`.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                if has_underscores {
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split the token into integral/fractional/exponent pieces
                // around the optional `.` and the exponent marker (`p`/`P` for
                // hex floats, `e`/`E` for decimal floats).
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // Normalize the pieces: strip a leading `+` (a `-` is kept so
                // the sign survives in `integral`), treat `1.` as having no
                // fractional part at all, and later remove `_` separators and
                // the `0x` prefix.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                if hex {
                    // Remove the `0x` marker; any leading `-` sign remains in
                    // place since `replace` only touches the `0x` substring.
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A leading `+` is dropped; a leading `-` is kept as part of `val` so
        // it can be handed straight to integer parsing.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        if kind.hex {
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    /// Creates a lex error for this token's position against source `src`.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
| 1177 | |
| 1178 | impl<'a> Integer<'a> { |
| 1179 | /// Returns the sign token for this integer. |
| 1180 | pub fn sign(&self) -> Option<SignToken> { |
| 1181 | self.sign |
| 1182 | } |
| 1183 | |
| 1184 | /// Returns the value string that can be parsed for this integer, as well |
| 1185 | /// as the base that it should be parsed in |
| 1186 | pub fn val(&self) -> (&str, u32) { |
| 1187 | (&self.val, if self.hex { 16 } else { 10 }) |
| 1188 | } |
| 1189 | } |
| 1190 | |
/// Converts a single ASCII hex digit to its numeric value.
///
/// Callers guarantee `c` passed `is_ascii_hexdigit`; other inputs produce an
/// unspecified (wrapping) value just like the original arithmetic.
fn to_hex(c: char) -> u8 {
    if ('a'..='f').contains(&c) {
        c as u8 - b'a' + 10
    } else if ('A'..='F').contains(&c) {
        c as u8 - b'A' + 10
    } else {
        c as u8 - b'0'
    }
}
| 1198 | |
| 1199 | impl fmt::Display for LexError { |
| 1200 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 1201 | use LexError::*; |
| 1202 | match self { |
| 1203 | DanglingBlockComment => f.write_str("unterminated block comment" )?, |
| 1204 | Unexpected(c) => write!(f, "unexpected character ' {}'" , escape_char(*c))?, |
| 1205 | InvalidStringElement(c) => { |
| 1206 | write!(f, "invalid character in string ' {}'" , escape_char(*c))? |
| 1207 | } |
| 1208 | InvalidStringEscape(c) => write!(f, "invalid string escape ' {}'" , escape_char(*c))?, |
| 1209 | InvalidHexDigit(c) => write!(f, "invalid hex digit ' {}'" , escape_char(*c))?, |
| 1210 | InvalidDigit(c) => write!(f, "invalid decimal digit ' {}'" , escape_char(*c))?, |
| 1211 | Expected { wanted, found } => write!( |
| 1212 | f, |
| 1213 | "expected ' {}' but found ' {}'" , |
| 1214 | escape_char(*wanted), |
| 1215 | escape_char(*found) |
| 1216 | )?, |
| 1217 | UnexpectedEof => write!(f, "unexpected end-of-file" )?, |
| 1218 | NumberTooBig => f.write_str("number is too big to parse" )?, |
| 1219 | InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x {:x}" , c)?, |
| 1220 | LoneUnderscore => write!(f, "bare underscore in numeric literal" )?, |
| 1221 | ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}" , c)?, |
| 1222 | InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id" )?, |
| 1223 | EmptyId => write!(f, "empty identifier" )?, |
| 1224 | EmptyAnnotation => write!(f, "empty annotation id" )?, |
| 1225 | } |
| 1226 | Ok(()) |
| 1227 | } |
| 1228 | } |
| 1229 | |
/// Renders `c` for inclusion in a human-readable error message, escaping
/// control characters and non-ASCII codepoints.
fn escape_char(c: char) -> String {
    // NB: the specific-character arms must precede the printable-ASCII range
    // arm below, since `\\`, `'` and `"` all fall inside that range.
    match c {
        '\t' => "\\t".to_string(),
        '\r' => "\\r".to_string(),
        '\n' => "\\n".to_string(),
        '\\' => "\\\\".to_string(),
        '\'' => "\\'".to_string(),
        '"' => "\"".to_string(),
        c if (' '..='~').contains(&c) => c.to_string(),
        _ => c.escape_unicode().to_string(),
    }
}
| 1242 | |
/// This is an attempt to protect against the "trojan source" [1] problem
/// where unicode characters can cause editors to render source code
/// differently for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional-override and isolate control characters.
    match ch {
        '\u{202a}' | '\u{202b}' | '\u{202d}' | '\u{202e}' | '\u{2066}' | '\u{2067}'
        | '\u{2068}' | '\u{206c}' | '\u{2069}' => true,
        _ => false,
    }
}
| 1265 | |
#[cfg(test)]
mod tests {
    use super::*;

    // Each helper below lexes the first token of `input` and asserts it has
    // the expected `TokenKind`, panicking otherwise.

    #[test]
    fn ws_smoke() {
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace("  "), "  ");
        assert_eq!(get_whitespace("  \n "), "  \n ");
        assert_eq!(get_whitespace("  x"), "  ");
        assert_eq!(get_whitespace("  ;"), "  ");
    }

    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        // The newline itself is not part of the comment token; trailing
        // spaces before it are.
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
        assert_eq!(get_line_comment(";; \rabc"), ";; ");
        assert_eq!(get_line_comment(";; \r\nabc"), ";; ");
    }

    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        // Block comments nest.
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }

    #[test]
    fn strings() {
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        // Underscores are allowed inside `\u{...}` escapes.
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every possible `\xx` two-hex-digit escape round-trips to its byte.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        // String-based ids resolve their quoted contents.
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }

    #[test]
    fn annotation() {
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {:?}", other),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        // `+` is dropped, `_` separators removed, `0x` prefix stripped.
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        fn get_float(input: &str) -> Float<'_> {
            let token = get_token(input);
            match token.kind {
                TokenKind::Float(f) => token.float(input, f),
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            Float::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            Float::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            Float::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            Float::Nan {
                val: Some("1".into()),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            Float::Nan {
                val: Some("7fffff".into()),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            Float::Val {
                integral: "-12".into(),
                fractional: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        // Trailing `.` yields no fractional part.
        assert_eq!(
            get_float("1."),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}
| 1574 | |