//! Definition of a lexer for the WebAssembly text format.
//!
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
//!
//! If you'd like to iterate over the tokens in a file you can do so via:
//!
//! ```
//! # fn foo() -> Result<(), wast::Error> {
//! use wast::lexer::Lexer;
//!
//! let wat = "(module (func $foo))";
//! for token in Lexer::new(wat).iter(0) {
//!     println!("{:?}", token?);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! Note that you'll typically not use this module but will rather use
//! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
//!
//! [`Lexer`]: crate::lexer::Lexer
|---|
| 26 |  | 
|---|
| 27 | use crate::token::Span; | 
|---|
| 28 | use crate::Error; | 
|---|
| 29 | use std::borrow::Cow; | 
|---|
| 30 | use std::char; | 
|---|
| 31 | use std::fmt; | 
|---|
| 32 | use std::slice; | 
|---|
| 33 | use std::str; | 
|---|
| 34 | use std::str::Utf8Error; | 
|---|
| 35 |  | 
|---|
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    // The entire source text being lexed; tokens refer back into this via
    // byte offset and length rather than storing slices themselves.
    input: &'a str,
    // When `true`, "confusing" unicode characters (e.g. text-direction
    // overrides) are permitted in comments and strings instead of producing
    // `LexError::ConfusingUnicode`. Off by default; see
    // `Lexer::allow_confusing_unicode`.
    allow_confusing_unicode: bool,
}
|---|
| 46 |  | 
|---|
/// A single token parsed from a `Lexer`.
///
/// Tokens do not carry their source text directly; instead they record a
/// byte range (`offset` + `len`) into the input that produced them.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The kind of token this represents, such as whether it's whitespace, a
    /// keyword, etc.
    pub kind: TokenKind,
    /// The byte offset within the original source for where this token came
    /// from.
    pub offset: usize,
    /// The byte length of this token as it resides in the original source.
    //
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
    // This does limit a single token to being at most 4G large, but that seems
    // probably ok.
    pub len: u32,
}
|---|
| 63 |  | 
|---|
// Guards the size assumption documented on `Token::len`: a `Token` must fit
// in two pointer-sized words so it stays cheap to copy around.
#[test]
fn token_is_not_too_big() {
    let budget = std::mem::size_of::<u64>() * 2;
    let actual = std::mem::size_of::<Token>();
    assert!(actual <= budget);
}
|---|
| 68 |  | 
|---|
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A line comment, preceded with `;;`
    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested.
    BlockComment,

    /// A fragment of source that represents whitespace.
    Whitespace,

    /// A left-parenthesis, including the source text for where it comes from.
    LParen,
    /// A right-parenthesis, including the source text for where it comes from.
    RParen,

    /// A string literal, which is actually a list of bytes.
    String,

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$` and the payload here is the original
    /// source text.
    Id,

    /// A keyword, or something that starts with an alphabetic character.
    ///
    /// The payload here is the original source text.
    Keyword,

    /// An annotation (like `@foo`).
    ///
    /// All annotations start with `@` and the payload will be the name of the
    /// annotation.
    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved,

    /// An integer.
    Integer(IntegerKind),

    /// A float.
    Float(FloatKind),
}
|---|
| 120 |  | 
|---|
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    // The leading sign, if one was written (`+` or `-`).
    sign: Option<SignToken>,
    // Whether any `_` digit separators appeared in the literal.
    has_underscores: bool,
    // Whether the literal was written with a `0x` hexadecimal prefix.
    hex: bool,
}
|---|
| 128 |  | 
|---|
/// Description of a parsed float from the source.
///
/// Variants mirror the shapes a WAT float literal can take: `inf`, `nan`,
/// `nan:0x...`, or an ordinary (possibly hex) numeric literal.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    #[doc(hidden)]
    Inf { negative: bool },
    #[doc(hidden)]
    Nan { negative: bool },
    #[doc(hidden)]
    NanVal {
        negative: bool,
        has_underscores: bool,
    },
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
|---|
| 145 |  | 
|---|
/// Classification of what `parse_reserved` consumed, used to refine a
/// "reserved" span into a more specific `TokenKind`.
enum ReservedKind {
    /// "..."
    String,
    /// anything that's just a sequence of `idchars!()`
    Idchars,
    /// $"..."
    IdString,
    /// @"..."
    AnnotationString,
    /// everything else (a conglomeration of strings, idchars, etc)
    Reserved,
}
|---|
| 158 |  | 
|---|
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals)
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found
        wanted: char,
        /// The character that was actually found
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as
    /// `$"\ff"`.
    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.
    EmptyId,

    /// An empty annotation was found, or a lone `@`.
    EmptyAnnotation,
}
|---|
| 228 |  | 
|---|
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// Plus sign: "+"
    Plus,
    /// Minus sign: "-"
    Minus,
}
|---|
| 237 |  | 
|---|
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    // The leading sign, if one was written.
    sign: Option<SignToken>,
    // The digit payload to hand to integer parsing. `Cow` allows this to
    // borrow from the source when possible and own a normalized copy
    // otherwise.
    val: Cow<'a, str>,
    // Whether the digits are hexadecimal.
    hex: bool,
}
|---|
| 246 |  | 
|---|
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A float `NaN` representation
    Nan {
        /// The specific bits to encode for this float, optionally
        val: Option<Cow<'a, str>>,
        /// Whether or not this is a negative `NaN` or not.
        negative: bool,
    },
    /// A float infinity representation,
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value
    Val {
        /// Whether or not the `integral` and `fractional` are specified in hex
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        fractional: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.fractional` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`
        exponent: Option<Cow<'a, str>>,
    },
}
|---|
| 276 |  | 
|---|
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
//
// Expands to a byte-level pattern matching any single `idchar`. All of these
// are ASCII bytes, so matching them byte-wise over utf-8 input is safe: an
// ASCII byte never appears as a continuation byte of a multi-byte codepoint.
macro_rules! idchars {
    () => {
        b'0'..=b'9'
            | b'A'..=b'Z'
            | b'a'..=b'z'
            | b'!'
            | b'#'
            | b'$'
            | b'%'
            | b'&'
            | b'\''
            | b'*'
            | b'+'
            | b'-'
            | b'.'
            | b'/'
            | b':'
            | b'<'
            | b'='
            | b'>'
            | b'?'
            | b'@'
            | b'\\'
            | b'^'
            | b'_'
            | b'`'
            | b'|'
            | b'~'
    }
}
|---|
| 308 |  | 
|---|
| 309 | impl<'a> Lexer<'a> { | 
|---|
| 310 | /// Creates a new lexer which will lex the `input` source string. | 
|---|
| 311 | pub fn new(input: &str) -> Lexer<'_> { | 
|---|
| 312 | Lexer { | 
|---|
| 313 | input, | 
|---|
| 314 | allow_confusing_unicode: false, | 
|---|
| 315 | } | 
|---|
| 316 | } | 
|---|
| 317 |  | 
|---|
    /// Returns the original source input that we're lexing.
    ///
    /// Note the `'a` return lifetime: the borrow is tied to the source text,
    /// not to `self`, so it can outlive this `Lexer`.
    pub fn input(&self) -> &'a str {
        self.input
    }
|---|
| 322 |  | 
|---|
    /// Configures whether "confusing" unicode characters are allowed while
    /// lexing.
    ///
    /// If allowed then no error will happen if these characters are found, but
    /// otherwise if disallowed a lex error will be produced when these
    /// characters are found. Confusing characters are denied by default.
    ///
    /// For now "confusing characters" are primarily related to the "trojan
    /// source" problem where it refers to characters which cause humans to read
    /// text differently than this lexer, such as characters that alter the
    /// left-to-right display of the source code.
    ///
    /// Returns `&mut Self` to allow builder-style chaining.
    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
        self.allow_confusing_unicode = allow;
        self
    }
|---|
| 338 |  | 
|---|
| 339 | /// Lexes the next at the byte position `pos` in the input. | 
|---|
| 340 | /// | 
|---|
| 341 | /// Returns `Some` if a token is found or `None` if we're at EOF. | 
|---|
| 342 | /// | 
|---|
| 343 | /// The `pos` argument will be updated to point to the next token on a | 
|---|
| 344 | /// successful parse. | 
|---|
| 345 | /// | 
|---|
| 346 | /// # Errors | 
|---|
| 347 | /// | 
|---|
| 348 | /// Returns an error if the input is malformed. | 
|---|
| 349 | pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> { | 
|---|
| 350 | let offset = *pos; | 
|---|
| 351 | Ok(match self.parse_kind(pos)? { | 
|---|
| 352 | Some(kind) => Some(Token { | 
|---|
| 353 | kind, | 
|---|
| 354 | offset, | 
|---|
| 355 | len: (*pos - offset).try_into().unwrap(), | 
|---|
| 356 | }), | 
|---|
| 357 | None => None, | 
|---|
| 358 | }) | 
|---|
| 359 | } | 
|---|
| 360 |  | 
|---|
    /// Dispatches on the first byte at `*pos` to lex one token, advancing
    /// `*pos` past it and returning its classification (or `None` at EOF).
    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
        let start = *pos;
        // This `match` generally parses the grammar specified at
        //
        // https://webassembly.github.io/spec/core/text/lexical.html#text-token
        let remaining = &self.input.as_bytes()[start..];
        let byte = match remaining.first() {
            Some(b) => b,
            None => return Ok(None),
        };

        match byte {
            // Open-parens check the next character to see if this is the start
            // of a block comment, otherwise it's just a bland left-paren
            // token.
            b'(' => match remaining.get(1) {
                Some(b';') => {
                    // Tracks nesting depth of `(; ... ;)` pairs; the comment
                    // ends only when the outermost pair closes.
                    let mut level = 1;
                    // Note that we're doing a byte-level search here for the
                    // close-delimiter of `;)`. The actual source text is utf-8
                    // encoded in `remaining` but due to how utf-8 works we
                    // can safely search for an ASCII byte since it'll never
                    // otherwise appear in the middle of a codepoint and if we
                    // find it then it's guaranteed to be the right byte.
                    //
                    // Mainly we're avoiding the overhead of decoding utf-8
                    // characters into a Rust `char` since it's otherwise
                    // unnecessary work.
                    let mut iter = remaining[2..].iter();
                    while let Some(ch) = iter.next() {
                        match ch {
                            b'(' => {
                                if let Some(b';') = iter.as_slice().first() {
                                    level += 1;
                                    iter.next();
                                }
                            }
                            b';' => {
                                if let Some(b')') = iter.as_slice().first() {
                                    level -= 1;
                                    iter.next();
                                    if level == 0 {
                                        // Consumed length = total minus what
                                        // the iterator has left.
                                        let len = remaining.len() - iter.as_slice().len();
                                        let comment = &self.input[start..][..len];
                                        *pos += len;
                                        self.check_confusing_comment(*pos, comment)?;
                                        return Ok(Some(TokenKind::BlockComment));
                                    }
                                }
                            }
                            _ => {}
                        }
                    }
                    // Ran out of input with `level > 0`: unterminated comment.
                    Err(self.error(start, LexError::DanglingBlockComment))
                }
                _ => {
                    *pos += 1;

                    Ok(Some(TokenKind::LParen))
                }
            },

            b')' => {
                *pos += 1;
                Ok(Some(TokenKind::RParen))
            }

            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
            b' ' | b'\n' | b'\r' | b'\t' => {
                self.skip_ws(pos);
                Ok(Some(TokenKind::Whitespace))
            }

            c @ (idchars!() | b'"') => {
                let (kind, src) = self.parse_reserved(pos)?;
                match kind {
                    // If the reserved token was simply a single string then
                    // that is converted to a standalone string token
                    ReservedKind::String => return Ok(Some(TokenKind::String)),

                    // If only idchars were consumed then this could be a
                    // specific kind of standalone token we're interested in.
                    ReservedKind::Idchars => {
                        // https://webassembly.github.io/spec/core/text/values.html#integers
                        if let Some(ret) = self.classify_number(src) {
                            return Ok(Some(ret));
                        // https://webassembly.github.io/spec/core/text/values.html#text-id
                        } else if *c == b'$' {
                            return Ok(Some(TokenKind::Id));
                        // part of the WebAssembly/annotations proposal
                        // (no online url yet)
                        } else if *c == b'@' {
                            return Ok(Some(TokenKind::Annotation));
                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
                        } else if b'a' <= *c && *c <= b'z' {
                            return Ok(Some(TokenKind::Keyword));
                        }
                    }

                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                    // ... otherwise this was a conglomeration of idchars,
                    // strings, or just idchars that don't match a prior rule,
                    // meaning this falls through to the fallback `Reserved`
                    // token.
                    ReservedKind::Reserved => {}
                }

                Ok(Some(TokenKind::Reserved))
            }

            // This could be a line comment, otherwise `;` is a reserved token.
            // The second byte is checked to see if it's a `;;` line comment
            //
            // Note that this character being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b';' => match remaining.get(1) {
                Some(b';') => {
                    let remaining = &self.input[*pos..];
                    // Line comments run to the next newline (or EOF); `memchr2`
                    // finds whichever of `\n`/`\r` comes first.
                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
                        .unwrap_or(remaining.len());
                    *pos += byte_pos;
                    let comment = &remaining[..byte_pos];
                    self.check_confusing_comment(*pos, comment)?;
                    Ok(Some(TokenKind::LineComment))
                }
                _ => {
                    *pos += 1;
                    Ok(Some(TokenKind::Reserved))
                }
            },

            // Other known reserved tokens other than `;`
            //
            // Note that these characters being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b',' | b'[' | b']' | b'{' | b'}' => {
                *pos += 1;
                Ok(Some(TokenKind::Reserved))
            }

            _ => {
                // Not a recognized token-start byte; decode the full char
                // just for the error message.
                let ch = self.input[start..].chars().next().unwrap();
                Err(self.error(*pos, LexError::Unexpected(ch)))
            }
        }
    }
|---|
| 509 |  | 
|---|
| 510 | fn skip_ws(&self, pos: &mut usize) { | 
|---|
| 511 | // This table is a byte lookup table to determine whether a byte is a | 
|---|
| 512 | // whitespace byte. There are only 4 whitespace bytes for the `*.wat` | 
|---|
| 513 | // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes | 
|---|
| 514 | // have a '1' in the table below. | 
|---|
| 515 | // | 
|---|
| 516 | // Due to how utf-8 works (our input is guaranteed to be utf-8) it is | 
|---|
| 517 | // known that if these bytes are found they're guaranteed to be the | 
|---|
| 518 | // whitespace byte, so they can be safely skipped and we don't have to | 
|---|
| 519 | // do full utf-8 decoding. This means that the goal of this function is | 
|---|
| 520 | // to find the first non-whitespace byte in `remaining`. | 
|---|
| 521 | // | 
|---|
| 522 | // For now this lookup table seems to be the fastest, but projects like | 
|---|
| 523 | // https://github.com/lemire/despacer show other simd algorithms which | 
|---|
| 524 | // can possibly accelerate this even more. Note that `*.wat` files often | 
|---|
| 525 | // have a lot of whitespace so this function is typically quite hot when | 
|---|
| 526 | // parsing inputs. | 
|---|
| 527 | #[rustfmt::skip] | 
|---|
| 528 | const WS: [u8; 256] = [ | 
|---|
| 529 | //                                   \t \n       \r | 
|---|
| 530 | /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, | 
|---|
| 531 | /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 532 | //        ' ' | 
|---|
| 533 | /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 534 | /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 535 | /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 536 | /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 537 | /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 538 | /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 539 | /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 540 | /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 541 | /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 542 | /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 543 | /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 544 | /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 545 | /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 546 | /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
|---|
| 547 | ]; | 
|---|
| 548 | let remaining = &self.input[*pos..]; | 
|---|
| 549 | let non_ws_pos = remaining | 
|---|
| 550 | .as_bytes() | 
|---|
| 551 | .iter() | 
|---|
| 552 | .position(|b| WS[*b as usize] != 1) | 
|---|
| 553 | .unwrap_or(remaining.len()); | 
|---|
| 554 | *pos += non_ws_pos; | 
|---|
| 555 | } | 
|---|
| 556 |  | 
|---|
    /// Splits off a "reserved" token which is then further processed later on
    /// to figure out which kind of token it is, depending on `ReservedKind`.
    ///
    /// For more information on this method see the clarification at
    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
    /// that this is parsing the grammar:
    ///
    /// ```text
    /// reserved := (idchar | string)+
    /// ```
    ///
    /// which means that it is eating any number of adjacent string/idchar
    /// tokens (e.g. `a"b"c`) and returning the classification of what was
    /// eaten. The classification assists in determining what the actual token
    /// here eaten looks like.
    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
        // Counts of each component kind seen, used for the final
        // classification below.
        let mut idchars = 0u32;
        let mut strings = 0u32;
        let start = *pos;
        while let Some(byte) = self.input.as_bytes().get(*pos) {
            match byte {
                // Normal `idchars` production which appends to the reserved
                // token that's being produced.
                idchars!() => {
                    idchars += 1;
                    *pos += 1;
                }

                // https://webassembly.github.io/spec/core/text/values.html#text-string
                b'"' => {
                    strings += 1;
                    *pos += 1;
                    let mut it = self.input[*pos..].chars();
                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                    // `parse_str` consumed chars from `it`; recover the new
                    // byte position from how much of the tail remains.
                    *pos = self.input.len() - it.as_str().len();
                    match result {
                        Ok(_) => {}
                        Err(e) => {
                            // Point the error at the offending character: EOF
                            // errors point at the end of input, others at the
                            // last character consumed.
                            let err_pos = match &e {
                                LexError::UnexpectedEof => self.input.len(),
                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                            };
                            return Err(self.error(err_pos, e));
                        }
                    }
                }

                // Nothing else is considered part of a reserved token
                _ => break,
            }
        }
        let ret = &self.input[start..*pos];
        Ok(match (idchars, strings) {
            // Caller only invokes this after seeing an idchar or `"`.
            (0, 0) => unreachable!(),
            (0, 1) => (ReservedKind::String, ret),
            (_, 0) => (ReservedKind::Idchars, ret),
            // Pattern match `@"..."` and `$"..."` for string-based
            // identifiers and annotations.
            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
            _ => (ReservedKind::Reserved, ret),
        })
    }
|---|
| 620 |  | 
|---|
| 621 | fn classify_number(&self, src: &str) -> Option<TokenKind> { | 
|---|
| 622 | let (sign, num) = if let Some(stripped) = src.strip_prefix( '+') { | 
|---|
| 623 | (Some(SignToken::Plus), stripped) | 
|---|
| 624 | } else if let Some(stripped) = src.strip_prefix( '-') { | 
|---|
| 625 | (Some(SignToken::Minus), stripped) | 
|---|
| 626 | } else { | 
|---|
| 627 | (None, src) | 
|---|
| 628 | }; | 
|---|
| 629 |  | 
|---|
| 630 | let negative = sign == Some(SignToken::Minus); | 
|---|
| 631 |  | 
|---|
| 632 | // Handle `inf` and `nan` which are special numbers here | 
|---|
| 633 | if num == "inf"{ | 
|---|
| 634 | return Some(TokenKind::Float(FloatKind::Inf { negative })); | 
|---|
| 635 | } else if num == "nan"{ | 
|---|
| 636 | return Some(TokenKind::Float(FloatKind::Nan { negative })); | 
|---|
| 637 | } else if let Some(stripped) = num.strip_prefix( "nan:0x") { | 
|---|
| 638 | let mut it = stripped.as_bytes().iter(); | 
|---|
| 639 | let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?; | 
|---|
| 640 | if it.next().is_some() { | 
|---|
| 641 | return None; | 
|---|
| 642 | } | 
|---|
| 643 | return Some(TokenKind::Float(FloatKind::NanVal { | 
|---|
| 644 | negative, | 
|---|
| 645 | has_underscores, | 
|---|
| 646 | })); | 
|---|
| 647 | } | 
|---|
| 648 |  | 
|---|
| 649 | // Figure out if we're a hex number or not | 
|---|
| 650 | let test_valid: fn(u8) -> bool; | 
|---|
| 651 | let (mut it, hex) = if let Some(stripped) = num.strip_prefix( "0x") { | 
|---|
| 652 | test_valid = |x: u8| char::from(x).is_ascii_hexdigit(); | 
|---|
| 653 | (stripped.as_bytes().iter(), true) | 
|---|
| 654 | } else { | 
|---|
| 655 | test_valid = |x: u8| char::from(x).is_ascii_digit(); | 
|---|
| 656 | (num.as_bytes().iter(), false) | 
|---|
| 657 | }; | 
|---|
| 658 |  | 
|---|
| 659 | // Evaluate the first part, moving out all underscores | 
|---|
| 660 | let mut has_underscores = skip_underscores(&mut it, test_valid)?; | 
|---|
| 661 |  | 
|---|
| 662 | match it.clone().next() { | 
|---|
| 663 | // If we're followed by something this may be a float so keep going. | 
|---|
| 664 | Some(_) => {} | 
|---|
| 665 |  | 
|---|
| 666 | // Otherwise this is a valid integer literal! | 
|---|
| 667 | None => { | 
|---|
| 668 | return Some(TokenKind::Integer(IntegerKind { | 
|---|
| 669 | has_underscores, | 
|---|
| 670 | sign, | 
|---|
| 671 | hex, | 
|---|
| 672 | })) | 
|---|
| 673 | } | 
|---|
| 674 | } | 
|---|
| 675 |  | 
|---|
| 676 | // A number can optionally be after the dot so only actually try to | 
|---|
| 677 | // parse one if it's there. | 
|---|
| 678 | if it.clone().next() == Some(& b'.') { | 
|---|
| 679 | it.next(); | 
|---|
| 680 | match it.clone().next() { | 
|---|
| 681 | Some(c) if test_valid(*c) => { | 
|---|
| 682 | if skip_underscores(&mut it, test_valid)? { | 
|---|
| 683 | has_underscores = true; | 
|---|
| 684 | } | 
|---|
| 685 | } | 
|---|
| 686 | Some(_) | None => {} | 
|---|
| 687 | } | 
|---|
| 688 | }; | 
|---|
| 689 |  | 
|---|
| 690 | // Figure out if there's an exponential part here to make a float, and | 
|---|
| 691 | // if so parse it but defer its actual calculation until later. | 
|---|
| 692 | match (hex, it.next()) { | 
|---|
| 693 | (true, Some( b'p')) | (true, Some( b'P')) | (false, Some( b'e')) | (false, Some( b'E')) => { | 
|---|
| 694 | match it.clone().next() { | 
|---|
| 695 | Some( b'-') => { | 
|---|
| 696 | it.next(); | 
|---|
| 697 | } | 
|---|
| 698 | Some( b'+') => { | 
|---|
| 699 | it.next(); | 
|---|
| 700 | } | 
|---|
| 701 | _ => {} | 
|---|
| 702 | } | 
|---|
| 703 | if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? { | 
|---|
| 704 | has_underscores = true; | 
|---|
| 705 | } | 
|---|
| 706 | } | 
|---|
| 707 | (_, None) => {} | 
|---|
| 708 | _ => return None, | 
|---|
| 709 | } | 
|---|
| 710 |  | 
|---|
| 711 | // We should have eaten everything by now, if not then this is surely | 
|---|
| 712 | // not a float or integer literal. | 
|---|
| 713 | if it.next().is_some() { | 
|---|
| 714 | return None; | 
|---|
| 715 | } | 
|---|
| 716 |  | 
|---|
| 717 | return Some(TokenKind::Float(FloatKind::Normal { | 
|---|
| 718 | has_underscores, | 
|---|
| 719 | hex, | 
|---|
| 720 | })); | 
|---|
| 721 |  | 
|---|
| 722 | fn skip_underscores<'a>( | 
|---|
| 723 | it: &mut slice::Iter<'_, u8>, | 
|---|
| 724 | good: fn(u8) -> bool, | 
|---|
| 725 | ) -> Option<bool> { | 
|---|
| 726 | let mut last_underscore = false; | 
|---|
| 727 | let mut has_underscores = false; | 
|---|
| 728 | let first = *it.next()?; | 
|---|
| 729 | if !good(first) { | 
|---|
| 730 | return None; | 
|---|
| 731 | } | 
|---|
| 732 | while let Some(c) = it.clone().next() { | 
|---|
| 733 | if *c == b'_'&& !last_underscore { | 
|---|
| 734 | has_underscores = true; | 
|---|
| 735 | it.next(); | 
|---|
| 736 | last_underscore = true; | 
|---|
| 737 | continue; | 
|---|
| 738 | } | 
|---|
| 739 | if !good(*c) { | 
|---|
| 740 | break; | 
|---|
| 741 | } | 
|---|
| 742 | last_underscore = false; | 
|---|
| 743 | it.next(); | 
|---|
| 744 | } | 
|---|
| 745 | if last_underscore { | 
|---|
| 746 | return None; | 
|---|
| 747 | } | 
|---|
| 748 | Some(has_underscores) | 
|---|
| 749 | } | 
|---|
| 750 | } | 
|---|
| 751 |  | 
|---|
| 752 | /// Verifies that `comment`, which is about to be returned, has a "confusing | 
|---|
| 753 | /// unicode character" in it and should instead be transformed into an | 
|---|
| 754 | /// error. | 
|---|
| 755 | fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> { | 
|---|
| 756 | if self.allow_confusing_unicode { | 
|---|
| 757 | return Ok(()); | 
|---|
| 758 | } | 
|---|
| 759 |  | 
|---|
| 760 | // In an effort to avoid utf-8 decoding the entire `comment` the search | 
|---|
| 761 | // here is a bit more optimized. This checks for the `0xe2` byte because | 
|---|
| 762 | // in the utf-8 encoding that's the leading encoding byte for all | 
|---|
| 763 | // "confusing characters". Each instance of 0xe2 is checked to see if it | 
|---|
| 764 | // starts a confusing character, and if so that's returned. | 
|---|
| 765 | // | 
|---|
| 766 | // Also note that 0xe2 will never be found in the middle of a codepoint, | 
|---|
| 767 | // it's always the start of a codepoint. This means that if our special | 
|---|
| 768 | // characters show up they're guaranteed to start with 0xe2 bytes. | 
|---|
| 769 | let bytes = comment.as_bytes(); | 
|---|
| 770 | for pos in memchr::Memchr::new(0xe2, bytes) { | 
|---|
| 771 | if let Some(c) = comment[pos..].chars().next() { | 
|---|
| 772 | if is_confusing_unicode(c) { | 
|---|
| 773 | // Note that `self.cur()` accounts for already having | 
|---|
| 774 | // parsed `comment`, so we move backwards to where | 
|---|
| 775 | // `comment` started and then add the index within | 
|---|
| 776 | // `comment`. | 
|---|
| 777 | let pos = end - comment.len() + pos; | 
|---|
| 778 | return Err(self.error(pos, LexError::ConfusingUnicode(c))); | 
|---|
| 779 | } | 
|---|
| 780 | } | 
|---|
| 781 | } | 
|---|
| 782 |  | 
|---|
| 783 | Ok(()) | 
|---|
| 784 | } | 
|---|
| 785 |  | 
|---|
    // Parses the body of a WAT string literal from `it`, which must be
    // positioned just after the opening `"`. Consumes up to and including the
    // closing `"` and returns the decoded bytes. Returns a borrowed slice of
    // the original input when no escapes were present, and an owned buffer
    // otherwise.
    fn parse_str(
        it: &mut str::Chars<'a>,
        allow_confusing_unicode: bool,
    ) -> Result<Cow<'a, [u8]>, LexError> {
        // `Start` means everything seen so far is a verbatim slice of the
        // input; the first escape switches to `String`, an owned buffer
        // seeded with the bytes consumed so far.
        enum State {
            Start,
            String(Vec<u8>),
        }
        // Remember the input at the start so borrowed results (and the seed
        // for the owned buffer) can be sliced out of it by length arithmetic.
        let orig = it.as_str();
        let mut state = State::Start;
        loop {
            match it.next().ok_or(LexError::UnexpectedEof)? {
                // Unescaped closing quote terminates the string.
                '"' => break,
                '\\' => {
                    match state {
                        State::String(_) => {}
                        State::Start => {
                            // First escape: everything before the backslash
                            // (`-1` accounts for the backslash itself) becomes
                            // the start of the owned buffer.
                            let pos = orig.len() - it.as_str().len() - 1;
                            state = State::String(orig[..pos].as_bytes().to_vec());
                        }
                    }
                    let buf = match &mut state {
                        State::String(b) => b,
                        // Just transitioned to `String` above if necessary.
                        State::Start => unreachable!(),
                    };
                    match it.next().ok_or(LexError::UnexpectedEof)? {
                        '"' => buf.push(b'"'),
                        '\'' => buf.push(b'\''),
                        't' => buf.push(b'\t'),
                        'n' => buf.push(b'\n'),
                        'r' => buf.push(b'\r'),
                        '\\' => buf.push(b'\\'),
                        'u' => {
                            // `\u{...}` escape: the payload must be a valid
                            // unicode scalar value, encoded as utf-8.
                            Lexer::must_eat_char(it, '{')?;
                            let n = Lexer::hexnum(it)?;
                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                            Lexer::must_eat_char(it, '}')?;
                        }
                        c1 if c1.is_ascii_hexdigit() => {
                            // `\XY` two-hex-digit escape produces one raw
                            // byte, which need not be valid utf-8.
                            let c2 = Lexer::hexdigit(it)?;
                            buf.push(to_hex(c1) * 16 + c2);
                        }
                        c => return Err(LexError::InvalidStringEscape(c)),
                    }
                }
                // Control characters (and DEL) must be written as escapes.
                c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                    return Err(LexError::InvalidStringElement(c))
                }
                // See `is_confusing_unicode` for the "trojan source" rationale.
                c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                    return Err(LexError::ConfusingUnicode(c))
                }
                c => match &mut state {
                    // No escapes so far: the char stays part of the borrowed
                    // slice, nothing to copy yet.
                    State::Start => {}
                    State::String(v) => {
                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                    }
                },
            }
        }
        match state {
            // `-1` excludes the closing quote just consumed.
            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
            State::String(s) => Ok(s.into()),
        }
    }
|---|
| 851 |  | 
|---|
| 852 | /// Parses an id-or-string-based name from `it`. | 
|---|
| 853 | /// | 
|---|
| 854 | /// Note that `it` should already have been lexed and this is just | 
|---|
| 855 | /// extracting the value. If the token lexed was `@a` then this should point | 
|---|
| 856 | /// to `a`. | 
|---|
| 857 | /// | 
|---|
| 858 | /// This will automatically detect quoted syntax such as `@"..."` and the | 
|---|
| 859 | /// byte string will be parsed and validated as utf-8. | 
|---|
| 860 | /// | 
|---|
| 861 | /// # Errors | 
|---|
| 862 | /// | 
|---|
| 863 | /// Returns an error if a quoted byte string is found and contains invalid | 
|---|
| 864 | /// utf-8. | 
|---|
| 865 | fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> { | 
|---|
| 866 | if it.clone().next() == Some( '"') { | 
|---|
| 867 | it.next(); | 
|---|
| 868 | match Lexer::parse_str(it, true)? { | 
|---|
| 869 | Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) { | 
|---|
| 870 | Ok(s) => Ok(Cow::Borrowed(s)), | 
|---|
| 871 | Err(e) => Err(LexError::InvalidUtf8Id(e)), | 
|---|
| 872 | }, | 
|---|
| 873 | Cow::Owned(bytes) => match String::from_utf8(bytes) { | 
|---|
| 874 | Ok(s) => Ok(Cow::Owned(s)), | 
|---|
| 875 | Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())), | 
|---|
| 876 | }, | 
|---|
| 877 | } | 
|---|
| 878 | } else { | 
|---|
| 879 | Ok(Cow::Borrowed(it.as_str())) | 
|---|
| 880 | } | 
|---|
| 881 | } | 
|---|
| 882 |  | 
|---|
| 883 | fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> { | 
|---|
| 884 | let n = Lexer::hexdigit(it)?; | 
|---|
| 885 | let mut last_underscore = false; | 
|---|
| 886 | let mut n = n as u32; | 
|---|
| 887 | while let Some(c) = it.clone().next() { | 
|---|
| 888 | if c == '_'{ | 
|---|
| 889 | it.next(); | 
|---|
| 890 | last_underscore = true; | 
|---|
| 891 | continue; | 
|---|
| 892 | } | 
|---|
| 893 | if !c.is_ascii_hexdigit() { | 
|---|
| 894 | break; | 
|---|
| 895 | } | 
|---|
| 896 | last_underscore = false; | 
|---|
| 897 | it.next(); | 
|---|
| 898 | n = n | 
|---|
| 899 | .checked_mul(16) | 
|---|
| 900 | .and_then(|n| n.checked_add(to_hex(c) as u32)) | 
|---|
| 901 | .ok_or(LexError::NumberTooBig)?; | 
|---|
| 902 | } | 
|---|
| 903 | if last_underscore { | 
|---|
| 904 | return Err(LexError::LoneUnderscore); | 
|---|
| 905 | } | 
|---|
| 906 | Ok(n) | 
|---|
| 907 | } | 
|---|
| 908 |  | 
|---|
| 909 | /// Reads a hexidecimal digit from the input stream, returning where it's | 
|---|
| 910 | /// defined and the hex value. Returns an error on EOF or an invalid hex | 
|---|
| 911 | /// digit. | 
|---|
| 912 | fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> { | 
|---|
| 913 | let ch = Lexer::must_char(it)?; | 
|---|
| 914 | if ch.is_ascii_hexdigit() { | 
|---|
| 915 | Ok(to_hex(ch)) | 
|---|
| 916 | } else { | 
|---|
| 917 | Err(LexError::InvalidHexDigit(ch)) | 
|---|
| 918 | } | 
|---|
| 919 | } | 
|---|
| 920 |  | 
|---|
| 921 | /// Reads the next character from the input string and where it's located, | 
|---|
| 922 | /// returning an error if the input stream is empty. | 
|---|
| 923 | fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> { | 
|---|
| 924 | it.next().ok_or(LexError::UnexpectedEof) | 
|---|
| 925 | } | 
|---|
| 926 |  | 
|---|
| 927 | /// Expects that a specific character must be read next | 
|---|
| 928 | fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { | 
|---|
| 929 | let found = Lexer::must_char(it)?; | 
|---|
| 930 | if wanted == found { | 
|---|
| 931 | Ok(()) | 
|---|
| 932 | } else { | 
|---|
| 933 | Err(LexError::Expected { wanted, found }) | 
|---|
| 934 | } | 
|---|
| 935 | } | 
|---|
| 936 |  | 
|---|
| 937 | /// Creates an error at `pos` with the specified `kind` | 
|---|
| 938 | fn error(&self, pos: usize, kind: LexError) -> Error { | 
|---|
| 939 | Error::lex(Span { offset: pos }, self.input, kind) | 
|---|
| 940 | } | 
|---|
| 941 |  | 
|---|
| 942 | /// Returns an iterator over all tokens in the original source string | 
|---|
| 943 | /// starting at the `pos` specified. | 
|---|
| 944 | pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ { | 
|---|
| 945 | std::iter::from_fn(move || self.parse(&mut pos).transpose()) | 
|---|
| 946 | } | 
|---|
| 947 |  | 
|---|
| 948 | /// Returns whether an annotation is present at `pos`. If it is present then | 
|---|
| 949 | /// `Ok(Some(token))` is returned corresponding to the token, otherwise | 
|---|
| 950 | /// `Ok(None)` is returned. If the next token cannot be parsed then an error | 
|---|
| 951 | /// is returned. | 
|---|
| 952 | pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> { | 
|---|
| 953 | let bytes = self.input.as_bytes(); | 
|---|
| 954 | // Quickly reject anything that for sure isn't an annotation since this | 
|---|
| 955 | // method is used every time an lparen is parsed. | 
|---|
| 956 | if bytes.get(pos) != Some(& b'@') { | 
|---|
| 957 | return Ok(None); | 
|---|
| 958 | } | 
|---|
| 959 | match self.parse(&mut pos)? { | 
|---|
| 960 | Some(token) => match token.kind { | 
|---|
| 961 | TokenKind::Annotation => Ok(Some(token)), | 
|---|
| 962 | _ => Ok(None), | 
|---|
| 963 | }, | 
|---|
| 964 | None => Ok(None), | 
|---|
| 965 | } | 
|---|
| 966 | } | 
|---|
| 967 | } | 
|---|
| 968 |  | 
|---|
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // NOTE(review): `len` appears to be a non-`usize` integer field; the
        // conversion is assumed infallible for tokens produced by this lexer.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the `$` sigil; the rest of the token is the name payload.
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the `@` sigil; the rest of the token is the name payload.
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote; `parse_str` consumes through the closing
        // quote. The token already lexed successfully, so this can't fail.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Drop a leading sign character if present so `src` starts
                // with the literal "nan:0x" prefix.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                if has_underscores {
                    // Only allocate when separators actually occur.
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split into integral / fractional / exponent parts. Hex
                // floats use `p`/`P` as the exponent marker (since `e` is a
                // digit); decimal floats use `e`/`E`.
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // Normalize: drop redundant `+` signs, empty fractional
                // parts, `_` separators, and the `0x` radix prefix, borrowing
                // from the source whenever no rewrite is needed.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                if hex {
                    // `replace` (not `strip_prefix`) handles a leading sign,
                    // e.g. "-0x1" becomes "-1".
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A redundant `+` is dropped; a `-` is kept so the value string
        // parses as a negative number.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        if kind.hex {
            // `replace` (not `strip_prefix`) handles "-0x10" -> "-10".
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    // Creates a lexing error of kind `err` positioned at this token's offset
    // within `src`.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
|---|
| 1177 |  | 
|---|
| 1178 | impl<'a> Integer<'a> { | 
|---|
| 1179 | /// Returns the sign token for this integer. | 
|---|
| 1180 | pub fn sign(&self) -> Option<SignToken> { | 
|---|
| 1181 | self.sign | 
|---|
| 1182 | } | 
|---|
| 1183 |  | 
|---|
| 1184 | /// Returns the value string that can be parsed for this integer, as well | 
|---|
| 1185 | /// as the base that it should be parsed in | 
|---|
| 1186 | pub fn val(&self) -> (&str, u32) { | 
|---|
| 1187 | (&self.val, if self.hex { 16 } else { 10 }) | 
|---|
| 1188 | } | 
|---|
| 1189 | } | 
|---|
| 1190 |  | 
|---|
// Converts a single hex digit character to its numeric value. Callers are
// expected to have already validated `c` with `is_ascii_hexdigit`; any other
// input yields an unspecified (but non-panicking) value.
fn to_hex(c: char) -> u8 {
    if ('a'..='f').contains(&c) {
        c as u8 - b'a' + 10
    } else if ('A'..='F').contains(&c) {
        c as u8 - b'A' + 10
    } else {
        c as u8 - b'0'
    }
}
|---|
| 1198 |  | 
|---|
| 1199 | impl fmt::Display for LexError { | 
|---|
| 1200 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | 
|---|
| 1201 | use LexError::*; | 
|---|
| 1202 | match self { | 
|---|
| 1203 | DanglingBlockComment => f.write_str( "unterminated block comment")?, | 
|---|
| 1204 | Unexpected(c) => write!(f, "unexpected character '{} '", escape_char(*c))?, | 
|---|
| 1205 | InvalidStringElement(c) => { | 
|---|
| 1206 | write!(f, "invalid character in string '{} '", escape_char(*c))? | 
|---|
| 1207 | } | 
|---|
| 1208 | InvalidStringEscape(c) => write!(f, "invalid string escape '{} '", escape_char(*c))?, | 
|---|
| 1209 | InvalidHexDigit(c) => write!(f, "invalid hex digit '{} '", escape_char(*c))?, | 
|---|
| 1210 | InvalidDigit(c) => write!(f, "invalid decimal digit '{} '", escape_char(*c))?, | 
|---|
| 1211 | Expected { wanted, found } => write!( | 
|---|
| 1212 | f, | 
|---|
| 1213 | "expected '{} ' but found '{} '", | 
|---|
| 1214 | escape_char(*wanted), | 
|---|
| 1215 | escape_char(*found) | 
|---|
| 1216 | )?, | 
|---|
| 1217 | UnexpectedEof => write!(f, "unexpected end-of-file")?, | 
|---|
| 1218 | NumberTooBig => f.write_str( "number is too big to parse")?, | 
|---|
| 1219 | InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x} ", c)?, | 
|---|
| 1220 | LoneUnderscore => write!(f, "bare underscore in numeric literal")?, | 
|---|
| 1221 | ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?} ", c)?, | 
|---|
| 1222 | InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?, | 
|---|
| 1223 | EmptyId => write!(f, "empty identifier")?, | 
|---|
| 1224 | EmptyAnnotation => write!(f, "empty annotation id")?, | 
|---|
| 1225 | } | 
|---|
| 1226 | Ok(()) | 
|---|
| 1227 | } | 
|---|
| 1228 | } | 
|---|
| 1229 |  | 
|---|
// Renders `c` for inclusion in an error message: common control characters
// get their backslash escape, printable ASCII passes through verbatim, and
// anything else is shown as a `\u{...}` escape.
fn escape_char(c: char) -> String {
    match c {
        '\t' => "\\t".to_string(),
        '\r' => "\\r".to_string(),
        '\n' => "\\n".to_string(),
        '\\' => "\\\\".to_string(),
        '\'' => "\\'".to_string(),
        '"' => "\"".to_string(),
        c if (' '..='~').contains(&c) => c.to_string(),
        other => other.escape_unicode().to_string(),
    }
}
|---|
| 1242 |  | 
|---|
/// This is an attempt to protect agains the "trojan source" [1] problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional-override and related invisible formatting codepoints.
    const CONFUSING: [char; 9] = [
        '\u{202a}',
        '\u{202b}',
        '\u{202d}',
        '\u{202e}',
        '\u{2066}',
        '\u{2067}',
        '\u{2068}',
        '\u{206c}',
        '\u{2069}',
    ];
    CONFUSING.contains(&ch)
}
|---|
| 1265 |  | 
|---|
| 1266 | #[ cfg(test)] | 
|---|
| 1267 | mod tests { | 
|---|
| 1268 | use super::*; | 
|---|
| 1269 |  | 
|---|
    #[test]
    fn ws_smoke() {
        // Lexes `input` and returns the source text of its first token,
        // panicking if that token isn't whitespace.
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace("  "), "  ");
        assert_eq!(get_whitespace("  \n "), "  \n ");
        // Whitespace token stops at the first non-whitespace byte.
        assert_eq!(get_whitespace("  x"), "  ");
        assert_eq!(get_whitespace("  ;"), "  ");
    }
|---|
| 1285 |  | 
|---|
    #[test]
    fn line_comment_smoke() {
        // Lexes `input` and returns the source text of its first token,
        // panicking if that token isn't a line comment.
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        // The comment extends up to, but not including, the line terminator.
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";;   \nabc"), ";;   ");
        assert_eq!(get_line_comment(";;   \rabc"), ";;   ");
        assert_eq!(get_line_comment(";;   \r\nabc"), ";;   ");
    }
|---|
| 1303 |  | 
|---|
    #[test]
    fn block_comment_smoke() {
        // Lexes `input` and returns the source text of its first token,
        // panicking if that token isn't a block comment.
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        // Block comments nest.
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }
|---|
| 1317 |  | 
|---|
    // Lexes the first token of `input`, panicking on a lex error or on empty
    // input; shared helper for the tests in this module.
    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }
|---|
| 1324 |  | 
|---|
    #[test]
    fn lparen() {
        // Only the first `(` is consumed as the token.
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }
|---|
| 1329 |  | 
|---|
    #[test]
    fn rparen() {
        // Only the first `)` is consumed as the token.
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }
|---|
| 1334 |  | 
|---|
    #[test]
    fn strings() {
        // Lexes `input` and returns the decoded bytes of its first token,
        // panicking if that token isn't a string.
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        // Escape sequences decode to their byte equivalents.
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        // `\u{...}` escapes encode as utf-8, and allow `_` separators.
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every byte value is expressible via a two-hex-digit escape.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }
|---|
| 1369 |  | 
|---|
    #[test]
    fn id() {
        // Lexes `input` and returns the identifier's contents (without the
        // leading `$`), panicking if the first token isn't an id.
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        // The id stops at comments and whitespace.
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        // String-based ids are unquoted/unescaped.
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }
|---|
| 1387 |  | 
|---|
    #[test]
    fn annotation() {
        // Lexes `input` and returns the annotation's contents (without the
        // leading `@`), panicking if the first token isn't an annotation.
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {:?}", other),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        // String-based annotations are unquoted/unescaped.
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }
|---|
| 1403 |  | 
|---|
    #[test]
    fn keyword() {
        // Lexes `input` and returns the keyword's source text, panicking if
        // the first token isn't a keyword.
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        // The keyword stops at whitespace.
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }
|---|
| 1419 |  | 
|---|
    #[test]
    fn reserved() {
        // Lexes `input` and returns the reserved token's source text,
        // panicking if the first token isn't reserved.
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }
|---|
| 1431 |  | 
|---|
    #[test]
    fn integer() {
        // Lexes `input` and returns the normalized integer payload: sign
        // retained, `_` separators and any `0x` prefix removed.
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        // A `-` is kept; a redundant `+` is dropped.
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        // The "0x" prefix is stripped even behind a sign.
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }
|---|
| 1451 |  | 
|---|
| 1452 | #[ test] | 
|---|
| 1453 | fn float() { | 
|---|
| 1454 | fn get_float(input: &str) -> Float<'_> { | 
|---|
| 1455 | let token = get_token(input); | 
|---|
| 1456 | match token.kind { | 
|---|
| 1457 | TokenKind::Float(f) => token.float(input, f), | 
|---|
| 1458 | other => panic!( "not float {:?}", other), | 
|---|
| 1459 | } | 
|---|
| 1460 | } | 
|---|
| 1461 | assert_eq!( | 
|---|
| 1462 | get_float( "nan"), | 
|---|
| 1463 | Float::Nan { | 
|---|
| 1464 | val: None, | 
|---|
| 1465 | negative: false | 
|---|
| 1466 | }, | 
|---|
| 1467 | ); | 
|---|
| 1468 | assert_eq!( | 
|---|
| 1469 | get_float( "-nan"), | 
|---|
| 1470 | Float::Nan { | 
|---|
| 1471 | val: None, | 
|---|
| 1472 | negative: true, | 
|---|
| 1473 | }, | 
|---|
| 1474 | ); | 
|---|
| 1475 | assert_eq!( | 
|---|
| 1476 | get_float( "+nan"), | 
|---|
| 1477 | Float::Nan { | 
|---|
| 1478 | val: None, | 
|---|
| 1479 | negative: false, | 
|---|
| 1480 | }, | 
|---|
| 1481 | ); | 
|---|
| 1482 | assert_eq!( | 
|---|
| 1483 | get_float( "+nan:0x1"), | 
|---|
| 1484 | Float::Nan { | 
|---|
| 1485 | val: Some( "1".into()), | 
|---|
| 1486 | negative: false, | 
|---|
| 1487 | }, | 
|---|
| 1488 | ); | 
|---|
| 1489 | assert_eq!( | 
|---|
| 1490 | get_float( "nan:0x7f_ffff"), | 
|---|
| 1491 | Float::Nan { | 
|---|
| 1492 | val: Some( "7fffff".into()), | 
|---|
| 1493 | negative: false, | 
|---|
| 1494 | }, | 
|---|
| 1495 | ); | 
|---|
| 1496 | assert_eq!(get_float( "inf"), Float::Inf { negative: false }); | 
|---|
| 1497 | assert_eq!(get_float( "-inf"), Float::Inf { negative: true }); | 
|---|
| 1498 | assert_eq!(get_float( "+inf"), Float::Inf { negative: false }); | 
|---|
| 1499 |  | 
|---|
| 1500 | assert_eq!( | 
|---|
| 1501 | get_float( "1.2"), | 
|---|
| 1502 | Float::Val { | 
|---|
| 1503 | integral: "1".into(), | 
|---|
| 1504 | fractional: Some( "2".into()), | 
|---|
| 1505 | exponent: None, | 
|---|
| 1506 | hex: false, | 
|---|
| 1507 | }, | 
|---|
| 1508 | ); | 
|---|
| 1509 | assert_eq!( | 
|---|
| 1510 | get_float( "1.2e3"), | 
|---|
| 1511 | Float::Val { | 
|---|
| 1512 | integral: "1".into(), | 
|---|
| 1513 | fractional: Some( "2".into()), | 
|---|
| 1514 | exponent: Some( "3".into()), | 
|---|
| 1515 | hex: false, | 
|---|
| 1516 | }, | 
|---|
| 1517 | ); | 
|---|
| 1518 | assert_eq!( | 
|---|
| 1519 | get_float( "-1_2.1_1E+0_1"), | 
|---|
| 1520 | Float::Val { | 
|---|
| 1521 | integral: "-12".into(), | 
|---|
| 1522 | fractional: Some( "11".into()), | 
|---|
| 1523 | exponent: Some( "01".into()), | 
|---|
| 1524 | hex: false, | 
|---|
| 1525 | }, | 
|---|
| 1526 | ); | 
|---|
| 1527 | assert_eq!( | 
|---|
| 1528 | get_float( "+1_2.1_1E-0_1"), | 
|---|
| 1529 | Float::Val { | 
|---|
| 1530 | integral: "12".into(), | 
|---|
| 1531 | fractional: Some( "11".into()), | 
|---|
| 1532 | exponent: Some( "-01".into()), | 
|---|
| 1533 | hex: false, | 
|---|
| 1534 | }, | 
|---|
| 1535 | ); | 
|---|
| 1536 | assert_eq!( | 
|---|
| 1537 | get_float( "0x1_2.3_4p5_6"), | 
|---|
| 1538 | Float::Val { | 
|---|
| 1539 | integral: "12".into(), | 
|---|
| 1540 | fractional: Some( "34".into()), | 
|---|
| 1541 | exponent: Some( "56".into()), | 
|---|
| 1542 | hex: true, | 
|---|
| 1543 | }, | 
|---|
| 1544 | ); | 
|---|
| 1545 | assert_eq!( | 
|---|
| 1546 | get_float( "+0x1_2.3_4P-5_6"), | 
|---|
| 1547 | Float::Val { | 
|---|
| 1548 | integral: "12".into(), | 
|---|
| 1549 | fractional: Some( "34".into()), | 
|---|
| 1550 | exponent: Some( "-56".into()), | 
|---|
| 1551 | hex: true, | 
|---|
| 1552 | }, | 
|---|
| 1553 | ); | 
|---|
| 1554 | assert_eq!( | 
|---|
| 1555 | get_float( "1."), | 
|---|
| 1556 | Float::Val { | 
|---|
| 1557 | integral: "1".into(), | 
|---|
| 1558 | fractional: None, | 
|---|
| 1559 | exponent: None, | 
|---|
| 1560 | hex: false, | 
|---|
| 1561 | }, | 
|---|
| 1562 | ); | 
|---|
| 1563 | assert_eq!( | 
|---|
| 1564 | get_float( "0x1p-24"), | 
|---|
| 1565 | Float::Val { | 
|---|
| 1566 | integral: "1".into(), | 
|---|
| 1567 | fractional: None, | 
|---|
| 1568 | exponent: Some( "-24".into()), | 
|---|
| 1569 | hex: true, | 
|---|
| 1570 | }, | 
|---|
| 1571 | ); | 
|---|
| 1572 | } | 
|---|
| 1573 | } | 
|---|
| 1574 |  | 
|---|