1//! Definition of a lexer for the WebAssembly text format.
2//!
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
7//!
8//! If you'd like to iterate over the tokens in a file you can do so via:
9//!
10//! ```
11//! # fn foo() -> Result<(), wast::Error> {
12//! use wast::lexer::Lexer;
13//!
14//! let wat = "(module (func $foo))";
15//! for token in Lexer::new(wat).iter(0) {
16//! println!("{:?}", token?);
17//! }
18//! # Ok(())
19//! # }
20//! ```
21//!
22//! Note that you'll typically not use this module but will rather use
23//! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
24//!
25//! [`Lexer`]: crate::lexer::Lexer
26
27use crate::token::Span;
28use crate::Error;
29use std::borrow::Cow;
30use std::char;
31use std::fmt;
32use std::slice;
33use std::str;
34use std::str::Utf8Error;
35
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    // The full source text being lexed; tokens reference it via byte offsets.
    input: &'a str,
    // When `true`, "confusing" unicode (e.g. bidi control characters) is
    // permitted in comments and strings instead of raising a lex error.
    // Disabled by default; toggled via `allow_confusing_unicode`.
    allow_confusing_unicode: bool,
}
46
/// A single token parsed from a `Lexer`.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The kind of token this represents, such as whether it's whitespace, a
    /// keyword, etc.
    pub kind: TokenKind,
    /// The byte offset within the original source for where this token came
    /// from.
    pub offset: usize,
    /// The byte length of this token as it resides in the original source.
    //
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
    // This does limit a single token to being at most 4G large, but that seems
    // probably ok. The size is checked by the `token_is_not_too_big` test.
    pub len: u32,
}
63
#[test]
fn token_is_not_too_big() {
    // `Token` is expected to pack into two pointer-sized words; see the note
    // on `Token::len` for why the length is stored as a `u32`.
    let actual = std::mem::size_of::<Token>();
    let limit = 2 * std::mem::size_of::<u64>();
    assert!(actual <= limit);
}
68
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A line comment, preceded with `;;`
    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested.
    BlockComment,

    /// A fragment of source that represents whitespace.
    Whitespace,

    /// A left-parenthesis, including the source text for where it comes from.
    LParen,
    /// A right-parenthesis, including the source text for where it comes from.
    RParen,

    /// A string literal, which is actually a list of bytes.
    String,

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$` and the payload here is the original
    /// source text.
    Id,

    /// A keyword, or something that starts with an alphabetic character.
    ///
    /// The payload here is the original source text.
    Keyword,

    /// An annotation (like `@foo`).
    ///
    /// All annotations start with `@` and the payload will be the name of the
    /// annotation.
    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved,

    /// An integer.
    Integer(IntegerKind),

    /// A float.
    Float(FloatKind),
}
120
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    // The leading sign, if one was present (`+` or `-`).
    sign: Option<SignToken>,
    // Whether `_` digit separators appeared and must be stripped before the
    // digits can be parsed into an integral type.
    has_underscores: bool,
    // Whether the literal used the `0x` hexadecimal prefix.
    hex: bool,
}
128
/// Description of a parsed float from the source.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    // `inf` or `-inf`
    #[doc(hidden)]
    Inf { negative: bool },
    // `nan` or `-nan`
    #[doc(hidden)]
    Nan { negative: bool },
    // `nan:0x...`, optionally negative, with an explicit hex payload
    #[doc(hidden)]
    NanVal {
        negative: bool,
        has_underscores: bool,
    },
    // An ordinary decimal or hexadecimal float literal
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
145
/// Classification of the text consumed by `Lexer::parse_reserved`, used to
/// refine a "reserved" lex into a more specific token kind.
enum ReservedKind {
    /// "..."
    String,
    /// anything that's just a sequence of `idchars!()`
    Idchars,
    /// $"..."
    IdString,
    /// @"..."
    AnnotationString,
    /// everything else (a conglomeration of strings, idchars, etc)
    Reserved,
}
158
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals)
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found
        wanted: char,
        /// The character that was actually found
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as
    /// `$"\ff"`.
    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.
    EmptyId,

    /// An empty identifier was found, or a lone `@`.
    EmptyAnnotation,
}
228
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// Plus sign: `+`
    Plus,
    /// Minus sign: `-`
    Minus,
}
237
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    // The leading sign, if one was present in the source.
    sign: Option<SignToken>,
    // The digits of the integer with any `_` separators removed; borrowed from
    // the source when no separators were present, owned otherwise.
    val: Cow<'a, str>,
    // Whether `val` holds hexadecimal digits (source had a `0x` prefix).
    hex: bool,
}
246
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A float `NaN` representation
    Nan {
        /// The specific bits to encode for this float, optionally
        val: Option<Cow<'a, str>>,
        /// Whether or not this is a negative `NaN` or not.
        negative: bool,
    },
    /// A float infinity representation
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value
    Val {
        /// Whether or not the `integral` and `fractional` are specified in hex
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        fractional: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.fractional` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`
        exponent: Option<Cow<'a, str>>,
    },
}
276
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
//
// Expands to a byte-level `match` pattern covering every byte permitted in an
// identifier/keyword, so callers can write `idchars!() => ...` match arms.
macro_rules! idchars {
    () => {
        b'0'..=b'9'
            | b'A'..=b'Z'
            | b'a'..=b'z'
            | b'!'
            | b'#'
            | b'$'
            | b'%'
            | b'&'
            | b'\''
            | b'*'
            | b'+'
            | b'-'
            | b'.'
            | b'/'
            | b':'
            | b'<'
            | b'='
            | b'>'
            | b'?'
            | b'@'
            | b'\\'
            | b'^'
            | b'_'
            | b'`'
            | b'|'
            | b'~'
    }
}
308
impl<'a> Lexer<'a> {
    /// Creates a new lexer which will lex the `input` source string.
    pub fn new(input: &str) -> Lexer<'_> {
        Lexer {
            input,
            allow_confusing_unicode: false,
        }
    }

    /// Returns the original source input that we're lexing.
    pub fn input(&self) -> &'a str {
        self.input
    }

    /// Configures whether "confusing" unicode characters are allowed while
    /// lexing.
    ///
    /// If allowed then no error will happen if these characters are found, but
    /// otherwise if disallowed a lex error will be produced when these
    /// characters are found. Confusing characters are denied by default.
    ///
    /// For now "confusing characters" are primarily related to the "trojan
    /// source" problem where it refers to characters which cause humans to read
    /// text differently than this lexer, such as characters that alter the
    /// left-to-right display of the source code.
    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
        self.allow_confusing_unicode = allow;
        self
    }

    /// Lexes the next token at the byte position `pos` in the input.
    ///
    /// Returns `Some` if a token is found or `None` if we're at EOF.
    ///
    /// The `pos` argument will be updated to point to the next token on a
    /// successful parse.
    ///
    /// # Errors
    ///
    /// Returns an error if the input is malformed.
    pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> {
        let offset = *pos;
        Ok(match self.parse_kind(pos)? {
            Some(kind) => Some(Token {
                kind,
                offset,
                // `Token::len` is deliberately a `u32` (see the struct docs);
                // tokens over 4G would fail this conversion.
                len: (*pos - offset).try_into().unwrap(),
            }),
            None => None,
        })
    }

    // Lexes the kind of the next token at `*pos`, advancing `*pos` past it.
    // Returns `Ok(None)` at EOF.
    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
        let start = *pos;
        // This `match` generally parses the grammar specified at
        //
        // https://webassembly.github.io/spec/core/text/lexical.html#text-token
        let remaining = &self.input.as_bytes()[start..];
        let byte = match remaining.first() {
            Some(b) => b,
            None => return Ok(None),
        };

        match byte {
            // Open-parens check the next character to see if this is the start
            // of a block comment, otherwise it's just a bland left-paren
            // token.
            b'(' => match remaining.get(1) {
                Some(b';') => {
                    let mut level = 1;
                    // Note that we're doing a byte-level search here for the
                    // close-delimiter of `;)`. The actual source text is utf-8
                    // encode in `remaining` but due to how utf-8 works we
                    // can safely search for an ASCII byte since it'll never
                    // otherwise appear in the middle of a codepoint and if we
                    // find it then it's guaranteed to be the right byte.
                    //
                    // Mainly we're avoiding the overhead of decoding utf-8
                    // characters into a Rust `char` since it's otherwise
                    // unnecessary work.
                    let mut iter = remaining[2..].iter();
                    while let Some(ch) = iter.next() {
                        match ch {
                            // `(;` increases the nesting level; consume both
                            // bytes so `(;)` isn't miscounted.
                            b'(' => {
                                if let Some(b';') = iter.as_slice().first() {
                                    level += 1;
                                    iter.next();
                                }
                            }
                            // `;)` decreases the nesting level; level 0 means
                            // the outermost comment is finished.
                            b';' => {
                                if let Some(b')') = iter.as_slice().first() {
                                    level -= 1;
                                    iter.next();
                                    if level == 0 {
                                        let len = remaining.len() - iter.as_slice().len();
                                        let comment = &self.input[start..][..len];
                                        *pos += len;
                                        self.check_confusing_comment(*pos, comment)?;
                                        return Ok(Some(TokenKind::BlockComment));
                                    }
                                }
                            }
                            _ => {}
                        }
                    }
                    Err(self.error(start, LexError::DanglingBlockComment))
                }
                _ => {
                    *pos += 1;

                    Ok(Some(TokenKind::LParen))
                }
            },

            b')' => {
                *pos += 1;
                Ok(Some(TokenKind::RParen))
            }

            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
            b' ' | b'\n' | b'\r' | b'\t' => {
                self.skip_ws(pos);
                Ok(Some(TokenKind::Whitespace))
            }

            c @ (idchars!() | b'"') => {
                let (kind, src) = self.parse_reserved(pos)?;
                match kind {
                    // If the reserved token was simply a single string then
                    // that is converted to a standalone string token
                    ReservedKind::String => return Ok(Some(TokenKind::String)),

                    // If only idchars were consumed then this could be a
                    // specific kind of standalone token we're interested in.
                    ReservedKind::Idchars => {
                        // https://webassembly.github.io/spec/core/text/values.html#integers
                        if let Some(ret) = self.classify_number(src) {
                            return Ok(Some(ret));
                        // https://webassembly.github.io/spec/core/text/values.html#text-id
                        } else if *c == b'$' {
                            return Ok(Some(TokenKind::Id));
                        // part of the WebAssembly/annotations proposal
                        // (no online url yet)
                        } else if *c == b'@' {
                            return Ok(Some(TokenKind::Annotation));
                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
                        } else if b'a' <= *c && *c <= b'z' {
                            return Ok(Some(TokenKind::Keyword));
                        }
                    }

                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                    // ... otherwise this was a conglomeration of idchars,
                    // strings, or just idchars that don't match a prior rule,
                    // meaning this falls through to the fallback `Reserved`
                    // token.
                    ReservedKind::Reserved => {}
                }

                Ok(Some(TokenKind::Reserved))
            }

            // This could be a line comment, otherwise `;` is a reserved token.
            // The second byte is checked to see if it's a `;;` line comment
            //
            // Note that this character being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b';' => match remaining.get(1) {
                Some(b';') => {
                    let remaining = &self.input[*pos..];
                    // A line comment extends to the next `\n`/`\r`, or EOF.
                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
                        .unwrap_or(remaining.len());
                    *pos += byte_pos;
                    let comment = &remaining[..byte_pos];
                    self.check_confusing_comment(*pos, comment)?;
                    Ok(Some(TokenKind::LineComment))
                }
                _ => {
                    *pos += 1;
                    Ok(Some(TokenKind::Reserved))
                }
            },

            // Other known reserved tokens other than `;`
            //
            // Note that these characters being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b',' | b'[' | b']' | b'{' | b'}' => {
                *pos += 1;
                Ok(Some(TokenKind::Reserved))
            }

            _ => {
                let ch = self.input[start..].chars().next().unwrap();
                Err(self.error(*pos, LexError::Unexpected(ch)))
            }
        }
    }

    // Advances `*pos` past any run of whitespace bytes.
    fn skip_ws(&self, pos: &mut usize) {
        // This table is a byte lookup table to determine whether a byte is a
        // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
        // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
        // have a '1' in the table below.
        //
        // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
        // known that if these bytes are found they're guaranteed to be the
        // whitespace byte, so they can be safely skipped and we don't have to
        // do full utf-8 decoding. This means that the goal of this function is
        // to find the first non-whitespace byte in `remaining`.
        //
        // For now this lookup table seems to be the fastest, but projects like
        // https://github.com/lemire/despacer show other simd algorithms which
        // can possibly accelerate this even more. Note that `*.wat` files often
        // have a lot of whitespace so this function is typically quite hot when
        // parsing inputs.
        #[rustfmt::skip]
        const WS: [u8; 256] = [
            //                                   \t \n       \r
            /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
            /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            // ' '
            /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let remaining = &self.input[*pos..];
        let non_ws_pos = remaining
            .as_bytes()
            .iter()
            .position(|b| WS[*b as usize] != 1)
            .unwrap_or(remaining.len());
        *pos += non_ws_pos;
    }

    /// Splits off a "reserved" token which is then further processed later on
    /// to figure out which kind of token it is depending on `ReservedKind`.
    ///
    /// For more information on this method see the clarification at
    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
    /// that this is parsing the grammar:
    ///
    /// ```text
    /// reserved := (idchar | string)+
    /// ```
    ///
    /// which means that it is eating any number of adjacent string/idchar
    /// tokens (e.g. `a"b"c`) and returning the classification of what was
    /// eaten. The classification assists in determining what the actual token
    /// here eaten looks like.
    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
        let mut idchars = 0u32;
        let mut strings = 0u32;
        let start = *pos;
        while let Some(byte) = self.input.as_bytes().get(*pos) {
            match byte {
                // Normal `idchars` production which appends to the reserved
                // token that's being produced.
                idchars!() => {
                    idchars += 1;
                    *pos += 1;
                }

                // https://webassembly.github.io/spec/core/text/values.html#text-string
                b'"' => {
                    strings += 1;
                    *pos += 1;
                    let mut it = self.input[*pos..].chars();
                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                    // `parse_str` consumed characters from `it`; recover the
                    // absolute position from how much of the input remains.
                    *pos = self.input.len() - it.as_str().len();
                    match result {
                        Ok(_) => {}
                        Err(e) => {
                            let err_pos = match &e {
                                LexError::UnexpectedEof => self.input.len(),
                                // Point at the start of the last character
                                // consumed before the error.
                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                            };
                            return Err(self.error(err_pos, e));
                        }
                    }
                }

                // Nothing else is considered part of a reserved token
                _ => break,
            }
        }
        let ret = &self.input[start..*pos];
        Ok(match (idchars, strings) {
            (0, 0) => unreachable!(),
            (0, 1) => (ReservedKind::String, ret),
            (_, 0) => (ReservedKind::Idchars, ret),
            // Pattern match `@"..."` and `$"..."` for string-based
            // identifiers and annotations.
            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
            _ => (ReservedKind::Reserved, ret),
        })
    }

    // Attempts to classify an idchars-only token `src` as an integer or float
    // literal, returning `None` when it's neither.
    fn classify_number(&self, src: &str) -> Option<TokenKind> {
        let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
            (Some(SignToken::Plus), stripped)
        } else if let Some(stripped) = src.strip_prefix('-') {
            (Some(SignToken::Minus), stripped)
        } else {
            (None, src)
        };

        let negative = sign == Some(SignToken::Minus);

        // Handle `inf` and `nan` which are special numbers here
        if num == "inf" {
            return Some(TokenKind::Float(FloatKind::Inf { negative }));
        } else if num == "nan" {
            return Some(TokenKind::Float(FloatKind::Nan { negative }));
        } else if let Some(stripped) = num.strip_prefix("nan:0x") {
            let mut it = stripped.as_bytes().iter();
            let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
            // Trailing non-hex characters mean this isn't a nan literal.
            if it.next().is_some() {
                return None;
            }
            return Some(TokenKind::Float(FloatKind::NanVal {
                negative,
                has_underscores,
            }));
        }

        // Figure out if we're a hex number or not
        let test_valid: fn(u8) -> bool;
        let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
            test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
            (stripped.as_bytes().iter(), true)
        } else {
            test_valid = |x: u8| char::from(x).is_ascii_digit();
            (num.as_bytes().iter(), false)
        };

        // Evaluate the first part, moving out all underscores
        let mut has_underscores = skip_underscores(&mut it, test_valid)?;

        match it.clone().next() {
            // If we're followed by something this may be a float so keep going.
            Some(_) => {}

            // Otherwise this is a valid integer literal!
            None => {
                return Some(TokenKind::Integer(IntegerKind {
                    has_underscores,
                    sign,
                    hex,
                }))
            }
        }

        // A number can optionally be after the dot so only actually try to
        // parse one if it's there.
        if it.clone().next() == Some(&b'.') {
            it.next();
            match it.clone().next() {
                Some(c) if test_valid(*c) => {
                    if skip_underscores(&mut it, test_valid)? {
                        has_underscores = true;
                    }
                }
                Some(_) | None => {}
            }
        };

        // Figure out if there's an exponential part here to make a float, and
        // if so parse it but defer its actual calculation until later.
        match (hex, it.next()) {
            (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
                match it.clone().next() {
                    Some(b'-') => {
                        it.next();
                    }
                    Some(b'+') => {
                        it.next();
                    }
                    _ => {}
                }
                // Exponent digits are always decimal, even for hex floats.
                if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
                    has_underscores = true;
                }
            }
            (_, None) => {}
            _ => return None,
        }

        // We should have eaten everything by now, if not then this is surely
        // not a float or integer literal.
        if it.next().is_some() {
            return None;
        }

        return Some(TokenKind::Float(FloatKind::Normal {
            has_underscores,
            hex,
        }));

        // Consumes a run of digits (per `good`) with optional single `_`
        // separators between them. Returns `Some(saw_underscore)` on success,
        // `None` if the run is empty, starts invalid, or ends with `_`.
        fn skip_underscores<'a>(
            it: &mut slice::Iter<'_, u8>,
            good: fn(u8) -> bool,
        ) -> Option<bool> {
            let mut last_underscore = false;
            let mut has_underscores = false;
            let first = *it.next()?;
            if !good(first) {
                return None;
            }
            while let Some(c) = it.clone().next() {
                if *c == b'_' && !last_underscore {
                    has_underscores = true;
                    it.next();
                    last_underscore = true;
                    continue;
                }
                if !good(*c) {
                    break;
                }
                last_underscore = false;
                it.next();
            }
            if last_underscore {
                return None;
            }
            Some(has_underscores)
        }
    }

    /// Verifies that `comment`, which is about to be returned, has a "confusing
    /// unicode character" in it and should instead be transformed into an
    /// error.
    fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
        if self.allow_confusing_unicode {
            return Ok(());
        }

        // In an effort to avoid utf-8 decoding the entire `comment` the search
        // here is a bit more optimized. This checks for the `0xe2` byte because
        // in the utf-8 encoding that's the leading encoding byte for all
        // "confusing characters". Each instance of 0xe2 is checked to see if it
        // starts a confusing character, and if so that's returned.
        //
        // Also note that 0xe2 will never be found in the middle of a codepoint,
        // it's always the start of a codepoint. This means that if our special
        // characters show up they're guaranteed to start with 0xe2 bytes.
        let bytes = comment.as_bytes();
        for pos in memchr::Memchr::new(0xe2, bytes) {
            if let Some(c) = comment[pos..].chars().next() {
                if is_confusing_unicode(c) {
                    // Note that `end` accounts for already having
                    // parsed `comment`, so we move backwards to where
                    // `comment` started and then add the index within
                    // `comment`.
                    let pos = end - comment.len() + pos;
                    return Err(self.error(pos, LexError::ConfusingUnicode(c)));
                }
            }
        }

        Ok(())
    }

    // Parses the remainder of a string literal after its opening `"`,
    // consuming up to and including the closing `"`. Returns the decoded
    // bytes, borrowed from the source when no escapes were present.
    fn parse_str(
        it: &mut str::Chars<'a>,
        allow_confusing_unicode: bool,
    ) -> Result<Cow<'a, [u8]>, LexError> {
        // `Start` means no escape has been seen yet, so the result can borrow
        // straight from the source; the first escape switches to `String`
        // which owns a decoded copy.
        enum State {
            Start,
            String(Vec<u8>),
        }
        let orig = it.as_str();
        let mut state = State::Start;
        loop {
            match it.next().ok_or(LexError::UnexpectedEof)? {
                '"' => break,
                '\\' => {
                    match state {
                        State::String(_) => {}
                        State::Start => {
                            // First escape: copy everything lexed so far into
                            // an owned buffer.
                            let pos = orig.len() - it.as_str().len() - 1;
                            state = State::String(orig[..pos].as_bytes().to_vec());
                        }
                    }
                    let buf = match &mut state {
                        State::String(b) => b,
                        State::Start => unreachable!(),
                    };
                    match it.next().ok_or(LexError::UnexpectedEof)? {
                        '"' => buf.push(b'"'),
                        '\'' => buf.push(b'\''),
                        't' => buf.push(b'\t'),
                        'n' => buf.push(b'\n'),
                        'r' => buf.push(b'\r'),
                        '\\' => buf.push(b'\\'),
                        'u' => {
                            Lexer::must_eat_char(it, '{')?;
                            let n = Lexer::hexnum(it)?;
                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                            Lexer::must_eat_char(it, '}')?;
                        }
                        // `\XY` hex escape pushes the raw byte, which may not
                        // be valid utf-8.
                        c1 if c1.is_ascii_hexdigit() => {
                            let c2 = Lexer::hexdigit(it)?;
                            buf.push(to_hex(c1) * 16 + c2);
                        }
                        c => return Err(LexError::InvalidStringEscape(c)),
                    }
                }
                // Control characters are not allowed unescaped in strings.
                c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                    return Err(LexError::InvalidStringElement(c))
                }
                c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                    return Err(LexError::ConfusingUnicode(c))
                }
                c => match &mut state {
                    State::Start => {}
                    State::String(v) => {
                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                    }
                },
            }
        }
        match state {
            // No escapes: borrow the span between the quotes.
            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
            State::String(s) => Ok(s.into()),
        }
    }

    /// Parses an id-or-string-based name from `it`.
    ///
    /// Note that `it` should already have been lexed and this is just
    /// extracting the value. If the token lexed was `@a` then this should point
    /// to `a`.
    ///
    /// This will automatically detect quoted syntax such as `@"..."` and the
    /// byte string will be parsed and validated as utf-8.
    ///
    /// # Errors
    ///
    /// Returns an error if a quoted byte string is found and contains invalid
    /// utf-8.
    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
        if it.clone().next() == Some('"') {
            it.next();
            match Lexer::parse_str(it, true)? {
                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
                    Ok(s) => Ok(Cow::Borrowed(s)),
                    Err(e) => Err(LexError::InvalidUtf8Id(e)),
                },
                Cow::Owned(bytes) => match String::from_utf8(bytes) {
                    Ok(s) => Ok(Cow::Owned(s)),
                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
                },
            }
        } else {
            Ok(Cow::Borrowed(it.as_str()))
        }
    }

    // Parses a hex number (one or more hex digits with optional `_`
    // separators) into a `u32`, as used by `\u{...}` escapes.
    fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
        let n = Lexer::hexdigit(it)?;
        let mut last_underscore = false;
        let mut n = n as u32;
        while let Some(c) = it.clone().next() {
            if c == '_' {
                it.next();
                last_underscore = true;
                continue;
            }
            if !c.is_ascii_hexdigit() {
                break;
            }
            last_underscore = false;
            it.next();
            n = n
                .checked_mul(16)
                .and_then(|n| n.checked_add(to_hex(c) as u32))
                .ok_or(LexError::NumberTooBig)?;
        }
        if last_underscore {
            return Err(LexError::LoneUnderscore);
        }
        Ok(n)
    }

    /// Reads a hexadecimal digit from the input stream, returning where it's
    /// defined and the hex value. Returns an error on EOF or an invalid hex
    /// digit.
    fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
        let ch = Lexer::must_char(it)?;
        if ch.is_ascii_hexdigit() {
            Ok(to_hex(ch))
        } else {
            Err(LexError::InvalidHexDigit(ch))
        }
    }

    /// Reads the next character from the input string and where it's located,
    /// returning an error if the input stream is empty.
    fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
        it.next().ok_or(LexError::UnexpectedEof)
    }

    /// Expects that a specific character must be read next
    fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
        let found = Lexer::must_char(it)?;
        if wanted == found {
            Ok(())
        } else {
            Err(LexError::Expected { wanted, found })
        }
    }

    /// Creates an error at `pos` with the specified `kind`
    fn error(&self, pos: usize, kind: LexError) -> Error {
        Error::lex(Span { offset: pos }, self.input, kind)
    }

    /// Returns an iterator over all tokens in the original source string
    /// starting at the `pos` specified.
    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
        std::iter::from_fn(move || self.parse(&mut pos).transpose())
    }

    /// Returns whether an annotation is present at `pos`. If it is present then
    /// `Ok(Some(token))` is returned corresponding to the token, otherwise
    /// `Ok(None)` is returned. If the next token cannot be parsed then an error
    /// is returned.
    pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {
        let bytes = self.input.as_bytes();
        // Quickly reject anything that for sure isn't an annotation since this
        // method is used every time an lparen is parsed.
        if bytes.get(pos) != Some(&b'@') {
            return Ok(None);
        }
        match self.parse(&mut pos)? {
            Some(token) => match token.kind {
                TokenKind::Annotation => Ok(Some(token)),
                _ => Ok(None),
            },
            None => Ok(None),
        }
    }
}
968
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // `len` is stored as a `u32` to keep `Token` small, so widen it back
        // to `usize` before slicing the source.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Consume the `$` sigil, which is expected to always be present for
        // id tokens (debug-asserted below).
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `@"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Consume the `@` sigil, which is expected to always be present for
        // annotation tokens (debug-asserted below).
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote before handing the rest off to `parse_str`
        // for escape resolution.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Skip a leading sign character unless the token starts
                // directly with `nan`.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                // Underscore separators are purely cosmetic; strip them so
                // the payload is bare hex digits.
                if has_underscores {
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split the token into integral/fractional/exponent parts.
                // The exponent marker is `p`/`P` for hex floats and `e`/`E`
                // for decimal floats.
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // A redundant leading `+` is dropped; a `-` is kept as part
                // of the digits.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                // An empty fractional part (e.g. `1.`) is reported as `None`.
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                // Underscore separators are purely cosmetic; strip them from
                // every component when present.
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                // Drop the `0x` marker so the integral part is bare hex
                // digits (a leading `-`, if any, is preserved).
                if hex {
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A redundant leading `+` is stripped; a leading `-` stays in `val`
        // so it can be parsed as a signed value.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        // Underscore separators are purely cosmetic; strip them so the
        // digits can be parsed directly.
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        // Drop the `0x` marker so only the digits (and any sign) remain.
        if kind.hex {
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    /// Wraps `err` with this token's offset to produce a span-annotated
    /// `Error` for reporting.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
1177
1178impl<'a> Integer<'a> {
1179 /// Returns the sign token for this integer.
1180 pub fn sign(&self) -> Option<SignToken> {
1181 self.sign
1182 }
1183
1184 /// Returns the value string that can be parsed for this integer, as well
1185 /// as the base that it should be parsed in
1186 pub fn val(&self) -> (&str, u32) {
1187 (&self.val, if self.hex { 16 } else { 10 })
1188 }
1189}
1190
/// Converts a single hex-digit character into its numeric value (0..=15).
///
/// Callers are expected to pass only `[0-9a-fA-F]`; other characters yield
/// meaningless values.
fn to_hex(c: char) -> u8 {
    let b = c as u8;
    match b {
        b'a'..=b'f' => b - b'a' + 10,
        b'A'..=b'F' => b - b'A' + 10,
        _ => b - b'0',
    }
}
1198
1199impl fmt::Display for LexError {
1200 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1201 use LexError::*;
1202 match self {
1203 DanglingBlockComment => f.write_str("unterminated block comment")?,
1204 Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
1205 InvalidStringElement(c) => {
1206 write!(f, "invalid character in string '{}'", escape_char(*c))?
1207 }
1208 InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
1209 InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
1210 InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
1211 Expected { wanted, found } => write!(
1212 f,
1213 "expected '{}' but found '{}'",
1214 escape_char(*wanted),
1215 escape_char(*found)
1216 )?,
1217 UnexpectedEof => write!(f, "unexpected end-of-file")?,
1218 NumberTooBig => f.write_str("number is too big to parse")?,
1219 InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
1220 LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
1221 ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
1222 InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,
1223 EmptyId => write!(f, "empty identifier")?,
1224 EmptyAnnotation => write!(f, "empty annotation id")?,
1225 }
1226 Ok(())
1227 }
1228}
1229
/// Renders `c` as a short, human-readable string for error messages.
///
/// Common control characters get their backslash escape, printable ASCII is
/// passed through verbatim, and anything else is rendered with Rust's
/// `\u{...}` escape syntax.
fn escape_char(c: char) -> String {
    match c {
        '\t' => "\\t".to_string(),
        '\r' => "\\r".to_string(),
        '\n' => "\\n".to_string(),
        '\\' => "\\\\".to_string(),
        '\'' => "\\'".to_string(),
        '\"' => "\"".to_string(),
        '\x20'..='\x7e' => c.to_string(),
        other => other.escape_unicode().to_string(),
    }
}
1242
/// This is an attempt to protect against the "trojan source" [1] problem
/// where unicode characters can cause editors to render source code
/// differently for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional override/embedding/isolate controls plus related
    // invisible formatting characters (U+202A-202B, U+202D-202E,
    // U+2066-2069, U+206C).
    matches!(
        ch,
        '\u{202a}'..='\u{202b}'
            | '\u{202d}'..='\u{202e}'
            | '\u{2066}'..='\u{2069}'
            | '\u{206c}'
    )
}
1265
#[cfg(test)]
mod tests {
    use super::*;

    // Each test below lexes a single token from a small input via
    // `get_token` and asserts on the token's kind and decoded contents.

    #[test]
    fn ws_smoke() {
        // Lexes one token and returns its source text, panicking if it
        // isn't whitespace.
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" \n "), " \n ");
        assert_eq!(get_whitespace(" x"), " ");
        assert_eq!(get_whitespace(" ;"), " ");
    }

    #[test]
    fn line_comment_smoke() {
        // Lexes one token and returns its source text, panicking if it
        // isn't a `;;` line comment.
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
        assert_eq!(get_line_comment(";; \rabc"), ";; ");
        assert_eq!(get_line_comment(";; \r\nabc"), ";; ");
    }

    #[test]
    fn block_comment_smoke() {
        // Lexes one token and returns its source text, panicking if it
        // isn't a `(; ... ;)` block comment (which may nest).
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    // Shared helper: lexes the first token of `input`, panicking on lex
    // errors or empty input.
    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }

    #[test]
    fn strings() {
        // Lexes one token and returns its decoded byte contents, panicking
        // if it isn't a string literal.
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every two-digit hex escape decodes to exactly that byte value.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        // Lexes one token and returns its decoded identifier contents
        // (without the `$`), panicking if it isn't an id.
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }

    #[test]
    fn annotation() {
        // Lexes one token and returns its decoded annotation contents
        // (without the `@`), panicking if it isn't an annotation.
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {:?}", other),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }

    #[test]
    fn keyword() {
        // Lexes one token and returns its source text, panicking if it
        // isn't a keyword.
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    #[test]
    fn reserved() {
        // Lexes one token and returns its source text, panicking if it
        // isn't a reserved token.
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        // Lexes one token and returns its normalized digit string (sign
        // kept for `-`, `+`/underscores/`0x` stripped), panicking if it
        // isn't an integer.
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        // Lexes one token and returns its decomposed float description,
        // panicking if it isn't a float.
        fn get_float(input: &str) -> Float<'_> {
            let token = get_token(input);
            match token.kind {
                TokenKind::Float(f) => token.float(input, f),
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            Float::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            Float::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            Float::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            Float::Nan {
                val: Some("1".into()),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            Float::Nan {
                val: Some("7fffff".into()),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            Float::Val {
                integral: "-12".into(),
                fractional: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("1."),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}
1574