1 | //! Definition of a lexer for the WebAssembly text format. |
2 | //! |
//! This module provides a [`Lexer`][] type which is an iterator over the raw
4 | //! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single |
//! byte in a WebAssembly text file, returning tokens even for comments and
6 | //! whitespace. Typically you'll ignore comments and whitespace, however. |
7 | //! |
8 | //! If you'd like to iterate over the tokens in a file you can do so via: |
9 | //! |
10 | //! ``` |
11 | //! # fn foo() -> Result<(), wast::Error> { |
12 | //! use wast::lexer::Lexer; |
13 | //! |
14 | //! let wat = "(module (func $foo))" ; |
15 | //! for token in Lexer::new(wat).iter(0) { |
16 | //! println!("{:?}" , token?); |
17 | //! } |
18 | //! # Ok(()) |
19 | //! # } |
20 | //! ``` |
21 | //! |
22 | //! Note that you'll typically not use this module but will rather use |
23 | //! [`ParseBuffer`](crate::parser::ParseBuffer) instead. |
24 | //! |
25 | //! [`Lexer`]: crate::lexer::Lexer |
26 | |
27 | use crate::token::Span; |
28 | use crate::Error; |
29 | use std::borrow::Cow; |
30 | use std::char; |
31 | use std::fmt; |
32 | use std::slice; |
33 | use std::str; |
34 | use std::str::Utf8Error; |
35 | |
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    /// The entire source text being lexed; tokens refer back into this via
    /// byte offsets and lengths.
    input: &'a str,
    /// Whether "confusing" (trojan-source-style) unicode codepoints are
    /// permitted in comments and string literals. Denied by default; see
    /// [`Lexer::allow_confusing_unicode`].
    allow_confusing_unicode: bool,
}
46 | |
/// A single token parsed from a `Lexer`.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The kind of token this represents, such as whether it's whitespace, a
    /// keyword, etc.
    pub kind: TokenKind,
    /// The byte offset within the original source for where this token came
    /// from.
    pub offset: usize,
    /// The byte length of this token as it resides in the original source.
    //
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
    // This does limit a single token to being at most 4G large, but that seems
    // probably ok. (`token_is_not_too_big` below asserts the size bound.)
    pub len: u32,
}
63 | |
#[test]
fn token_is_not_too_big() {
    // `Token` is documented to pack into the size of two 64-bit words; keep
    // that invariant checked at test time.
    let limit = 2 * std::mem::size_of::<u64>();
    let actual = std::mem::size_of::<Token>();
    assert!(actual <= limit);
}
68 | |
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
///
/// Note that variants carry no source text; the text of a token is recovered
/// from its [`Token`] `offset`/`len` into the original input.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A line comment, preceded with `;;`
    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested.
    BlockComment,

    /// A fragment of source that represents whitespace.
    Whitespace,

    /// A left-parenthesis.
    LParen,
    /// A right-parenthesis.
    RParen,

    /// A string literal, which is actually a list of bytes.
    String,

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$`.
    Id,

    /// A keyword, or something that starts with an alphabetic character.
    Keyword,

    /// An annotation (like `@foo`).
    ///
    /// All annotations start with `@`.
    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved,

    /// An integer.
    Integer(IntegerKind),

    /// A float.
    Float(FloatKind),
}
120 | |
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    /// The explicit leading sign the literal was written with, if any.
    sign: Option<SignToken>,
    /// Whether `_` separators appeared between digits and will need to be
    /// stripped before the value can be numerically parsed.
    has_underscores: bool,
    /// Whether the literal was written with a `0x` hexadecimal prefix.
    hex: bool,
}
128 | |
/// Description of a parsed float from the source.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    /// An `inf` literal, optionally negated.
    #[doc(hidden)]
    Inf { negative: bool },
    /// A `nan` literal with no explicit payload, optionally negated.
    #[doc(hidden)]
    Nan { negative: bool },
    /// A `nan:0x...` literal carrying an explicit hex payload.
    #[doc(hidden)]
    NanVal {
        negative: bool,
        has_underscores: bool,
    },
    /// An ordinary decimal or hexadecimal (`0x`-prefixed) float literal.
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
145 | |
/// Classification of what `Lexer::parse_reserved` consumed, refined into a
/// final `TokenKind` by `parse_kind`.
enum ReservedKind {
    /// "..."
    String,
    /// anything that's just a sequence of `idchars!()`
    Idchars,
    /// $"..."
    IdString,
    /// @"..."
    AnnotationString,
    /// everything else (a conglomeration of strings, idchars, etc)
    Reserved,
}
158 | |
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals)
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found
        wanted: char,
        /// The character that was actually found
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as
    /// `$"\ff"`.
    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.
    EmptyId,

    /// An empty identifier was found, or a lone `@`.
    EmptyAnnotation,
}
228 | |
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// Plus sign: `+`
    Plus,
    /// Minus sign: `-`
    Minus,
}
237 | |
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    /// The sign the literal was written with, if any.
    sign: Option<SignToken>,
    /// The digit text of the literal; `Cow` so it can borrow from the source
    /// or be owned (presumably when `_` separators had to be stripped —
    /// confirm against the code that constructs this type).
    val: Cow<'a, str>,
    /// Whether `val` holds hexadecimal digits (a `0x`-prefixed literal).
    hex: bool,
}
246 | |
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A float `NaN` representation
    Nan {
        /// The specific bits to encode for this float, optionally
        val: Option<Cow<'a, str>>,
        /// Whether or not this is a negative `NaN` or not.
        negative: bool,
    },
    /// A float infinity representation
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value
    Val {
        /// Whether or not the `integral` and `fractional` are specified in hex
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        fractional: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.fractional` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`
        exponent: Option<Cow<'a, str>>,
    },
}
276 | |
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
//
// Expands to a `match` pattern (a big `|`-ed alternation of bytes) covering
// every byte that may appear in an `idchar`, usable directly inside byte-level
// `match` arms.
macro_rules! idchars {
    () => {
        b'0'..=b'9'
        | b'A'..=b'Z'
        | b'a'..=b'z'
        | b'!'
        | b'#'
        | b'$'
        | b'%'
        | b'&'
        | b'\''
        | b'*'
        | b'+'
        | b'-'
        | b'.'
        | b'/'
        | b':'
        | b'<'
        | b'='
        | b'>'
        | b'?'
        | b'@'
        | b'\\'
        | b'^'
        | b'_'
        | b'`'
        | b'|'
        | b'~'
    }
}
308 | |
309 | impl<'a> Lexer<'a> { |
310 | /// Creates a new lexer which will lex the `input` source string. |
311 | pub fn new(input: &str) -> Lexer<'_> { |
312 | Lexer { |
313 | input, |
314 | allow_confusing_unicode: false, |
315 | } |
316 | } |
317 | |
    /// Returns the original source input that we're lexing.
    ///
    /// The returned borrow carries the full `'a` lifetime of the source, not
    /// the lifetime of `self`.
    pub fn input(&self) -> &'a str {
        self.input
    }
322 | |
    /// Configures whether "confusing" unicode characters are allowed while
    /// lexing.
    ///
    /// If allowed then no error will happen if these characters are found, but
    /// otherwise if disallowed a lex error will be produced when these
    /// characters are found. Confusing characters are denied by default.
    ///
    /// For now "confusing characters" are primarily related to the "trojan
    /// source" problem where it refers to characters which cause humans to read
    /// text differently than this lexer, such as characters that alter the
    /// left-to-right display of the source code.
    ///
    /// Returns `&mut self` so configuration calls can be chained.
    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
        self.allow_confusing_unicode = allow;
        self
    }
338 | |
339 | /// Lexes the next at the byte position `pos` in the input. |
340 | /// |
341 | /// Returns `Some` if a token is found or `None` if we're at EOF. |
342 | /// |
343 | /// The `pos` argument will be updated to point to the next token on a |
344 | /// successful parse. |
345 | /// |
346 | /// # Errors |
347 | /// |
348 | /// Returns an error if the input is malformed. |
349 | pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> { |
350 | let offset = *pos; |
351 | Ok(match self.parse_kind(pos)? { |
352 | Some(kind) => Some(Token { |
353 | kind, |
354 | offset, |
355 | len: (*pos - offset).try_into().unwrap(), |
356 | }), |
357 | None => None, |
358 | }) |
359 | } |
360 | |
    /// Lexes a single token starting at `*pos`, returning its `TokenKind` and
    /// advancing `*pos` past the consumed text (or `Ok(None)` at EOF).
    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
        let start = *pos;
        // This `match` generally parses the grammar specified at
        //
        // https://webassembly.github.io/spec/core/text/lexical.html#text-token
        let remaining = &self.input.as_bytes()[start..];
        let byte = match remaining.first() {
            Some(b) => b,
            None => return Ok(None),
        };

        match byte {
            // Open-parens check the next character to see if this is the start
            // of a block comment, otherwise it's just a bland left-paren
            // token.
            b'(' => match remaining.get(1) {
                Some(b';') => {
                    // Block comments nest, so track the nesting depth.
                    let mut level = 1;
                    // Note that we're doing a byte-level search here for the
                    // close-delimiter of `;)`. The actual source text is utf-8
                    // encode in `remaining` but due to how utf-8 works we
                    // can safely search for an ASCII byte since it'll never
                    // otherwise appear in the middle of a codepoint and if we
                    // find it then it's guaranteed to be the right byte.
                    //
                    // Mainly we're avoiding the overhead of decoding utf-8
                    // characters into a Rust `char` since it's otherwise
                    // unnecessary work.
                    let mut iter = remaining[2..].iter();
                    while let Some(ch) = iter.next() {
                        match ch {
                            b'(' => {
                                if let Some(b';') = iter.as_slice().first() {
                                    level += 1;
                                    iter.next();
                                }
                            }
                            b';' => {
                                if let Some(b')') = iter.as_slice().first() {
                                    level -= 1;
                                    iter.next();
                                    if level == 0 {
                                        // Length consumed = total bytes minus
                                        // what the iterator has left.
                                        let len = remaining.len() - iter.as_slice().len();
                                        let comment = &self.input[start..][..len];
                                        *pos += len;
                                        self.check_confusing_comment(*pos, comment)?;
                                        return Ok(Some(TokenKind::BlockComment));
                                    }
                                }
                            }
                            _ => {}
                        }
                    }
                    // Ran out of input before `level` reached zero.
                    Err(self.error(start, LexError::DanglingBlockComment))
                }
                _ => {
                    *pos += 1;

                    Ok(Some(TokenKind::LParen))
                }
            },

            b')' => {
                *pos += 1;
                Ok(Some(TokenKind::RParen))
            }

            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
            b' ' | b'\n' | b'\r' | b'\t' => {
                self.skip_ws(pos);
                Ok(Some(TokenKind::Whitespace))
            }

            c @ (idchars!() | b'"') => {
                let (kind, src) = self.parse_reserved(pos)?;
                match kind {
                    // If the reserved token was simply a single string then
                    // that is converted to a standalone string token
                    ReservedKind::String => return Ok(Some(TokenKind::String)),

                    // If only idchars were consumed then this could be a
                    // specific kind of standalone token we're interested in.
                    ReservedKind::Idchars => {
                        // https://webassembly.github.io/spec/core/text/values.html#integers
                        if let Some(ret) = self.classify_number(src) {
                            return Ok(Some(ret));
                        // https://webassembly.github.io/spec/core/text/values.html#text-id
                        } else if *c == b'$' {
                            return Ok(Some(TokenKind::Id));
                        // part of the WebAssembly/annotations proposal
                        // (no online url yet)
                        } else if *c == b'@' {
                            return Ok(Some(TokenKind::Annotation));
                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
                        } else if b'a' <= *c && *c <= b'z' {
                            return Ok(Some(TokenKind::Keyword));
                        }
                    }

                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                    // ... otherwise this was a conglomeration of idchars,
                    // strings, or just idchars that don't match a prior rule,
                    // meaning this falls through to the fallback `Reserved`
                    // token.
                    ReservedKind::Reserved => {}
                }

                Ok(Some(TokenKind::Reserved))
            }

            // This could be a line comment, otherwise `;` is a reserved token.
            // The second byte is checked to see if it's a `;;` line comment
            //
            // Note that this character being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b';' => match remaining.get(1) {
                Some(b';') => {
                    let remaining = &self.input[*pos..];
                    // The comment extends to the next `\n`/`\r`, or EOF.
                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
                        .unwrap_or(remaining.len());
                    *pos += byte_pos;
                    let comment = &remaining[..byte_pos];
                    self.check_confusing_comment(*pos, comment)?;
                    Ok(Some(TokenKind::LineComment))
                }
                _ => {
                    *pos += 1;
                    Ok(Some(TokenKind::Reserved))
                }
            },

            // Other known reserved tokens other than `;`
            //
            // Note that these characters being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b',' | b'[' | b']' | b'{' | b'}' => {
                *pos += 1;
                Ok(Some(TokenKind::Reserved))
            }

            _ => {
                // Decode the full codepoint for the error message.
                let ch = self.input[start..].chars().next().unwrap();
                Err(self.error(*pos, LexError::Unexpected(ch)))
            }
        }
    }
509 | |
    /// Advances `*pos` past a maximal run of whitespace bytes.
    fn skip_ws(&self, pos: &mut usize) {
        // This table is a byte lookup table to determine whether a byte is a
        // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
        // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
        // have a '1' in the table below.
        //
        // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
        // known that if these bytes are found they're guaranteed to be the
        // whitespace byte, so they can be safely skipped and we don't have to
        // do full utf-8 decoding. This means that the goal of this function is
        // to find the first non-whitespace byte in `remaining`.
        //
        // For now this lookup table seems to be the fastest, but projects like
        // https://github.com/lemire/despacer show other simd algorithms which
        // can possibly accelerate this even more. Note that `*.wat` files often
        // have a lot of whitespace so this function is typically quite hot when
        // parsing inputs.
        #[rustfmt::skip]
        const WS: [u8; 256] = [
            //                                   \t \n       \r
            /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
            /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            // ' '
            /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let remaining = &self.input[*pos..];
        // Find the index of the first byte whose table entry is not 1; if all
        // remaining bytes are whitespace, skip to the end of the input.
        let non_ws_pos = remaining
            .as_bytes()
            .iter()
            .position(|b| WS[*b as usize] != 1)
            .unwrap_or(remaining.len());
        *pos += non_ws_pos;
    }
556 | |
    /// Splits off a "reserved" token which is then further processed later on
    /// to figure out which kind of token it is depending on `ReservedKind`.
    ///
    /// For more information on this method see the clarification at
    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
    /// that this is parsing the grammar:
    ///
    /// ```text
    /// reserved := (idchar | string)+
    /// ```
    ///
    /// which means that it is eating any number of adjacent string/idchar
    /// tokens (e.g. `a"b"c`) and returning the classification of what was
    /// eaten. The classification assists in determining what the actual token
    /// here eaten looks like.
    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
        // Counts of each sub-production consumed; the final (idchars, strings)
        // pair drives the classification at the bottom.
        let mut idchars = 0u32;
        let mut strings = 0u32;
        let start = *pos;
        while let Some(byte) = self.input.as_bytes().get(*pos) {
            match byte {
                // Normal `idchars` production which appends to the reserved
                // token that's being produced.
                idchars!() => {
                    idchars += 1;
                    *pos += 1;
                }

                // https://webassembly.github.io/spec/core/text/values.html#text-string
                b'"' => {
                    strings += 1;
                    *pos += 1;
                    let mut it = self.input[*pos..].chars();
                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                    // `parse_str` advanced `it`; recompute `pos` from how much
                    // of the input the iterator has left.
                    *pos = self.input.len() - it.as_str().len();
                    match result {
                        Ok(_) => {}
                        Err(e) => {
                            let err_pos = match &e {
                                LexError::UnexpectedEof => self.input.len(),
                                // Report the error at the last character that
                                // was consumed.
                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                            };
                            return Err(self.error(err_pos, e));
                        }
                    }
                }

                // Nothing else is considered part of a reserved token
                _ => break,
            }
        }
        let ret = &self.input[start..*pos];
        Ok(match (idchars, strings) {
            // The caller only invokes this after matching an idchar or `"`,
            // so at least one of the counters is nonzero.
            (0, 0) => unreachable!(),
            (0, 1) => (ReservedKind::String, ret),
            (_, 0) => (ReservedKind::Idchars, ret),
            // Pattern match `@"..."` and `$"..."` for string-based
            // identifiers and annotations.
            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
            _ => (ReservedKind::Reserved, ret),
        })
    }
620 | |
    /// Classifies an idchars-only token `src` as an integer or float literal,
    /// returning `None` if it is neither.
    fn classify_number(&self, src: &str) -> Option<TokenKind> {
        // Split off an optional leading sign.
        let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
            (Some(SignToken::Plus), stripped)
        } else if let Some(stripped) = src.strip_prefix('-') {
            (Some(SignToken::Minus), stripped)
        } else {
            (None, src)
        };

        let negative = sign == Some(SignToken::Minus);

        // Handle `inf` and `nan` which are special numbers here
        if num == "inf" {
            return Some(TokenKind::Float(FloatKind::Inf { negative }));
        } else if num == "nan" {
            return Some(TokenKind::Float(FloatKind::Nan { negative }));
        } else if let Some(stripped) = num.strip_prefix("nan:0x") {
            let mut it = stripped.as_bytes().iter();
            let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
            // The payload must consist solely of hex digits and underscores.
            if it.next().is_some() {
                return None;
            }
            return Some(TokenKind::Float(FloatKind::NanVal {
                negative,
                has_underscores,
            }));
        }

        // Figure out if we're a hex number or not
        let test_valid: fn(u8) -> bool;
        let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
            test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
            (stripped.as_bytes().iter(), true)
        } else {
            test_valid = |x: u8| char::from(x).is_ascii_digit();
            (num.as_bytes().iter(), false)
        };

        // Evaluate the first part, moving out all underscores
        let mut has_underscores = skip_underscores(&mut it, test_valid)?;

        match it.clone().next() {
            // If we're followed by something this may be a float so keep going.
            Some(_) => {}

            // Otherwise this is a valid integer literal!
            None => {
                return Some(TokenKind::Integer(IntegerKind {
                    has_underscores,
                    sign,
                    hex,
                }))
            }
        }

        // A number can optionally be after the dot so only actually try to
        // parse one if it's there.
        if it.clone().next() == Some(&b'.') {
            it.next();
            match it.clone().next() {
                Some(c) if test_valid(*c) => {
                    if skip_underscores(&mut it, test_valid)? {
                        has_underscores = true;
                    }
                }
                Some(_) | None => {}
            }
        };

        // Figure out if there's an exponential part here to make a float, and
        // if so parse it but defer its actual calculation until later.
        match (hex, it.next()) {
            (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
                // Optional sign on the exponent.
                match it.clone().next() {
                    Some(b'-') => {
                        it.next();
                    }
                    Some(b'+') => {
                        it.next();
                    }
                    _ => {}
                }
                // NB: exponent digits are decimal even for hex floats.
                if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
                    has_underscores = true;
                }
            }
            (_, None) => {}
            _ => return None,
        }

        // We should have eaten everything by now, if not then this is surely
        // not a float or integer literal.
        if it.next().is_some() {
            return None;
        }

        return Some(TokenKind::Float(FloatKind::Normal {
            has_underscores,
            hex,
        }));

        // Consumes a run of digits (as judged by `good`), allowing single `_`
        // separators between them. Returns whether any underscores were seen,
        // or `None` if the run is empty, doesn't start with a digit, or ends
        // with an underscore.
        fn skip_underscores<'a>(
            it: &mut slice::Iter<'_, u8>,
            good: fn(u8) -> bool,
        ) -> Option<bool> {
            let mut last_underscore = false;
            let mut has_underscores = false;
            // At least one leading digit is required.
            let first = *it.next()?;
            if !good(first) {
                return None;
            }
            while let Some(c) = it.clone().next() {
                if *c == b'_' && !last_underscore {
                    has_underscores = true;
                    it.next();
                    last_underscore = true;
                    continue;
                }
                if !good(*c) {
                    break;
                }
                last_underscore = false;
                it.next();
            }
            if last_underscore {
                return None;
            }
            Some(has_underscores)
        }
    }
751 | |
    /// Checks whether `comment`, which is about to be returned, contains a
    /// "confusing unicode character"; if so the comment is instead transformed
    /// into an error.
    fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
        if self.allow_confusing_unicode {
            return Ok(());
        }

        // In an effort to avoid utf-8 decoding the entire `comment` the search
        // here is a bit more optimized. This checks for the `0xe2` byte because
        // in the utf-8 encoding that's the leading encoding byte for all
        // "confusing characters". Each instance of 0xe2 is checked to see if it
        // starts a confusing character, and if so that's returned.
        //
        // Also note that 0xe2 will never be found in the middle of a codepoint,
        // it's always the start of a codepoint. This means that if our special
        // characters show up they're guaranteed to start with 0xe2 bytes.
        let bytes = comment.as_bytes();
        for pos in memchr::Memchr::new(0xe2, bytes) {
            if let Some(c) = comment[pos..].chars().next() {
                if is_confusing_unicode(c) {
                    // Note that `end` accounts for `comment` already having
                    // been parsed, so we move backwards to where `comment`
                    // started and then add the index within `comment`.
                    let pos = end - comment.len() + pos;
                    return Err(self.error(pos, LexError::ConfusingUnicode(c)));
                }
            }
        }

        Ok(())
    }
785 | |
    /// Parses the body of a string literal from `it`, assuming the opening
    /// `"` has already been consumed. On success `it` is left just past the
    /// closing `"`.
    ///
    /// Returns the decoded bytes: a borrow of the original source when no
    /// escapes were present, or an owned buffer once an escape forces a copy.
    fn parse_str(
        it: &mut str::Chars<'a>,
        allow_confusing_unicode: bool,
    ) -> Result<Cow<'a, [u8]>, LexError> {
        // `Start` = no escape seen yet, result can borrow from the source;
        // `String` = an escape happened, bytes are accumulated in an owned
        // buffer.
        enum State {
            Start,
            String(Vec<u8>),
        }
        let orig = it.as_str();
        let mut state = State::Start;
        loop {
            match it.next().ok_or(LexError::UnexpectedEof)? {
                '"' => break,
                '\\' => {
                    match state {
                        State::String(_) => {}
                        State::Start => {
                            // First escape: copy everything before the
                            // backslash into an owned buffer.
                            let pos = orig.len() - it.as_str().len() - 1;
                            state = State::String(orig[..pos].as_bytes().to_vec());
                        }
                    }
                    let buf = match &mut state {
                        State::String(b) => b,
                        State::Start => unreachable!(),
                    };
                    match it.next().ok_or(LexError::UnexpectedEof)? {
                        '"' => buf.push(b'"'),
                        '\'' => buf.push(b'\''),
                        't' => buf.push(b'\t'),
                        'n' => buf.push(b'\n'),
                        'r' => buf.push(b'\r'),
                        '\\' => buf.push(b'\\'),
                        'u' => {
                            // `\u{...}` escapes a unicode scalar, re-encoded
                            // as utf-8 into the output bytes.
                            Lexer::must_eat_char(it, '{')?;
                            let n = Lexer::hexnum(it)?;
                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                            Lexer::must_eat_char(it, '}')?;
                        }
                        c1 if c1.is_ascii_hexdigit() => {
                            // `\XY` encodes one raw byte from two hex digits.
                            let c2 = Lexer::hexdigit(it)?;
                            buf.push(to_hex(c1) * 16 + c2);
                        }
                        c => return Err(LexError::InvalidStringEscape(c)),
                    }
                }
                // Bare control characters (and DEL, 0x7f) are rejected.
                c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                    return Err(LexError::InvalidStringElement(c))
                }
                c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                    return Err(LexError::ConfusingUnicode(c))
                }
                c => match &mut state {
                    // Still borrowing: the character stays part of `orig`.
                    State::Start => {}
                    State::String(v) => {
                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                    }
                },
            }
        }
        match state {
            // No escapes: slice out everything up to the closing quote.
            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
            State::String(s) => Ok(s.into()),
        }
    }
851 | |
    /// Parses an id-or-string-based name from `it`.
    ///
    /// Note that `it` should already have been lexed and this is just
    /// extracting the value. If the token lexed was `@a` then this should point
    /// to `a`.
    ///
    /// This will automatically detect quoted syntax such as `@"..."` and the
    /// byte string will be parsed and validated as utf-8.
    ///
    /// # Errors
    ///
    /// Returns an error if a quoted byte string is found and contains invalid
    /// utf-8.
    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
        if it.clone().next() == Some('"') {
            it.next();
            // Confusing unicode is allowed here (`true`) since the token was
            // already validated when it was originally lexed.
            match Lexer::parse_str(it, true)? {
                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
                    Ok(s) => Ok(Cow::Borrowed(s)),
                    Err(e) => Err(LexError::InvalidUtf8Id(e)),
                },
                Cow::Owned(bytes) => match String::from_utf8(bytes) {
                    Ok(s) => Ok(Cow::Owned(s)),
                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
                },
            }
        } else {
            // Unquoted name: the remainder of the token is the name itself.
            Ok(Cow::Borrowed(it.as_str()))
        }
    }
882 | |
    /// Parses a hexadecimal number (e.g. the interior of a `\u{...}` escape)
    /// from `it` into a `u32`, allowing `_` separators between digits.
    ///
    /// # Errors
    ///
    /// Returns an error on EOF, on overflow of `u32`, or when the number ends
    /// with an underscore.
    fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
        // At least one leading hex digit is required.
        let n = Lexer::hexdigit(it)?;
        let mut last_underscore = false;
        let mut n = n as u32;
        while let Some(c) = it.clone().next() {
            if c == '_' {
                // NOTE(review): unlike `skip_underscores` in
                // `classify_number`, this accepts consecutive underscores
                // (e.g. `1__2`) — confirm whether that's intended.
                it.next();
                last_underscore = true;
                continue;
            }
            if !c.is_ascii_hexdigit() {
                break;
            }
            last_underscore = false;
            it.next();
            // Accumulate with overflow checking.
            n = n
                .checked_mul(16)
                .and_then(|n| n.checked_add(to_hex(c) as u32))
                .ok_or(LexError::NumberTooBig)?;
        }
        if last_underscore {
            return Err(LexError::LoneUnderscore);
        }
        Ok(n)
    }
908 | |
909 | /// Reads a hexidecimal digit from the input stream, returning where it's |
910 | /// defined and the hex value. Returns an error on EOF or an invalid hex |
911 | /// digit. |
912 | fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> { |
913 | let ch = Lexer::must_char(it)?; |
914 | if ch.is_ascii_hexdigit() { |
915 | Ok(to_hex(ch)) |
916 | } else { |
917 | Err(LexError::InvalidHexDigit(ch)) |
918 | } |
919 | } |
920 | |
921 | /// Reads the next character from the input string and where it's located, |
922 | /// returning an error if the input stream is empty. |
923 | fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> { |
924 | it.next().ok_or(LexError::UnexpectedEof) |
925 | } |
926 | |
927 | /// Expects that a specific character must be read next |
928 | fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { |
929 | let found = Lexer::must_char(it)?; |
930 | if wanted == found { |
931 | Ok(()) |
932 | } else { |
933 | Err(LexError::Expected { wanted, found }) |
934 | } |
935 | } |
936 | |
937 | /// Creates an error at `pos` with the specified `kind` |
938 | fn error(&self, pos: usize, kind: LexError) -> Error { |
939 | Error::lex(Span { offset: pos }, self.input, kind) |
940 | } |
941 | |
    /// Returns an iterator over all tokens in the original source string
    /// starting at the `pos` specified.
    ///
    /// Each item is either a lexed [`Token`] or an [`Error`] for un-lexable
    /// input. Iteration stops once `parse` yields no further token, since
    /// `transpose` turns `Ok(None)` into `None`.
    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
        std::iter::from_fn(move || self.parse(&mut pos).transpose())
    }
947 | |
948 | /// Returns whether an annotation is present at `pos`. If it is present then |
949 | /// `Ok(Some(token))` is returned corresponding to the token, otherwise |
950 | /// `Ok(None)` is returned. If the next token cannot be parsed then an error |
951 | /// is returned. |
952 | pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> { |
953 | let bytes = self.input.as_bytes(); |
954 | // Quickly reject anything that for sure isn't an annotation since this |
955 | // method is used every time an lparen is parsed. |
956 | if bytes.get(pos) != Some(&b'@' ) { |
957 | return Ok(None); |
958 | } |
959 | match self.parse(&mut pos)? { |
960 | Some(token) => match token.kind { |
961 | TokenKind::Annotation => Ok(Some(token)), |
962 | _ => Ok(None), |
963 | }, |
964 | None => Ok(None), |
965 | } |
966 | } |
967 | } |
968 | |
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // Slice out `len` bytes of `s` starting at this token's offset. The
        // stored length requires a fallible conversion to `usize`; tokens
        // produced by the lexer are expected to always fit.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the leading `$` sigil, which is debug-asserted to be present.
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based annotation (e.g. `@"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the leading `@` sigil, which is debug-asserted to be present.
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote. The token was already lexed as a string, so
        // re-parsing the contents here is not expected to fail.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Drop the leading sign character (`+` or `-`), if any, so
                // that the remainder starts with `nan:0x`.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                if has_underscores {
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split the token into integral/fractional/exponent pieces.
                // Hex floats mark the exponent with `p`/`P` while decimal
                // floats use `e`/`E`.
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // Normalize the pieces: a redundant leading `+` is dropped, a
                // trailing `.` produces no fractional part, `_` separators are
                // removed, and the `0x` prefix is not part of the digits.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                if hex {
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A redundant leading `+` is dropped, while a `-` sign stays part of
        // the value text.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        if kind.hex {
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    /// Creates a lexing error of kind `err` located at this token's offset
    /// within the source string `src`.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
1177 | |
1178 | impl<'a> Integer<'a> { |
1179 | /// Returns the sign token for this integer. |
1180 | pub fn sign(&self) -> Option<SignToken> { |
1181 | self.sign |
1182 | } |
1183 | |
1184 | /// Returns the value string that can be parsed for this integer, as well |
1185 | /// as the base that it should be parsed in |
1186 | pub fn val(&self) -> (&str, u32) { |
1187 | (&self.val, if self.hex { 16 } else { 10 }) |
1188 | } |
1189 | } |
1190 | |
/// Converts an ASCII hex digit (`0-9`, `a-f`, `A-F`) into its numeric value.
///
/// Callers are expected to pass only ASCII hex digits; any other character
/// yields a meaningless value (same contract as before).
fn to_hex(c: char) -> u8 {
    // Fold uppercase letters into lowercase so one arm covers both ranges.
    let lower = c.to_ascii_lowercase();
    match lower {
        'a'..='f' => lower as u8 - b'a' + 10,
        _ => c as u8 - b'0',
    }
}
1198 | |
1199 | impl fmt::Display for LexError { |
1200 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1201 | use LexError::*; |
1202 | match self { |
1203 | DanglingBlockComment => f.write_str("unterminated block comment" )?, |
1204 | Unexpected(c) => write!(f, "unexpected character ' {}'" , escape_char(*c))?, |
1205 | InvalidStringElement(c) => { |
1206 | write!(f, "invalid character in string ' {}'" , escape_char(*c))? |
1207 | } |
1208 | InvalidStringEscape(c) => write!(f, "invalid string escape ' {}'" , escape_char(*c))?, |
1209 | InvalidHexDigit(c) => write!(f, "invalid hex digit ' {}'" , escape_char(*c))?, |
1210 | InvalidDigit(c) => write!(f, "invalid decimal digit ' {}'" , escape_char(*c))?, |
1211 | Expected { wanted, found } => write!( |
1212 | f, |
1213 | "expected ' {}' but found ' {}'" , |
1214 | escape_char(*wanted), |
1215 | escape_char(*found) |
1216 | )?, |
1217 | UnexpectedEof => write!(f, "unexpected end-of-file" )?, |
1218 | NumberTooBig => f.write_str("number is too big to parse" )?, |
1219 | InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x {:x}" , c)?, |
1220 | LoneUnderscore => write!(f, "bare underscore in numeric literal" )?, |
1221 | ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}" , c)?, |
1222 | InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id" )?, |
1223 | EmptyId => write!(f, "empty identifier" )?, |
1224 | EmptyAnnotation => write!(f, "empty annotation id" )?, |
1225 | } |
1226 | Ok(()) |
1227 | } |
1228 | } |
1229 | |
/// Renders `c` in a human-readable form for error messages.
///
/// Common control characters get their familiar backslash escapes, printable
/// ASCII is shown verbatim, and everything else uses `\u{...}` notation.
fn escape_char(c: char) -> String {
    // Characters with a conventional short rendering.
    let short = match c {
        '\t' => Some("\\t"),
        '\r' => Some("\\r"),
        '\n' => Some("\\n"),
        '\\' => Some("\\\\"),
        '\'' => Some("\\'"),
        '"' => Some("\""),
        _ => None,
    };
    match short {
        Some(s) => s.to_string(),
        None if matches!(c, '\x20'..='\x7e') => c.to_string(),
        None => c.escape_unicode().to_string(),
    }
}
1242 | |
/// This is an attempt to protect against the "trojan source" [1] problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional override/isolate and other invisible formatting
    // characters that can visually reorder source text.
    const CONFUSING: &[char] = &[
        '\u{202a}', '\u{202b}', '\u{202d}', '\u{202e}', '\u{2066}', '\u{2067}', '\u{2068}',
        '\u{206c}', '\u{2069}',
    ];
    CONFUSING.contains(&ch)
}
1265 | |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ws_smoke() {
        // Lexes `input` and asserts the first token is whitespace, returning
        // its source text.
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace("  "), "  ");
        assert_eq!(get_whitespace("  \n "), "  \n ");
        // The whitespace token stops at the first non-whitespace character.
        assert_eq!(get_whitespace("  x"), "  ");
        assert_eq!(get_whitespace("  ;"), "  ");
    }

    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        // The terminating newline/carriage return is not part of the token.
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
        assert_eq!(get_line_comment(";; \rabc"), ";; ");
        assert_eq!(get_line_comment(";; \r\nabc"), ";; ");
    }

    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        // Block comments nest.
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    // Helper: lex the first token of `input`, panicking on error or EOF.
    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }

    #[test]
    fn strings() {
        // Lexes `input` and returns the decoded byte contents of the first
        // token, which must be a string.
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        // Underscores are allowed inside `\u{...}` escapes.
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every possible `\XX` hex escape decodes to its byte value.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        // String-based identifier syntax.
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }

    #[test]
    fn annotation() {
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {:?}", other),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        // String-based annotation syntax.
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z  "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        // Lexes `input` and returns the normalized digit text of the first
        // token, which must be an integer.
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        // A leading `+` is dropped, underscores and `0x` prefixes are removed.
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        fn get_float(input: &str) -> Float<'_> {
            let token = get_token(input);
            match token.kind {
                TokenKind::Float(f) => token.float(input, f),
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            Float::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            Float::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            Float::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            Float::Nan {
                val: Some("1".into()),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            Float::Nan {
                val: Some("7fffff".into()),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            Float::Val {
                integral: "-12".into(),
                fractional: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        // A trailing `.` produces no fractional component.
        assert_eq!(
            get_float("1."),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}
1574 | |