1 | //! Definition of a lexer for the WebAssembly text format. |
2 | //! |
//! This module provides a [`Lexer`][] type which is an iterator over the raw
4 | //! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single |
//! byte in a WebAssembly text file, returning tokens even for comments and
6 | //! whitespace. Typically you'll ignore comments and whitespace, however. |
7 | //! |
8 | //! If you'd like to iterate over the tokens in a file you can do so via: |
9 | //! |
10 | //! ``` |
11 | //! # fn foo() -> Result<(), wast::Error> { |
12 | //! use wast::lexer::Lexer; |
13 | //! |
14 | //! let wat = "(module (func $foo))" ; |
15 | //! for token in Lexer::new(wat).iter(0) { |
16 | //! println!("{:?}" , token?); |
17 | //! } |
18 | //! # Ok(()) |
19 | //! # } |
20 | //! ``` |
21 | //! |
22 | //! Note that you'll typically not use this module but will rather use |
23 | //! [`ParseBuffer`](crate::parser::ParseBuffer) instead. |
24 | //! |
25 | //! [`Lexer`]: crate::lexer::Lexer |
26 | |
27 | use crate::token::Span; |
28 | use crate::Error; |
29 | use std::borrow::Cow; |
30 | use std::char; |
31 | use std::fmt; |
32 | use std::slice; |
33 | use std::str; |
34 | use std::str::Utf8Error; |
35 | |
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// returned for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    /// The entire source text being lexed; tokens refer back into this via
    /// byte offsets and lengths.
    input: &'a str,
    /// Whether "confusing" (trojan-source-style) unicode codepoints are
    /// permitted in comments and string literals. Denied by default; see
    /// [`Lexer::allow_confusing_unicode`].
    allow_confusing_unicode: bool,
}
46 | |
/// A single token parsed from a `Lexer`.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The kind of token this represents, such as whether it's whitespace, a
    /// keyword, etc.
    pub kind: TokenKind,
    /// The byte offset within the original source for where this token came
    /// from.
    pub offset: usize,
    /// The byte length of this token as it resides in the original source.
    //
    // NB: this is `u32` to enable packing `Token` into two pointers of size.
    // This does limit a single token to being at most 4G large, but that seems
    // probably ok. (`token_is_not_too_big` below asserts the size bound.)
    pub len: u32,
}
63 | |
#[test]
fn token_is_not_too_big() {
    // `Token` is documented to pack into the size of two 64-bit words; keep
    // that invariant checked at test time.
    let limit = 2 * std::mem::size_of::<u64>();
    let actual = std::mem::size_of::<Token>();
    assert!(actual <= limit);
}
68 | |
/// Classification of what was parsed from the input stream.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace.
///
/// Note that variants carry no source text; the text of a token is recovered
/// from its [`Token`] `offset`/`len` into the original input.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A line comment, preceded with `;;`
    LineComment,

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested.
    BlockComment,

    /// A fragment of source that represents whitespace.
    Whitespace,

    /// A left-parenthesis.
    LParen,
    /// A right-parenthesis.
    RParen,

    /// A string literal, which is actually a list of bytes.
    String,

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$`.
    Id,

    /// A keyword, or something that starts with an alphabetic character.
    Keyword,

    /// An annotation (like `@foo`).
    ///
    /// All annotations start with `@`.
    Annotation,

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved,

    /// An integer.
    Integer(IntegerKind),

    /// A float.
    Float(FloatKind),
}
120 | |
/// Description of the parsed integer from the source.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    /// The explicit leading sign the literal was written with, if any.
    sign: Option<SignToken>,
    /// Whether `_` separators appeared between digits and will need to be
    /// stripped before the value can be numerically parsed.
    has_underscores: bool,
    /// Whether the literal was written with a `0x` hexadecimal prefix.
    hex: bool,
}
128 | |
/// Description of a parsed float from the source.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    /// An `inf` literal, optionally negated.
    #[doc(hidden)]
    Inf { negative: bool },
    /// A `nan` literal with no explicit payload, optionally negated.
    #[doc(hidden)]
    Nan { negative: bool },
    /// A `nan:0x...` literal carrying an explicit hex payload.
    #[doc(hidden)]
    NanVal {
        negative: bool,
        has_underscores: bool,
    },
    /// An ordinary decimal or hexadecimal (`0x`-prefixed) float literal.
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
145 | |
/// Classification of what `Lexer::parse_reserved` consumed, refined into a
/// final `TokenKind` by `parse_kind`.
enum ReservedKind {
    /// "..."
    String,
    /// anything that's just a sequence of `idchars!()`
    Idchars,
    /// $"..."
    IdString,
    /// @"..."
    AnnotationString,
    /// everything else (a conglomeration of strings, idchars, etc)
    Reserved,
}
158 | |
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals)
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found
        wanted: char,
        /// The character that was actually found
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    /// An invalid utf-8 sequence was found in a quoted identifier, such as
    /// `$"\ff"`.
    InvalidUtf8Id(Utf8Error),

    /// An empty identifier was found, or a lone `$`.
    EmptyId,

    /// An empty identifier was found, or a lone `@`.
    EmptyAnnotation,
}
228 | |
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// Plus sign: `+`
    Plus,
    /// Minus sign: `-`
    Minus,
}
237 | |
/// A fully parsed integer from a source string with a payload ready to parse
/// into an integral type.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    /// The sign the literal was written with, if any.
    sign: Option<SignToken>,
    /// The digit text of the literal; `Cow` so it can borrow from the source
    /// or be owned (presumably when `_` separators had to be stripped —
    /// confirm against the code that constructs this type).
    val: Cow<'a, str>,
    /// Whether `val` holds hexadecimal digits (a `0x`-prefixed literal).
    hex: bool,
}
246 | |
/// Possible parsed float values
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A float `NaN` representation
    Nan {
        /// The specific bits to encode for this float, optionally
        val: Option<Cow<'a, str>>,
        /// Whether or not this is a negative `NaN` or not.
        negative: bool,
    },
    /// A float infinity representation
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value
    Val {
        /// Whether or not the `integral` and `fractional` are specified in hex
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        fractional: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.fractional` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`
        exponent: Option<Cow<'a, str>>,
    },
}
276 | |
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
//
// Expands to a `match` pattern (a big `|`-ed alternation of bytes) covering
// every byte that may appear in an `idchar`, usable directly inside byte-level
// `match` arms.
macro_rules! idchars {
    () => {
        b'0'..=b'9'
        | b'A'..=b'Z'
        | b'a'..=b'z'
        | b'!'
        | b'#'
        | b'$'
        | b'%'
        | b'&'
        | b'\''
        | b'*'
        | b'+'
        | b'-'
        | b'.'
        | b'/'
        | b':'
        | b'<'
        | b'='
        | b'>'
        | b'?'
        | b'@'
        | b'\\'
        | b'^'
        | b'_'
        | b'`'
        | b'|'
        | b'~'
    }
}
308 | |
309 | impl<'a> Lexer<'a> { |
310 | /// Creates a new lexer which will lex the `input` source string. |
311 | pub fn new(input: &str) -> Lexer<'_> { |
312 | Lexer { |
313 | input, |
314 | allow_confusing_unicode: false, |
315 | } |
316 | } |
317 | |
    /// Returns the original source input that we're lexing.
    ///
    /// The returned borrow carries the full `'a` lifetime of the source, not
    /// the lifetime of `self`.
    pub fn input(&self) -> &'a str {
        self.input
    }
322 | |
    /// Configures whether "confusing" unicode characters are allowed while
    /// lexing.
    ///
    /// If allowed then no error will happen if these characters are found, but
    /// otherwise if disallowed a lex error will be produced when these
    /// characters are found. Confusing characters are denied by default.
    ///
    /// For now "confusing characters" are primarily related to the "trojan
    /// source" problem where it refers to characters which cause humans to read
    /// text differently than this lexer, such as characters that alter the
    /// left-to-right display of the source code.
    ///
    /// Returns `&mut self` so configuration calls can be chained.
    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
        self.allow_confusing_unicode = allow;
        self
    }
338 | |
339 | /// Lexes the next at the byte position `pos` in the input. |
340 | /// |
341 | /// Returns `Some` if a token is found or `None` if we're at EOF. |
342 | /// |
343 | /// The `pos` argument will be updated to point to the next token on a |
344 | /// successful parse. |
345 | /// |
346 | /// # Errors |
347 | /// |
348 | /// Returns an error if the input is malformed. |
349 | pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> { |
350 | let offset = *pos; |
351 | Ok(match self.parse_kind(pos)? { |
352 | Some(kind) => Some(Token { |
353 | kind, |
354 | offset, |
355 | len: (*pos - offset).try_into().unwrap(), |
356 | }), |
357 | None => None, |
358 | }) |
359 | } |
360 | |
    /// Lexes a single token starting at `*pos`, returning its `TokenKind` and
    /// advancing `*pos` past the consumed text (or `Ok(None)` at EOF).
    fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
        let start = *pos;
        // This `match` generally parses the grammar specified at
        //
        // https://webassembly.github.io/spec/core/text/lexical.html#text-token
        let remaining = &self.input.as_bytes()[start..];
        let byte = match remaining.first() {
            Some(b) => b,
            None => return Ok(None),
        };

        match byte {
            // Open-parens check the next character to see if this is the start
            // of a block comment, otherwise it's just a bland left-paren
            // token.
            b'(' => match remaining.get(1) {
                Some(b';') => {
                    // Block comments nest, so track the nesting depth.
                    let mut level = 1;
                    // Note that we're doing a byte-level search here for the
                    // close-delimiter of `;)`. The actual source text is utf-8
                    // encode in `remaining` but due to how utf-8 works we
                    // can safely search for an ASCII byte since it'll never
                    // otherwise appear in the middle of a codepoint and if we
                    // find it then it's guaranteed to be the right byte.
                    //
                    // Mainly we're avoiding the overhead of decoding utf-8
                    // characters into a Rust `char` since it's otherwise
                    // unnecessary work.
                    let mut iter = remaining[2..].iter();
                    while let Some(ch) = iter.next() {
                        match ch {
                            b'(' => {
                                if let Some(b';') = iter.as_slice().first() {
                                    level += 1;
                                    iter.next();
                                }
                            }
                            b';' => {
                                if let Some(b')') = iter.as_slice().first() {
                                    level -= 1;
                                    iter.next();
                                    if level == 0 {
                                        // Length consumed = total bytes minus
                                        // what the iterator has left.
                                        let len = remaining.len() - iter.as_slice().len();
                                        let comment = &self.input[start..][..len];
                                        *pos += len;
                                        self.check_confusing_comment(*pos, comment)?;
                                        return Ok(Some(TokenKind::BlockComment));
                                    }
                                }
                            }
                            _ => {}
                        }
                    }
                    // Ran out of input before `level` reached zero.
                    Err(self.error(start, LexError::DanglingBlockComment))
                }
                _ => {
                    *pos += 1;

                    Ok(Some(TokenKind::LParen))
                }
            },

            b')' => {
                *pos += 1;
                Ok(Some(TokenKind::RParen))
            }

            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
            b' ' | b'\n' | b'\r' | b'\t' => {
                self.skip_ws(pos);
                Ok(Some(TokenKind::Whitespace))
            }

            c @ (idchars!() | b'"') => {
                let (kind, src) = self.parse_reserved(pos)?;
                match kind {
                    // If the reserved token was simply a single string then
                    // that is converted to a standalone string token
                    ReservedKind::String => return Ok(Some(TokenKind::String)),

                    // If only idchars were consumed then this could be a
                    // specific kind of standalone token we're interested in.
                    ReservedKind::Idchars => {
                        // https://webassembly.github.io/spec/core/text/values.html#integers
                        if let Some(ret) = self.classify_number(src) {
                            return Ok(Some(ret));
                        // https://webassembly.github.io/spec/core/text/values.html#text-id
                        } else if *c == b'$' {
                            return Ok(Some(TokenKind::Id));
                        // part of the WebAssembly/annotations proposal
                        // (no online url yet)
                        } else if *c == b'@' {
                            return Ok(Some(TokenKind::Annotation));
                        // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
                        } else if b'a' <= *c && *c <= b'z' {
                            return Ok(Some(TokenKind::Keyword));
                        }
                    }

                    ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
                    ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                    // ... otherwise this was a conglomeration of idchars,
                    // strings, or just idchars that don't match a prior rule,
                    // meaning this falls through to the fallback `Reserved`
                    // token.
                    ReservedKind::Reserved => {}
                }

                Ok(Some(TokenKind::Reserved))
            }

            // This could be a line comment, otherwise `;` is a reserved token.
            // The second byte is checked to see if it's a `;;` line comment
            //
            // Note that this character being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b';' => match remaining.get(1) {
                Some(b';') => {
                    let remaining = &self.input[*pos..];
                    // The comment extends to the next `\n`/`\r`, or EOF.
                    let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
                        .unwrap_or(remaining.len());
                    *pos += byte_pos;
                    let comment = &remaining[..byte_pos];
                    self.check_confusing_comment(*pos, comment)?;
                    Ok(Some(TokenKind::LineComment))
                }
                _ => {
                    *pos += 1;
                    Ok(Some(TokenKind::Reserved))
                }
            },

            // Other known reserved tokens other than `;`
            //
            // Note that these characters being considered as part of a
            // `reserved` token is part of the annotations proposal.
            b',' | b'[' | b']' | b'{' | b'}' => {
                *pos += 1;
                Ok(Some(TokenKind::Reserved))
            }

            _ => {
                // Decode the full codepoint for the error message.
                let ch = self.input[start..].chars().next().unwrap();
                Err(self.error(*pos, LexError::Unexpected(ch)))
            }
        }
    }
509 | |
    /// Advances `*pos` past a maximal run of whitespace bytes.
    fn skip_ws(&self, pos: &mut usize) {
        // This table is a byte lookup table to determine whether a byte is a
        // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
        // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
        // have a '1' in the table below.
        //
        // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
        // known that if these bytes are found they're guaranteed to be the
        // whitespace byte, so they can be safely skipped and we don't have to
        // do full utf-8 decoding. This means that the goal of this function is
        // to find the first non-whitespace byte in `remaining`.
        //
        // For now this lookup table seems to be the fastest, but projects like
        // https://github.com/lemire/despacer show other simd algorithms which
        // can possibly accelerate this even more. Note that `*.wat` files often
        // have a lot of whitespace so this function is typically quite hot when
        // parsing inputs.
        #[rustfmt::skip]
        const WS: [u8; 256] = [
            //                                   \t \n       \r
            /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
            /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            // ' '
            /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let remaining = &self.input[*pos..];
        // Find the index of the first byte whose table entry is not 1; if all
        // remaining bytes are whitespace, skip to the end of the input.
        let non_ws_pos = remaining
            .as_bytes()
            .iter()
            .position(|b| WS[*b as usize] != 1)
            .unwrap_or(remaining.len());
        *pos += non_ws_pos;
    }
556 | |
    /// Splits off a "reserved" token which is then further processed later on
    /// to figure out which kind of token it is depending on `ReservedKind`.
    ///
    /// For more information on this method see the clarification at
    /// <https://github.com/WebAssembly/spec/pull/1499> but the general gist is
    /// that this is parsing the grammar:
    ///
    /// ```text
    /// reserved := (idchar | string)+
    /// ```
    ///
    /// which means that it is eating any number of adjacent string/idchar
    /// tokens (e.g. `a"b"c`) and returning the classification of what was
    /// eaten. The classification assists in determining what the actual token
    /// here eaten looks like.
    fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
        // Counts of each sub-production consumed; the final (idchars, strings)
        // pair drives the classification at the bottom.
        let mut idchars = 0u32;
        let mut strings = 0u32;
        let start = *pos;
        while let Some(byte) = self.input.as_bytes().get(*pos) {
            match byte {
                // Normal `idchars` production which appends to the reserved
                // token that's being produced.
                idchars!() => {
                    idchars += 1;
                    *pos += 1;
                }

                // https://webassembly.github.io/spec/core/text/values.html#text-string
                b'"' => {
                    strings += 1;
                    *pos += 1;
                    let mut it = self.input[*pos..].chars();
                    let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                    // `parse_str` advanced `it`; recompute `pos` from how much
                    // of the input the iterator has left.
                    *pos = self.input.len() - it.as_str().len();
                    match result {
                        Ok(_) => {}
                        Err(e) => {
                            let err_pos = match &e {
                                LexError::UnexpectedEof => self.input.len(),
                                // Report the error at the last character that
                                // was consumed.
                                _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                            };
                            return Err(self.error(err_pos, e));
                        }
                    }
                }

                // Nothing else is considered part of a reserved token
                _ => break,
            }
        }
        let ret = &self.input[start..*pos];
        Ok(match (idchars, strings) {
            // The caller only invokes this after matching an idchar or `"`,
            // so at least one of the counters is nonzero.
            (0, 0) => unreachable!(),
            (0, 1) => (ReservedKind::String, ret),
            (_, 0) => (ReservedKind::Idchars, ret),
            // Pattern match `@"..."` and `$"..."` for string-based
            // identifiers and annotations.
            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
            _ => (ReservedKind::Reserved, ret),
        })
    }
620 | |
    /// Classifies an idchars-only token `src` as an integer or float literal,
    /// returning `None` if it is neither.
    fn classify_number(&self, src: &str) -> Option<TokenKind> {
        // Split off an optional leading sign.
        let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
            (Some(SignToken::Plus), stripped)
        } else if let Some(stripped) = src.strip_prefix('-') {
            (Some(SignToken::Minus), stripped)
        } else {
            (None, src)
        };

        let negative = sign == Some(SignToken::Minus);

        // Handle `inf` and `nan` which are special numbers here
        if num == "inf" {
            return Some(TokenKind::Float(FloatKind::Inf { negative }));
        } else if num == "nan" {
            return Some(TokenKind::Float(FloatKind::Nan { negative }));
        } else if let Some(stripped) = num.strip_prefix("nan:0x") {
            let mut it = stripped.as_bytes().iter();
            let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
            // The payload must consist solely of hex digits and underscores.
            if it.next().is_some() {
                return None;
            }
            return Some(TokenKind::Float(FloatKind::NanVal {
                negative,
                has_underscores,
            }));
        }

        // Figure out if we're a hex number or not
        let test_valid: fn(u8) -> bool;
        let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
            test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
            (stripped.as_bytes().iter(), true)
        } else {
            test_valid = |x: u8| char::from(x).is_ascii_digit();
            (num.as_bytes().iter(), false)
        };

        // Evaluate the first part, moving out all underscores
        let mut has_underscores = skip_underscores(&mut it, test_valid)?;

        match it.clone().next() {
            // If we're followed by something this may be a float so keep going.
            Some(_) => {}

            // Otherwise this is a valid integer literal!
            None => {
                return Some(TokenKind::Integer(IntegerKind {
                    has_underscores,
                    sign,
                    hex,
                }))
            }
        }

        // A number can optionally be after the dot so only actually try to
        // parse one if it's there.
        if it.clone().next() == Some(&b'.') {
            it.next();
            match it.clone().next() {
                Some(c) if test_valid(*c) => {
                    if skip_underscores(&mut it, test_valid)? {
                        has_underscores = true;
                    }
                }
                Some(_) | None => {}
            }
        };

        // Figure out if there's an exponential part here to make a float, and
        // if so parse it but defer its actual calculation until later.
        match (hex, it.next()) {
            (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
                // Optional sign on the exponent.
                match it.clone().next() {
                    Some(b'-') => {
                        it.next();
                    }
                    Some(b'+') => {
                        it.next();
                    }
                    _ => {}
                }
                // NB: exponent digits are decimal even for hex floats.
                if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
                    has_underscores = true;
                }
            }
            (_, None) => {}
            _ => return None,
        }

        // We should have eaten everything by now, if not then this is surely
        // not a float or integer literal.
        if it.next().is_some() {
            return None;
        }

        return Some(TokenKind::Float(FloatKind::Normal {
            has_underscores,
            hex,
        }));

        // Consumes a run of digits (as judged by `good`), allowing single `_`
        // separators between them. Returns whether any underscores were seen,
        // or `None` if the run is empty, doesn't start with a digit, or ends
        // with an underscore.
        fn skip_underscores<'a>(
            it: &mut slice::Iter<'_, u8>,
            good: fn(u8) -> bool,
        ) -> Option<bool> {
            let mut last_underscore = false;
            let mut has_underscores = false;
            // At least one leading digit is required.
            let first = *it.next()?;
            if !good(first) {
                return None;
            }
            while let Some(c) = it.clone().next() {
                if *c == b'_' && !last_underscore {
                    has_underscores = true;
                    it.next();
                    last_underscore = true;
                    continue;
                }
                if !good(*c) {
                    break;
                }
                last_underscore = false;
                it.next();
            }
            if last_underscore {
                return None;
            }
            Some(has_underscores)
        }
    }
751 | |
    /// Checks whether `comment`, which is about to be returned, contains a
    /// "confusing unicode character"; if so the comment is instead transformed
    /// into an error.
    fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
        if self.allow_confusing_unicode {
            return Ok(());
        }

        // In an effort to avoid utf-8 decoding the entire `comment` the search
        // here is a bit more optimized. This checks for the `0xe2` byte because
        // in the utf-8 encoding that's the leading encoding byte for all
        // "confusing characters". Each instance of 0xe2 is checked to see if it
        // starts a confusing character, and if so that's returned.
        //
        // Also note that 0xe2 will never be found in the middle of a codepoint,
        // it's always the start of a codepoint. This means that if our special
        // characters show up they're guaranteed to start with 0xe2 bytes.
        let bytes = comment.as_bytes();
        for pos in memchr::Memchr::new(0xe2, bytes) {
            if let Some(c) = comment[pos..].chars().next() {
                if is_confusing_unicode(c) {
                    // Note that `end` accounts for `comment` already having
                    // been parsed, so we move backwards to where `comment`
                    // started and then add the index within `comment`.
                    let pos = end - comment.len() + pos;
                    return Err(self.error(pos, LexError::ConfusingUnicode(c)));
                }
            }
        }

        Ok(())
    }
785 | |
    /// Parses the body of a string literal from `it`, assuming the opening
    /// `"` has already been consumed. On success `it` is left just past the
    /// closing `"`.
    ///
    /// Returns the decoded bytes: a borrow of the original source when no
    /// escapes were present, or an owned buffer once an escape forces a copy.
    fn parse_str(
        it: &mut str::Chars<'a>,
        allow_confusing_unicode: bool,
    ) -> Result<Cow<'a, [u8]>, LexError> {
        // `Start` = no escape seen yet, result can borrow from the source;
        // `String` = an escape happened, bytes are accumulated in an owned
        // buffer.
        enum State {
            Start,
            String(Vec<u8>),
        }
        let orig = it.as_str();
        let mut state = State::Start;
        loop {
            match it.next().ok_or(LexError::UnexpectedEof)? {
                '"' => break,
                '\\' => {
                    match state {
                        State::String(_) => {}
                        State::Start => {
                            // First escape: copy everything before the
                            // backslash into an owned buffer.
                            let pos = orig.len() - it.as_str().len() - 1;
                            state = State::String(orig[..pos].as_bytes().to_vec());
                        }
                    }
                    let buf = match &mut state {
                        State::String(b) => b,
                        State::Start => unreachable!(),
                    };
                    match it.next().ok_or(LexError::UnexpectedEof)? {
                        '"' => buf.push(b'"'),
                        '\'' => buf.push(b'\''),
                        't' => buf.push(b'\t'),
                        'n' => buf.push(b'\n'),
                        'r' => buf.push(b'\r'),
                        '\\' => buf.push(b'\\'),
                        'u' => {
                            // `\u{...}` escapes a unicode scalar, re-encoded
                            // as utf-8 into the output bytes.
                            Lexer::must_eat_char(it, '{')?;
                            let n = Lexer::hexnum(it)?;
                            let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                            Lexer::must_eat_char(it, '}')?;
                        }
                        c1 if c1.is_ascii_hexdigit() => {
                            // `\XY` encodes one raw byte from two hex digits.
                            let c2 = Lexer::hexdigit(it)?;
                            buf.push(to_hex(c1) * 16 + c2);
                        }
                        c => return Err(LexError::InvalidStringEscape(c)),
                    }
                }
                // Bare control characters (and DEL, 0x7f) are rejected.
                c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                    return Err(LexError::InvalidStringElement(c))
                }
                c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                    return Err(LexError::ConfusingUnicode(c))
                }
                c => match &mut state {
                    // Still borrowing: the character stays part of `orig`.
                    State::Start => {}
                    State::String(v) => {
                        v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                    }
                },
            }
        }
        match state {
            // No escapes: slice out everything up to the closing quote.
            State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
            State::String(s) => Ok(s.into()),
        }
    }
851 | |
    /// Parses an id-or-string-based name from `it`.
    ///
    /// Note that `it` should already have been lexed and this is just
    /// extracting the value. If the token lexed was `@a` then this should point
    /// to `a`.
    ///
    /// This will automatically detect quoted syntax such as `@"..."` and the
    /// byte string will be parsed and validated as utf-8.
    ///
    /// # Errors
    ///
    /// Returns an error if a quoted byte string is found and contains invalid
    /// utf-8.
    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
        if it.clone().next() == Some('"') {
            it.next();
            // Confusing unicode is allowed here (`true`) since the token was
            // already validated when it was originally lexed.
            match Lexer::parse_str(it, true)? {
                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
                    Ok(s) => Ok(Cow::Borrowed(s)),
                    Err(e) => Err(LexError::InvalidUtf8Id(e)),
                },
                Cow::Owned(bytes) => match String::from_utf8(bytes) {
                    Ok(s) => Ok(Cow::Owned(s)),
                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
                },
            }
        } else {
            // Unquoted name: the remainder of the token is the name itself.
            Ok(Cow::Borrowed(it.as_str()))
        }
    }
882 | |
    /// Parses a hexadecimal number (e.g. the interior of a `\u{...}` escape)
    /// from `it` into a `u32`, allowing `_` separators between digits.
    ///
    /// # Errors
    ///
    /// Returns an error on EOF, on overflow of `u32`, or when the number ends
    /// with an underscore.
    fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
        // At least one leading hex digit is required.
        let n = Lexer::hexdigit(it)?;
        let mut last_underscore = false;
        let mut n = n as u32;
        while let Some(c) = it.clone().next() {
            if c == '_' {
                // NOTE(review): unlike `skip_underscores` in
                // `classify_number`, this accepts consecutive underscores
                // (e.g. `1__2`) — confirm whether that's intended.
                it.next();
                last_underscore = true;
                continue;
            }
            if !c.is_ascii_hexdigit() {
                break;
            }
            last_underscore = false;
            it.next();
            // Accumulate with overflow checking.
            n = n
                .checked_mul(16)
                .and_then(|n| n.checked_add(to_hex(c) as u32))
                .ok_or(LexError::NumberTooBig)?;
        }
        if last_underscore {
            return Err(LexError::LoneUnderscore);
        }
        Ok(n)
    }
908 | |
909 | /// Reads a hexidecimal digit from the input stream, returning where it's |
910 | /// defined and the hex value. Returns an error on EOF or an invalid hex |
911 | /// digit. |
912 | fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> { |
913 | let ch = Lexer::must_char(it)?; |
914 | if ch.is_ascii_hexdigit() { |
915 | Ok(to_hex(ch)) |
916 | } else { |
917 | Err(LexError::InvalidHexDigit(ch)) |
918 | } |
919 | } |
920 | |
921 | /// Reads the next character from the input string and where it's located, |
922 | /// returning an error if the input stream is empty. |
923 | fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> { |
924 | it.next().ok_or(LexError::UnexpectedEof) |
925 | } |
926 | |
927 | /// Expects that a specific character must be read next |
928 | fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { |
929 | let found = Lexer::must_char(it)?; |
930 | if wanted == found { |
931 | Ok(()) |
932 | } else { |
933 | Err(LexError::Expected { wanted, found }) |
934 | } |
935 | } |
936 | |
937 | /// Creates an error at `pos` with the specified `kind` |
938 | fn error(&self, pos: usize, kind: LexError) -> Error { |
939 | Error::lex(Span { offset: pos }, self.input, kind) |
940 | } |
941 | |
    /// Returns an iterator over all tokens in the original source string
    /// starting at the `pos` specified.
    ///
    /// Each item is either a lexed [`Token`] or an [`Error`] for un-lexable
    /// input. Iteration stops once `parse` yields no further token, since
    /// `transpose` turns `Ok(None)` into `None`.
    pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
        std::iter::from_fn(move || self.parse(&mut pos).transpose())
    }
947 | |
948 | /// Returns whether an annotation is present at `pos`. If it is present then |
949 | /// `Ok(Some(token))` is returned corresponding to the token, otherwise |
950 | /// `Ok(None)` is returned. If the next token cannot be parsed then an error |
951 | /// is returned. |
952 | pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> { |
953 | let bytes = self.input.as_bytes(); |
954 | // Quickly reject anything that for sure isn't an annotation since this |
955 | // method is used every time an lparen is parsed. |
956 | if bytes.get(pos) != Some(&b'@' ) { |
957 | return Ok(None); |
958 | } |
959 | match self.parse(&mut pos)? { |
960 | Some(token) => match token.kind { |
961 | TokenKind::Annotation => Ok(Some(token)), |
962 | _ => Ok(None), |
963 | }, |
964 | None => Ok(None), |
965 | } |
966 | } |
967 | } |
968 | |
impl Token {
    /// Returns the original source text for this token.
    pub fn src<'a>(&self, s: &'a str) -> &'a str {
        // Slice out `len` bytes of `s` starting at this token's offset. The
        // stored length requires a fallible conversion to `usize`; tokens
        // produced by the lexer are expected to always fit.
        &s[self.offset..][..self.len.try_into().unwrap()]
    }

    /// Returns the identifier, without the leading `$` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Id`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
    /// which is invalid utf-8.
    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the leading `$` sigil, which is debug-asserted to be present.
        let dollar = ch.next();
        debug_assert_eq!(dollar, Some('$'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyId));
        }
        Ok(id)
    }

    /// Returns the annotation, without the leading `@` symbol, that this token
    /// represents.
    ///
    /// Note that this method returns the contents of the identifier. With a
    /// string-based identifier this means that escapes have been resolved to
    /// their string-based equivalent.
    ///
    /// Should only be used with `TokenKind::Annotation`.
    ///
    /// # Errors
    ///
    /// Returns an error if this is a string-based annotation (e.g. `@"..."`)
    /// which is invalid utf-8.
    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
        let mut ch = self.src(s).chars();
        // Skip the leading `@` sigil, which is debug-asserted to be present.
        let at = ch.next();
        debug_assert_eq!(at, Some('@'));
        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
        if id.is_empty() {
            return Err(self.error(s, LexError::EmptyAnnotation));
        }
        Ok(id)
    }

    /// Returns the keyword this token represents.
    ///
    /// Should only be used with [`TokenKind::Keyword`].
    pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the reserved string this token represents.
    ///
    /// Should only be used with [`TokenKind::Reserved`].
    pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
        self.src(s)
    }

    /// Returns the parsed string that this token represents.
    ///
    /// This returns either a raw byte slice into the source if that's possible
    /// or an owned representation to handle escaped characters and such.
    ///
    /// Should only be used with [`TokenKind::String`].
    pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
        let mut ch = self.src(s).chars();
        // Skip the opening quote. The token was already lexed as a string, so
        // re-parsing the contents here is not expected to fail.
        ch.next().unwrap();
        Lexer::parse_str(&mut ch, true).unwrap()
    }

    /// Returns the decomposed float token that this represents.
    ///
    /// This will slice up the float token into its component parts and return a
    /// description of the float token in the source.
    ///
    /// Should only be used with [`TokenKind::Float`].
    pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
        match kind {
            FloatKind::Inf { negative } => Float::Inf { negative },
            FloatKind::Nan { negative } => Float::Nan {
                val: None,
                negative,
            },
            FloatKind::NanVal {
                negative,
                has_underscores,
            } => {
                let src = self.src(s);
                // Drop the leading sign character (`+` or `-`), if any, so
                // that the remainder starts with `nan:0x`.
                let src = if src.starts_with("n") { src } else { &src[1..] };
                let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
                if has_underscores {
                    *val.to_mut() = val.replace("_", "");
                }
                Float::Nan {
                    val: Some(val),
                    negative,
                }
            }
            FloatKind::Normal {
                has_underscores,
                hex,
            } => {
                let src = self.src(s);
                // Split the token into integral/fractional/exponent pieces.
                // Hex floats mark the exponent with `p`/`P` while decimal
                // floats use `e`/`E`.
                let (integral, fractional, exponent) = match src.find('.') {
                    Some(i) => {
                        let integral = &src[..i];
                        let rest = &src[i + 1..];
                        let exponent = if hex {
                            rest.find('p').or_else(|| rest.find('P'))
                        } else {
                            rest.find('e').or_else(|| rest.find('E'))
                        };
                        match exponent {
                            Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                            None => (integral, Some(rest), None),
                        }
                    }
                    None => {
                        let exponent = if hex {
                            src.find('p').or_else(|| src.find('P'))
                        } else {
                            src.find('e').or_else(|| src.find('E'))
                        };
                        match exponent {
                            Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                            None => (src, None, None),
                        }
                    }
                };
                // Normalize the pieces: a redundant leading `+` is dropped, a
                // trailing `.` produces no fractional part, `_` separators are
                // removed, and the `0x` prefix is not part of the digits.
                let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
                let mut fractional = fractional.and_then(|s| {
                    if s.is_empty() {
                        None
                    } else {
                        Some(Cow::Borrowed(s))
                    }
                });
                let mut exponent =
                    exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
                if has_underscores {
                    *integral.to_mut() = integral.replace("_", "");
                    if let Some(fractional) = &mut fractional {
                        *fractional.to_mut() = fractional.replace("_", "");
                    }
                    if let Some(exponent) = &mut exponent {
                        *exponent.to_mut() = exponent.replace("_", "");
                    }
                }
                if hex {
                    *integral.to_mut() = integral.replace("0x", "");
                }
                Float::Val {
                    hex,
                    integral,
                    fractional,
                    exponent,
                }
            }
        }
    }

    /// Returns the decomposed integer token that this represents.
    ///
    /// This will slice up the integer token into its component parts and
    /// return a description of the integer token in the source.
    ///
    /// Should only be used with [`TokenKind::Integer`].
    pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
        let src = self.src(s);
        // A redundant leading `+` is dropped, while a `-` sign stays part of
        // the value text.
        let val = match kind.sign {
            Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
            Some(SignToken::Minus) => src,
            None => src,
        };
        let mut val = Cow::Borrowed(val);
        if kind.has_underscores {
            *val.to_mut() = val.replace("_", "");
        }
        if kind.hex {
            *val.to_mut() = val.replace("0x", "");
        }
        Integer {
            sign: kind.sign,
            hex: kind.hex,
            val,
        }
    }

    /// Creates a lexing error of kind `err` located at this token's offset
    /// within the source string `src`.
    fn error(&self, src: &str, err: LexError) -> Error {
        Error::lex(
            Span {
                offset: self.offset,
            },
            src,
            err,
        )
    }
}
1177 | |
1178 | impl<'a> Integer<'a> { |
1179 | /// Returns the sign token for this integer. |
1180 | pub fn sign(&self) -> Option<SignToken> { |
1181 | self.sign |
1182 | } |
1183 | |
1184 | /// Returns the value string that can be parsed for this integer, as well |
1185 | /// as the base that it should be parsed in |
1186 | pub fn val(&self) -> (&str, u32) { |
1187 | (&self.val, if self.hex { 16 } else { 10 }) |
1188 | } |
1189 | } |
1190 | |
/// Converts an ASCII hex digit (`0-9`, `a-f`, `A-F`) into its numeric value.
///
/// Callers are expected to pass only ASCII hex digits; any other character
/// yields a meaningless value (same contract as before).
fn to_hex(c: char) -> u8 {
    // Fold uppercase letters into lowercase so one arm covers both ranges.
    let lower = c.to_ascii_lowercase();
    match lower {
        'a'..='f' => lower as u8 - b'a' + 10,
        _ => c as u8 - b'0',
    }
}
1198 | |
1199 | impl fmt::Display for LexError { |
1200 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1201 | use LexError::*; |
1202 | match self { |
1203 | DanglingBlockComment => f.write_str("unterminated block comment" )?, |
1204 | Unexpected(c) => write!(f, "unexpected character ' {}'" , escape_char(*c))?, |
1205 | InvalidStringElement(c) => { |
1206 | write!(f, "invalid character in string ' {}'" , escape_char(*c))? |
1207 | } |
1208 | InvalidStringEscape(c) => write!(f, "invalid string escape ' {}'" , escape_char(*c))?, |
1209 | InvalidHexDigit(c) => write!(f, "invalid hex digit ' {}'" , escape_char(*c))?, |
1210 | InvalidDigit(c) => write!(f, "invalid decimal digit ' {}'" , escape_char(*c))?, |
1211 | Expected { wanted, found } => write!( |
1212 | f, |
1213 | "expected ' {}' but found ' {}'" , |
1214 | escape_char(*wanted), |
1215 | escape_char(*found) |
1216 | )?, |
1217 | UnexpectedEof => write!(f, "unexpected end-of-file" )?, |
1218 | NumberTooBig => f.write_str("number is too big to parse" )?, |
1219 | InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x {:x}" , c)?, |
1220 | LoneUnderscore => write!(f, "bare underscore in numeric literal" )?, |
1221 | ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}" , c)?, |
1222 | InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id" )?, |
1223 | EmptyId => write!(f, "empty identifier" )?, |
1224 | EmptyAnnotation => write!(f, "empty annotation id" )?, |
1225 | } |
1226 | Ok(()) |
1227 | } |
1228 | } |
1229 | |
/// Renders `c` in a human-readable form for error messages.
///
/// Common control characters get their familiar backslash escapes, printable
/// ASCII is shown verbatim, and everything else uses `\u{...}` notation.
fn escape_char(c: char) -> String {
    // Characters with a conventional short rendering.
    let short = match c {
        '\t' => Some("\\t"),
        '\r' => Some("\\r"),
        '\n' => Some("\\n"),
        '\\' => Some("\\\\"),
        '\'' => Some("\\'"),
        '"' => Some("\""),
        _ => None,
    };
    match short {
        Some(s) => s.to_string(),
        None if matches!(c, '\x20'..='\x7e') => c.to_string(),
        None => c.escape_unicode().to_string(),
    }
}
1242 | |
/// This is an attempt to protect against the "trojan source" [1] problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    // Bidirectional override/isolate and other invisible formatting
    // characters that can visually reorder source text.
    const CONFUSING: &[char] = &[
        '\u{202a}', '\u{202b}', '\u{202d}', '\u{202e}', '\u{2066}', '\u{2067}', '\u{2068}',
        '\u{206c}', '\u{2069}',
    ];
    CONFUSING.contains(&ch)
}
1265 | |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ws_smoke() {
        // Lexes `input` and asserts the first token is whitespace, returning
        // its source text.
        fn get_whitespace(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Whitespace => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace("  "), "  ");
        assert_eq!(get_whitespace("  \n "), "  \n ");
        // The whitespace token stops at the first non-whitespace character.
        assert_eq!(get_whitespace("  x"), "  ");
        assert_eq!(get_whitespace("  ;"), "  ");
    }

    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::LineComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        // The terminating newline/carriage return is not part of the token.
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
        assert_eq!(get_line_comment(";; \rabc"), ";; ");
        assert_eq!(get_line_comment(";; \r\nabc"), ";; ");
    }

    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::BlockComment => token.src(input),
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        // Block comments nest.
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    // Helper: lex the first token of `input`, panicking on error or EOF.
    fn get_token(input: &str) -> Token {
        Lexer::new(input)
            .parse(&mut 0)
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("((").kind, TokenKind::LParen);
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")(").kind, TokenKind::RParen);
    }

    #[test]
    fn strings() {
        // Lexes `input` and returns the decoded byte contents of the first
        // token, which must be a string.
        fn get_string(input: &str) -> Vec<u8> {
            let token = get_token(input);
            match token.kind {
                TokenKind::String => token.string(input).to_vec(),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        // Underscores are allowed inside `\u{...}` escapes.
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every possible `\XX` hex escape decodes to its byte value.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Id => token.id(input).unwrap().to_string(),
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "x");
        assert_eq!(get_id("$xyz"), "xyz");
        assert_eq!(get_id("$x_z"), "x_z");
        assert_eq!(get_id("$0^"), "0^");
        assert_eq!(get_id("$0^;;"), "0^");
        assert_eq!(get_id("$0^ ;;"), "0^");
        // String-based identifier syntax.
        assert_eq!(get_id("$\"x\" ;;"), "x");
    }

    #[test]
    fn annotation() {
        fn get_annotation(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
                other => panic!("not annotation {:?}", other),
            }
        }
        assert_eq!(get_annotation("@foo"), "foo");
        assert_eq!(get_annotation("@foo "), "foo");
        assert_eq!(get_annotation("@f "), "f");
        // String-based annotation syntax.
        assert_eq!(get_annotation("@\"x\" "), "x");
        assert_eq!(get_annotation("@0 "), "0");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Keyword => token.keyword(input),
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z  "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            let token = get_token(input);
            match token.kind {
                TokenKind::Reserved => token.reserved(input),
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        // Lexes `input` and returns the normalized digit text of the first
        // token, which must be an integer.
        fn get_integer(input: &str) -> String {
            let token = get_token(input);
            match token.kind {
                TokenKind::Integer(i) => token.integer(input, i).val.to_string(),
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        // A leading `+` is dropped, underscores and `0x` prefixes are removed.
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        fn get_float(input: &str) -> Float<'_> {
            let token = get_token(input);
            match token.kind {
                TokenKind::Float(f) => token.float(input, f),
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            Float::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            Float::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            Float::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            Float::Nan {
                val: Some("1".into()),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            Float::Nan {
                val: Some("7fffff".into()),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), Float::Inf { negative: false });
        assert_eq!(get_float("-inf"), Float::Inf { negative: true });
        assert_eq!(get_float("+inf"), Float::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            Float::Val {
                integral: "1".into(),
                fractional: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            Float::Val {
                integral: "-12".into(),
                fractional: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            Float::Val {
                integral: "12".into(),
                fractional: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        // A trailing `.` produces no fractional component.
        assert_eq!(
            get_float("1."),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            Float::Val {
                integral: "1".into(),
                fractional: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}
1574 | |