lexer.rs source code [crates/time/src/format_description/parse/lexer.rs]

1	//! Lexer for parsing format descriptions.
2
3	use core::iter;
4
5	use super::{attach_location, unused, Error, Location, Spanned, SpannedValue};
6
7	/// An iterator over the lexed tokens.
8	pub(super) struct Lexed<I: Iterator> {
9	/// The internal iterator.
10	iter: iter::Peekable<I>,
11	}
12
13	impl<I: Iterator> Iterator for Lexed<I> {
14	type Item = I::Item;
15
16	fn next(&mut self) -> Option<Self::Item> {
17	self.iter.next()
18	}
19	}
20
21	impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> {
22	/// Peek at the next item in the iterator.
23	pub(super) fn peek(&mut self) -> Option<&I::Item> {
24	self.iter.peek()
25	}
26
27	/// Consume the next token if it is whitespace.
28	pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
29	if let Some(&Ok(Token::ComponentPart {
30	kind: ComponentKind::Whitespace,
31	value,
32	})) = self.peek()
33	{
34	self.next(); // consume
35	Some(value)
36	} else {
37	None
38	}
39	}
40
41	/// Consume the next token if it is a component item that is not whitespace.
42	pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
43	if let Some(&Ok(Token::ComponentPart {
44	kind: ComponentKind::NotWhitespace,
45	value,
46	})) = self.peek()
47	{
48	self.next(); // consume
49	Some(value)
50	} else {
51	None
52	}
53	}
54
55	/// Consume the next token if it is an opening bracket.
56	pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
57	if let Some(&Ok(Token::Bracket {
58	kind: BracketKind::Opening,
59	location,
60	})) = self.peek()
61	{
62	self.next(); // consume
63	Some(location)
64	} else {
65	None
66	}
67	}
68
69	/// Peek at the next token if it is a closing bracket.
70	pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
71	if let Some(Ok(Token::Bracket {
72	kind: BracketKind::Closing,
73	location,
74	})) = self.peek()
75	{
76	Some(location)
77	} else {
78	None
79	}
80	}
81
82	/// Consume the next token if it is a closing bracket.
83	pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
84	if let Some(&Ok(Token::Bracket {
85	kind: BracketKind::Closing,
86	location,
87	})) = self.peek()
88	{
89	self.next(); // consume
90	Some(location)
91	} else {
92	None
93	}
94	}
95	}
96
97	/// A token emitted by the lexer. There is no semantic meaning at this stage.
98	pub(super) enum Token<'a> {
99	/// A literal string, formatted and parsed as-is.
100	Literal(Spanned<&'a [u8]>),
101	/// An opening or closing bracket. May or may not be the start or end of a component.
102	Bracket {
103	/// Whether the bracket is opening or closing.
104	kind: BracketKind,
105	/// Where the bracket was in the format string.
106	location: Location,
107	},
108	/// One part of a component. This could be its name, a modifier, or whitespace.
109	ComponentPart {
110	/// Whether the part is whitespace or not.
111	kind: ComponentKind,
112	/// The part itself.
113	value: Spanned<&'a [u8]>,
114	},
115	}
116
117	/// What type of bracket is present.
118	pub(super) enum BracketKind {
119	/// An opening bracket: `[`
120	Opening,
121	/// A closing bracket: `]`
122	Closing,
123	}
124
125	/// Indicates whether the component is whitespace or not.
126	pub(super) enum ComponentKind {
127	#[allow(clippy::missing_docs_in_private_items)]
128	Whitespace,
129	#[allow(clippy::missing_docs_in_private_items)]
130	NotWhitespace,
131	}
132
133	/// Parse the string into a series of [`Token`]s.
134	///
135	/// `VERSION` controls the version of the format description that is being parsed. Currently, this
136	/// must be 1 or 2.
137	///
138	/// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`.
139	/// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may
140	/// currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
141	/// other characters result in a lex error.
142	pub(super) fn lex<const VERSION: usize>(
143	mut input: &[u8],
144	) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
145	validate_version!(VERSION);
146
147	let mut depth: u8 = `0`;
148	let mut iter = attach_location(input.iter()).peekable();
149	let mut second_bracket_location = None;
150
151	let iter = iter::from_fn(move \|\| {
152	// The flag is only set when version is zero.
153	if version!(..=`1`) {
154	// There is a flag set to emit the second half of an escaped bracket pair.
155	if let Some(location) = second_bracket_location.take() {
156	return Some(Ok(Token::Bracket {
157	kind: BracketKind::Opening,
158	location,
159	}));
160	}
161	}
162
163	Some(Ok(match iter.next()? {
164	// possible escape sequence
165	(b'`\\`', backslash_loc) if version!(`2`..) => {
166	match iter.next() {
167	Some((b'`\\`' \| b'[' \| b']', char_loc)) => {
168	// The escaped character is emitted as-is.
169	let char = &input[`1`..`2`];
170	input = &input[`2`..];
171	if depth == `0` {
172	Token::Literal(char.spanned(backslash_loc.to(char_loc)))
173	} else {
174	Token::ComponentPart {
175	kind: ComponentKind::NotWhitespace,
176	value: char.spanned(backslash_loc.to(char_loc)),
177	}
178	}
179	}
180	Some((_, loc)) => {
181	return Some(Err(Error {
182	_inner: unused(loc.error("invalid escape sequence")),
183	public: crate::error::InvalidFormatDescription::Expected {
184	what: "valid escape sequence",
185	index: loc.byte as _,
186	},
187	}));
188	}
189	None => {
190	return Some(Err(Error {
191	_inner: unused(backslash_loc.error("unexpected end of input")),
192	public: crate::error::InvalidFormatDescription::Expected {
193	what: "valid escape sequence",
194	index: backslash_loc.byte as _,
195	},
196	}));
197	}
198	}
199	}
200	// potentially escaped opening bracket
201	(b'[', location) if version!(..=`1`) => {
202	if let Some((_, second_location)) = iter.next_if(\|&(&byte, _)\| byte == b'[') {
203	// Escaped bracket. Store the location of the second so we can emit it later.
204	second_bracket_location = Some(second_location);
205	input = &input[`2`..];
206	} else {
207	// opening bracket
208	depth += `1`;
209	input = &input[`1`..];
210	}
211
212	Token::Bracket {
213	kind: BracketKind::Opening,
214	location,
215	}
216	}
217	// opening bracket
218	(b'[', location) => {
219	depth += `1`;
220	input = &input[`1`..];
221
222	Token::Bracket {
223	kind: BracketKind::Opening,
224	location,
225	}
226	}
227	// closing bracket
228	(b']', location) if depth > `0` => {
229	depth -= `1`;
230	input = &input[`1`..];
231
232	Token::Bracket {
233	kind: BracketKind::Closing,
234	location,
235	}
236	}
237	// literal
238	(_, start_location) if depth == `0` => {
239	let mut bytes = `1`;
240	let mut end_location = start_location;
241
242	while let Some((_, location)) =
243	iter.next_if(\|&(&byte, _)\| !((version!(`2`..) && byte == b'`\\`') \|\| byte == b'['))
244	{
245	end_location = location;
246	bytes += `1`;
247	}
248
249	let value = &input[..bytes];
250	input = &input[bytes..];
251
252	Token::Literal(value.spanned(start_location.to(end_location)))
253	}
254	// component part
255	(byte, start_location) => {
256	let mut bytes = `1`;
257	let mut end_location = start_location;
258	let is_whitespace = byte.is_ascii_whitespace();
259
260	while let Some((_, location)) = iter.next_if(\|&(byte, _)\| {
261	!matches!(byte, b'`\\`' \| b'[' \| b']')
262	&& is_whitespace == byte.is_ascii_whitespace()
263	}) {
264	end_location = location;
265	bytes += `1`;
266	}
267
268	let value = &input[..bytes];
269	input = &input[bytes..];
270
271	Token::ComponentPart {
272	kind: if is_whitespace {
273	ComponentKind::Whitespace
274	} else {
275	ComponentKind::NotWhitespace
276	},
277	value: value.spanned(start_location.to(end_location)),
278	}
279	}
280	}))
281	});
282
283	Lexed {
284	iter: iter.peekable(),
285	}
286	}
287