1//! Lexer for parsing format descriptions.
2
3use core::iter;
4
5use super::{unused, Error, Location, Spanned, SpannedValue};
6
7/// An iterator over the lexed tokens.
8pub(super) struct Lexed<I: Iterator> {
9 /// The internal iterator.
10 iter: core::iter::Peekable<I>,
11}
12
13impl<I: Iterator> Iterator for Lexed<I> {
14 type Item = I::Item;
15
16 fn next(&mut self) -> Option<Self::Item> {
17 self.iter.next()
18 }
19}
20
21impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> {
22 /// Peek at the next item in the iterator.
23 pub(super) fn peek(&mut self) -> Option<&I::Item> {
24 self.iter.peek()
25 }
26
27 /// Consume the next token if it is whitespace.
28 pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
29 if let Some(&Ok(Token::ComponentPart {
30 kind: ComponentKind::Whitespace,
31 value,
32 })) = self.peek()
33 {
34 self.next(); // consume
35 Some(value)
36 } else {
37 None
38 }
39 }
40
41 /// Consume the next token if it is a component item that is not whitespace.
42 pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
43 if let Some(&Ok(Token::ComponentPart {
44 kind: ComponentKind::NotWhitespace,
45 value,
46 })) = self.peek()
47 {
48 self.next(); // consume
49 Some(value)
50 } else {
51 None
52 }
53 }
54
55 /// Consume the next token if it is an opening bracket.
56 pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
57 if let Some(&Ok(Token::Bracket {
58 kind: BracketKind::Opening,
59 location,
60 })) = self.peek()
61 {
62 self.next(); // consume
63 Some(location)
64 } else {
65 None
66 }
67 }
68
69 /// Peek at the next token if it is a closing bracket.
70 pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
71 if let Some(Ok(Token::Bracket {
72 kind: BracketKind::Closing,
73 location,
74 })) = self.peek()
75 {
76 Some(location)
77 } else {
78 None
79 }
80 }
81
82 /// Consume the next token if it is a closing bracket.
83 pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
84 if let Some(&Ok(Token::Bracket {
85 kind: BracketKind::Closing,
86 location,
87 })) = self.peek()
88 {
89 self.next(); // consume
90 Some(location)
91 } else {
92 None
93 }
94 }
95}
96
/// A token emitted by the lexer. There is no semantic meaning at this stage.
///
/// As produced by `lex`, the byte-carrying variants borrow from the input slice (lifetime `'a`)
/// rather than copying it.
pub(super) enum Token<'a> {
    /// A literal string, formatted and parsed as-is.
    ///
    /// The lexer produces this for bytes that lie outside of any bracket pair.
    Literal(Spanned<&'a [u8]>),
    /// An opening or closing bracket. May or may not be the start or end of a component.
    Bracket {
        /// Whether the bracket is opening or closing.
        kind: BracketKind,
        /// Where the bracket was in the format string.
        location: Location,
    },
    /// One part of a component. This could be its name, a modifier, or whitespace.
    ///
    /// The lexer produces this for bytes that lie inside a bracket pair.
    ComponentPart {
        /// Whether the part is whitespace or not.
        kind: ComponentKind,
        /// The part itself.
        value: Spanned<&'a [u8]>,
    },
}
116
/// What type of bracket is present in a [`Token::Bracket`].
pub(super) enum BracketKind {
    /// An opening bracket: `[`
    Opening,
    /// A closing bracket: `]`
    Closing,
}
124
125/// Indicates whether the component is whitespace or not.
126pub(super) enum ComponentKind {
127 #[allow(clippy::missing_docs_in_private_items)]
128 Whitespace,
129 #[allow(clippy::missing_docs_in_private_items)]
130 NotWhitespace,
131}
132
133/// Attach [`Location`] information to each byte in the iterator.
134fn attach_location<'item>(
135 iter: impl Iterator<Item = &'item u8>,
136) -> impl Iterator<Item = (&'item u8, Location)> {
137 let mut byte_pos: u32 = 0;
138
139 iter.map(move |byte: &u8| {
140 let location: Location = Location { byte: byte_pos };
141 byte_pos += 1;
142 (byte, location)
143 })
144}
145
146/// Parse the string into a series of [`Token`]s.
147///
148/// `VERSION` controls the version of the format description that is being parsed. Currently, this
149/// must be 1 or 2.
150///
151/// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`.
152/// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may
153/// currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
154/// other characters result in a lex error.
155pub(super) fn lex<const VERSION: usize>(
156 mut input: &[u8],
157) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
158 validate_version!(VERSION);
159
160 let mut depth: u8 = 0;
161 let mut iter = attach_location(input.iter()).peekable();
162 let mut second_bracket_location = None;
163
164 let iter = iter::from_fn(move || {
165 // The flag is only set when version is zero.
166 if version!(..=1) {
167 // There is a flag set to emit the second half of an escaped bracket pair.
168 if let Some(location) = second_bracket_location.take() {
169 return Some(Ok(Token::Bracket {
170 kind: BracketKind::Opening,
171 location,
172 }));
173 }
174 }
175
176 Some(Ok(match iter.next()? {
177 // possible escape sequence
178 (b'\\', backslash_loc) if version!(2..) => {
179 match iter.next() {
180 Some((b'\\' | b'[' | b']', char_loc)) => {
181 // The escaped character is emitted as-is.
182 let char = &input[1..2];
183 input = &input[2..];
184 if depth == 0 {
185 Token::Literal(char.spanned(backslash_loc.to(char_loc)))
186 } else {
187 Token::ComponentPart {
188 kind: ComponentKind::NotWhitespace,
189 value: char.spanned(backslash_loc.to(char_loc)),
190 }
191 }
192 }
193 Some((_, loc)) => {
194 return Some(Err(Error {
195 _inner: unused(loc.error("invalid escape sequence")),
196 public: crate::error::InvalidFormatDescription::Expected {
197 what: "valid escape sequence",
198 index: loc.byte as _,
199 },
200 }));
201 }
202 None => {
203 return Some(Err(Error {
204 _inner: unused(backslash_loc.error("unexpected end of input")),
205 public: crate::error::InvalidFormatDescription::Expected {
206 what: "valid escape sequence",
207 index: backslash_loc.byte as _,
208 },
209 }));
210 }
211 }
212 }
213 // potentially escaped opening bracket
214 (b'[', location) if version!(..=1) => {
215 if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[') {
216 // Escaped bracket. Store the location of the second so we can emit it later.
217 second_bracket_location = Some(second_location);
218 input = &input[2..];
219 } else {
220 // opening bracket
221 depth += 1;
222 input = &input[1..];
223 }
224
225 Token::Bracket {
226 kind: BracketKind::Opening,
227 location,
228 }
229 }
230 // opening bracket
231 (b'[', location) => {
232 depth += 1;
233 input = &input[1..];
234
235 Token::Bracket {
236 kind: BracketKind::Opening,
237 location,
238 }
239 }
240 // closing bracket
241 (b']', location) if depth > 0 => {
242 depth -= 1;
243 input = &input[1..];
244
245 Token::Bracket {
246 kind: BracketKind::Closing,
247 location,
248 }
249 }
250 // literal
251 (_, start_location) if depth == 0 => {
252 let mut bytes = 1;
253 let mut end_location = start_location;
254
255 while let Some((_, location)) =
256 iter.next_if(|&(&byte, _)| !((version!(2..) && byte == b'\\') || byte == b'['))
257 {
258 end_location = location;
259 bytes += 1;
260 }
261
262 let value = &input[..bytes];
263 input = &input[bytes..];
264
265 Token::Literal(value.spanned(start_location.to(end_location)))
266 }
267 // component part
268 (byte, start_location) => {
269 let mut bytes = 1;
270 let mut end_location = start_location;
271 let is_whitespace = byte.is_ascii_whitespace();
272
273 while let Some((_, location)) = iter.next_if(|&(byte, _)| {
274 !matches!(byte, b'\\' | b'[' | b']')
275 && is_whitespace == byte.is_ascii_whitespace()
276 }) {
277 end_location = location;
278 bytes += 1;
279 }
280
281 let value = &input[..bytes];
282 input = &input[bytes..];
283
284 Token::ComponentPart {
285 kind: if is_whitespace {
286 ComponentKind::Whitespace
287 } else {
288 ComponentKind::NotWhitespace
289 },
290 value: value.spanned(start_location.to(end_location)),
291 }
292 }
293 }))
294 });
295
296 Lexed {
297 iter: iter.peekable(),
298 }
299}
300