1 | //! Lexer for parsing format descriptions. |
2 | |
3 | use core::iter; |
4 | |
5 | use super::{unused, Error, Location, Spanned, SpannedValue}; |
6 | |
7 | /// An iterator over the lexed tokens. |
8 | pub(super) struct Lexed<I: Iterator> { |
9 | /// The internal iterator. |
10 | iter: core::iter::Peekable<I>, |
11 | } |
12 | |
13 | impl<I: Iterator> Iterator for Lexed<I> { |
14 | type Item = I::Item; |
15 | |
16 | fn next(&mut self) -> Option<Self::Item> { |
17 | self.iter.next() |
18 | } |
19 | } |
20 | |
21 | impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> { |
22 | /// Peek at the next item in the iterator. |
23 | pub(super) fn peek(&mut self) -> Option<&I::Item> { |
24 | self.iter.peek() |
25 | } |
26 | |
27 | /// Consume the next token if it is whitespace. |
28 | pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> { |
29 | if let Some(&Ok(Token::ComponentPart { |
30 | kind: ComponentKind::Whitespace, |
31 | value, |
32 | })) = self.peek() |
33 | { |
34 | self.next(); // consume |
35 | Some(value) |
36 | } else { |
37 | None |
38 | } |
39 | } |
40 | |
41 | /// Consume the next token if it is a component item that is not whitespace. |
42 | pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> { |
43 | if let Some(&Ok(Token::ComponentPart { |
44 | kind: ComponentKind::NotWhitespace, |
45 | value, |
46 | })) = self.peek() |
47 | { |
48 | self.next(); // consume |
49 | Some(value) |
50 | } else { |
51 | None |
52 | } |
53 | } |
54 | |
55 | /// Consume the next token if it is an opening bracket. |
56 | pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> { |
57 | if let Some(&Ok(Token::Bracket { |
58 | kind: BracketKind::Opening, |
59 | location, |
60 | })) = self.peek() |
61 | { |
62 | self.next(); // consume |
63 | Some(location) |
64 | } else { |
65 | None |
66 | } |
67 | } |
68 | |
69 | /// Peek at the next token if it is a closing bracket. |
70 | pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> { |
71 | if let Some(Ok(Token::Bracket { |
72 | kind: BracketKind::Closing, |
73 | location, |
74 | })) = self.peek() |
75 | { |
76 | Some(location) |
77 | } else { |
78 | None |
79 | } |
80 | } |
81 | |
82 | /// Consume the next token if it is a closing bracket. |
83 | pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> { |
84 | if let Some(&Ok(Token::Bracket { |
85 | kind: BracketKind::Closing, |
86 | location, |
87 | })) = self.peek() |
88 | { |
89 | self.next(); // consume |
90 | Some(location) |
91 | } else { |
92 | None |
93 | } |
94 | } |
95 | } |
96 | |
/// A token emitted by the lexer. There is no semantic meaning at this stage.
///
/// The `'a` lifetime is that of the byte slice being lexed: every value carried by a token is a
/// sub-slice of that input rather than a copy.
pub(super) enum Token<'a> {
    /// A literal string, formatted and parsed as-is.
    ///
    /// Emitted for runs of bytes outside of any bracket pair, and for escaped characters that
    /// appear outside of any bracket pair.
    Literal(Spanned<&'a [u8]>),
    /// An opening or closing bracket. May or may not be the start or end of a component.
    ///
    /// When lexing version 1, an escaped `[[` is emitted as two consecutive opening brackets.
    Bracket {
        /// Whether the bracket is opening or closing.
        kind: BracketKind,
        /// Where the bracket was in the format string.
        location: Location,
    },
    /// One part of a component. This could be its name, a modifier, or whitespace.
    ///
    /// Only emitted while the lexer is inside at least one bracket pair.
    ComponentPart {
        /// Whether the part is whitespace or not.
        kind: ComponentKind,
        /// The part itself.
        value: Spanned<&'a [u8]>,
    },
}
116 | |
/// What type of bracket is present.
///
/// Carried by [`Token::Bracket`] to distinguish the two bracket characters.
pub(super) enum BracketKind {
    /// An opening bracket: `[`
    Opening,
    /// A closing bracket: `]`
    Closing,
}
124 | |
125 | /// Indicates whether the component is whitespace or not. |
126 | pub(super) enum ComponentKind { |
127 | #[allow (clippy::missing_docs_in_private_items)] |
128 | Whitespace, |
129 | #[allow (clippy::missing_docs_in_private_items)] |
130 | NotWhitespace, |
131 | } |
132 | |
133 | /// Attach [`Location`] information to each byte in the iterator. |
134 | fn attach_location<'item>( |
135 | iter: impl Iterator<Item = &'item u8>, |
136 | ) -> impl Iterator<Item = (&'item u8, Location)> { |
137 | let mut byte_pos: u32 = 0; |
138 | |
139 | iter.map(move |byte: &u8| { |
140 | let location: Location = Location { byte: byte_pos }; |
141 | byte_pos += 1; |
142 | (byte, location) |
143 | }) |
144 | } |
145 | |
146 | /// Parse the string into a series of [`Token`]s. |
147 | /// |
148 | /// `VERSION` controls the version of the format description that is being parsed. Currently, this |
149 | /// must be 1 or 2. |
150 | /// |
151 | /// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`. |
152 | /// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may |
153 | /// currently follow are `\`, `[`, and `]`, all of which result in the literal character. All |
154 | /// other characters result in a lex error. |
155 | pub(super) fn lex<const VERSION: usize>( |
156 | mut input: &[u8], |
157 | ) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> { |
158 | validate_version!(VERSION); |
159 | |
160 | let mut depth: u8 = 0; |
161 | let mut iter = attach_location(input.iter()).peekable(); |
162 | let mut second_bracket_location = None; |
163 | |
164 | let iter = iter::from_fn(move || { |
165 | // The flag is only set when version is zero. |
166 | if version!(..=1) { |
167 | // There is a flag set to emit the second half of an escaped bracket pair. |
168 | if let Some(location) = second_bracket_location.take() { |
169 | return Some(Ok(Token::Bracket { |
170 | kind: BracketKind::Opening, |
171 | location, |
172 | })); |
173 | } |
174 | } |
175 | |
176 | Some(Ok(match iter.next()? { |
177 | // possible escape sequence |
178 | (b' \\' , backslash_loc) if version!(2..) => { |
179 | match iter.next() { |
180 | Some((b' \\' | b'[' | b']' , char_loc)) => { |
181 | // The escaped character is emitted as-is. |
182 | let char = &input[1..2]; |
183 | input = &input[2..]; |
184 | if depth == 0 { |
185 | Token::Literal(char.spanned(backslash_loc.to(char_loc))) |
186 | } else { |
187 | Token::ComponentPart { |
188 | kind: ComponentKind::NotWhitespace, |
189 | value: char.spanned(backslash_loc.to(char_loc)), |
190 | } |
191 | } |
192 | } |
193 | Some((_, loc)) => { |
194 | return Some(Err(Error { |
195 | _inner: unused(loc.error("invalid escape sequence" )), |
196 | public: crate::error::InvalidFormatDescription::Expected { |
197 | what: "valid escape sequence" , |
198 | index: loc.byte as _, |
199 | }, |
200 | })); |
201 | } |
202 | None => { |
203 | return Some(Err(Error { |
204 | _inner: unused(backslash_loc.error("unexpected end of input" )), |
205 | public: crate::error::InvalidFormatDescription::Expected { |
206 | what: "valid escape sequence" , |
207 | index: backslash_loc.byte as _, |
208 | }, |
209 | })); |
210 | } |
211 | } |
212 | } |
213 | // potentially escaped opening bracket |
214 | (b'[' , location) if version!(..=1) => { |
215 | if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[' ) { |
216 | // Escaped bracket. Store the location of the second so we can emit it later. |
217 | second_bracket_location = Some(second_location); |
218 | input = &input[2..]; |
219 | } else { |
220 | // opening bracket |
221 | depth += 1; |
222 | input = &input[1..]; |
223 | } |
224 | |
225 | Token::Bracket { |
226 | kind: BracketKind::Opening, |
227 | location, |
228 | } |
229 | } |
230 | // opening bracket |
231 | (b'[' , location) => { |
232 | depth += 1; |
233 | input = &input[1..]; |
234 | |
235 | Token::Bracket { |
236 | kind: BracketKind::Opening, |
237 | location, |
238 | } |
239 | } |
240 | // closing bracket |
241 | (b']' , location) if depth > 0 => { |
242 | depth -= 1; |
243 | input = &input[1..]; |
244 | |
245 | Token::Bracket { |
246 | kind: BracketKind::Closing, |
247 | location, |
248 | } |
249 | } |
250 | // literal |
251 | (_, start_location) if depth == 0 => { |
252 | let mut bytes = 1; |
253 | let mut end_location = start_location; |
254 | |
255 | while let Some((_, location)) = |
256 | iter.next_if(|&(&byte, _)| !((version!(2..) && byte == b' \\' ) || byte == b'[' )) |
257 | { |
258 | end_location = location; |
259 | bytes += 1; |
260 | } |
261 | |
262 | let value = &input[..bytes]; |
263 | input = &input[bytes..]; |
264 | |
265 | Token::Literal(value.spanned(start_location.to(end_location))) |
266 | } |
267 | // component part |
268 | (byte, start_location) => { |
269 | let mut bytes = 1; |
270 | let mut end_location = start_location; |
271 | let is_whitespace = byte.is_ascii_whitespace(); |
272 | |
273 | while let Some((_, location)) = iter.next_if(|&(byte, _)| { |
274 | !matches!(byte, b' \\' | b'[' | b']' ) |
275 | && is_whitespace == byte.is_ascii_whitespace() |
276 | }) { |
277 | end_location = location; |
278 | bytes += 1; |
279 | } |
280 | |
281 | let value = &input[..bytes]; |
282 | input = &input[bytes..]; |
283 | |
284 | Token::ComponentPart { |
285 | kind: if is_whitespace { |
286 | ComponentKind::Whitespace |
287 | } else { |
288 | ComponentKind::NotWhitespace |
289 | }, |
290 | value: value.spanned(start_location.to(end_location)), |
291 | } |
292 | } |
293 | })) |
294 | }); |
295 | |
296 | Lexed { |
297 | iter: iter.peekable(), |
298 | } |
299 | } |
300 | |