| 1 | //! Lexer for parsing format descriptions. |
| 2 | |
| 3 | use core::iter; |
| 4 | |
| 5 | use super::{attach_location, unused, Error, Location, Spanned, SpannedValue}; |
| 6 | |
| 7 | /// An iterator over the lexed tokens. |
| 8 | pub(super) struct Lexed<I: Iterator> { |
| 9 | /// The internal iterator. |
| 10 | iter: iter::Peekable<I>, |
| 11 | } |
| 12 | |
| 13 | impl<I: Iterator> Iterator for Lexed<I> { |
| 14 | type Item = I::Item; |
| 15 | |
| 16 | fn next(&mut self) -> Option<Self::Item> { |
| 17 | self.iter.next() |
| 18 | } |
| 19 | } |
| 20 | |
| 21 | impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> { |
| 22 | /// Peek at the next item in the iterator. |
| 23 | pub(super) fn peek(&mut self) -> Option<&I::Item> { |
| 24 | self.iter.peek() |
| 25 | } |
| 26 | |
| 27 | /// Consume the next token if it is whitespace. |
| 28 | pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> { |
| 29 | if let Some(&Ok(Token::ComponentPart { |
| 30 | kind: ComponentKind::Whitespace, |
| 31 | value, |
| 32 | })) = self.peek() |
| 33 | { |
| 34 | self.next(); // consume |
| 35 | Some(value) |
| 36 | } else { |
| 37 | None |
| 38 | } |
| 39 | } |
| 40 | |
| 41 | /// Consume the next token if it is a component item that is not whitespace. |
| 42 | pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> { |
| 43 | if let Some(&Ok(Token::ComponentPart { |
| 44 | kind: ComponentKind::NotWhitespace, |
| 45 | value, |
| 46 | })) = self.peek() |
| 47 | { |
| 48 | self.next(); // consume |
| 49 | Some(value) |
| 50 | } else { |
| 51 | None |
| 52 | } |
| 53 | } |
| 54 | |
| 55 | /// Consume the next token if it is an opening bracket. |
| 56 | pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> { |
| 57 | if let Some(&Ok(Token::Bracket { |
| 58 | kind: BracketKind::Opening, |
| 59 | location, |
| 60 | })) = self.peek() |
| 61 | { |
| 62 | self.next(); // consume |
| 63 | Some(location) |
| 64 | } else { |
| 65 | None |
| 66 | } |
| 67 | } |
| 68 | |
| 69 | /// Peek at the next token if it is a closing bracket. |
| 70 | pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> { |
| 71 | if let Some(Ok(Token::Bracket { |
| 72 | kind: BracketKind::Closing, |
| 73 | location, |
| 74 | })) = self.peek() |
| 75 | { |
| 76 | Some(location) |
| 77 | } else { |
| 78 | None |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | /// Consume the next token if it is a closing bracket. |
| 83 | pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> { |
| 84 | if let Some(&Ok(Token::Bracket { |
| 85 | kind: BracketKind::Closing, |
| 86 | location, |
| 87 | })) = self.peek() |
| 88 | { |
| 89 | self.next(); // consume |
| 90 | Some(location) |
| 91 | } else { |
| 92 | None |
| 93 | } |
| 94 | } |
| 95 | } |
| 96 | |
| 97 | /// A token emitted by the lexer. There is no semantic meaning at this stage. |
| 98 | pub(super) enum Token<'a> { |
| 99 | /// A literal string, formatted and parsed as-is. |
| 100 | Literal(Spanned<&'a [u8]>), |
| 101 | /// An opening or closing bracket. May or may not be the start or end of a component. |
| 102 | Bracket { |
| 103 | /// Whether the bracket is opening or closing. |
| 104 | kind: BracketKind, |
| 105 | /// Where the bracket was in the format string. |
| 106 | location: Location, |
| 107 | }, |
| 108 | /// One part of a component. This could be its name, a modifier, or whitespace. |
| 109 | ComponentPart { |
| 110 | /// Whether the part is whitespace or not. |
| 111 | kind: ComponentKind, |
| 112 | /// The part itself. |
| 113 | value: Spanned<&'a [u8]>, |
| 114 | }, |
| 115 | } |
| 116 | |
| 117 | /// What type of bracket is present. |
| 118 | pub(super) enum BracketKind { |
| 119 | /// An opening bracket: `[` |
| 120 | Opening, |
| 121 | /// A closing bracket: `]` |
| 122 | Closing, |
| 123 | } |
| 124 | |
| 125 | /// Indicates whether the component is whitespace or not. |
| 126 | pub(super) enum ComponentKind { |
| 127 | #[allow (clippy::missing_docs_in_private_items)] |
| 128 | Whitespace, |
| 129 | #[allow (clippy::missing_docs_in_private_items)] |
| 130 | NotWhitespace, |
| 131 | } |
| 132 | |
| 133 | /// Parse the string into a series of [`Token`]s. |
| 134 | /// |
| 135 | /// `VERSION` controls the version of the format description that is being parsed. Currently, this |
| 136 | /// must be 1 or 2. |
| 137 | /// |
| 138 | /// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`. |
| 139 | /// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may |
| 140 | /// currently follow are `\`, `[`, and `]`, all of which result in the literal character. All |
| 141 | /// other characters result in a lex error. |
| 142 | pub(super) fn lex<const VERSION: usize>( |
| 143 | mut input: &[u8], |
| 144 | ) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> { |
| 145 | validate_version!(VERSION); |
| 146 | |
| 147 | let mut depth: u8 = 0; |
| 148 | let mut iter = attach_location(input.iter()).peekable(); |
| 149 | let mut second_bracket_location = None; |
| 150 | |
| 151 | let iter = iter::from_fn(move || { |
| 152 | // The flag is only set when version is zero. |
| 153 | if version!(..=1) { |
| 154 | // There is a flag set to emit the second half of an escaped bracket pair. |
| 155 | if let Some(location) = second_bracket_location.take() { |
| 156 | return Some(Ok(Token::Bracket { |
| 157 | kind: BracketKind::Opening, |
| 158 | location, |
| 159 | })); |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | Some(Ok(match iter.next()? { |
| 164 | // possible escape sequence |
| 165 | (b' \\' , backslash_loc) if version!(2..) => { |
| 166 | match iter.next() { |
| 167 | Some((b' \\' | b'[' | b']' , char_loc)) => { |
| 168 | // The escaped character is emitted as-is. |
| 169 | let char = &input[1..2]; |
| 170 | input = &input[2..]; |
| 171 | if depth == 0 { |
| 172 | Token::Literal(char.spanned(backslash_loc.to(char_loc))) |
| 173 | } else { |
| 174 | Token::ComponentPart { |
| 175 | kind: ComponentKind::NotWhitespace, |
| 176 | value: char.spanned(backslash_loc.to(char_loc)), |
| 177 | } |
| 178 | } |
| 179 | } |
| 180 | Some((_, loc)) => { |
| 181 | return Some(Err(Error { |
| 182 | _inner: unused(loc.error("invalid escape sequence" )), |
| 183 | public: crate::error::InvalidFormatDescription::Expected { |
| 184 | what: "valid escape sequence" , |
| 185 | index: loc.byte as _, |
| 186 | }, |
| 187 | })); |
| 188 | } |
| 189 | None => { |
| 190 | return Some(Err(Error { |
| 191 | _inner: unused(backslash_loc.error("unexpected end of input" )), |
| 192 | public: crate::error::InvalidFormatDescription::Expected { |
| 193 | what: "valid escape sequence" , |
| 194 | index: backslash_loc.byte as _, |
| 195 | }, |
| 196 | })); |
| 197 | } |
| 198 | } |
| 199 | } |
| 200 | // potentially escaped opening bracket |
| 201 | (b'[' , location) if version!(..=1) => { |
| 202 | if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[' ) { |
| 203 | // Escaped bracket. Store the location of the second so we can emit it later. |
| 204 | second_bracket_location = Some(second_location); |
| 205 | input = &input[2..]; |
| 206 | } else { |
| 207 | // opening bracket |
| 208 | depth += 1; |
| 209 | input = &input[1..]; |
| 210 | } |
| 211 | |
| 212 | Token::Bracket { |
| 213 | kind: BracketKind::Opening, |
| 214 | location, |
| 215 | } |
| 216 | } |
| 217 | // opening bracket |
| 218 | (b'[' , location) => { |
| 219 | depth += 1; |
| 220 | input = &input[1..]; |
| 221 | |
| 222 | Token::Bracket { |
| 223 | kind: BracketKind::Opening, |
| 224 | location, |
| 225 | } |
| 226 | } |
| 227 | // closing bracket |
| 228 | (b']' , location) if depth > 0 => { |
| 229 | depth -= 1; |
| 230 | input = &input[1..]; |
| 231 | |
| 232 | Token::Bracket { |
| 233 | kind: BracketKind::Closing, |
| 234 | location, |
| 235 | } |
| 236 | } |
| 237 | // literal |
| 238 | (_, start_location) if depth == 0 => { |
| 239 | let mut bytes = 1; |
| 240 | let mut end_location = start_location; |
| 241 | |
| 242 | while let Some((_, location)) = |
| 243 | iter.next_if(|&(&byte, _)| !((version!(2..) && byte == b' \\' ) || byte == b'[' )) |
| 244 | { |
| 245 | end_location = location; |
| 246 | bytes += 1; |
| 247 | } |
| 248 | |
| 249 | let value = &input[..bytes]; |
| 250 | input = &input[bytes..]; |
| 251 | |
| 252 | Token::Literal(value.spanned(start_location.to(end_location))) |
| 253 | } |
| 254 | // component part |
| 255 | (byte, start_location) => { |
| 256 | let mut bytes = 1; |
| 257 | let mut end_location = start_location; |
| 258 | let is_whitespace = byte.is_ascii_whitespace(); |
| 259 | |
| 260 | while let Some((_, location)) = iter.next_if(|&(byte, _)| { |
| 261 | !matches!(byte, b' \\' | b'[' | b']' ) |
| 262 | && is_whitespace == byte.is_ascii_whitespace() |
| 263 | }) { |
| 264 | end_location = location; |
| 265 | bytes += 1; |
| 266 | } |
| 267 | |
| 268 | let value = &input[..bytes]; |
| 269 | input = &input[bytes..]; |
| 270 | |
| 271 | Token::ComponentPart { |
| 272 | kind: if is_whitespace { |
| 273 | ComponentKind::Whitespace |
| 274 | } else { |
| 275 | ComponentKind::NotWhitespace |
| 276 | }, |
| 277 | value: value.spanned(start_location.to(end_location)), |
| 278 | } |
| 279 | } |
| 280 | })) |
| 281 | }); |
| 282 | |
| 283 | Lexed { |
| 284 | iter: iter.peekable(), |
| 285 | } |
| 286 | } |
| 287 | |