1 | use std::borrow::Cow; |
2 | use std::char; |
3 | use std::str; |
4 | |
/// A span, designating a range of bytes where a token is located.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: usize,
    /// The end of the range (exclusive).
    pub end: usize,
}

/// Converts a [`Span`] into a `(start, end)` tuple of byte offsets.
impl From<Span> for (usize, usize) {
    // Destructure with field shorthand: the fields must be bound to `start`
    // and `end`, the names the body uses, not renamed to something else.
    fn from(Span { start, end }: Span) -> (usize, usize) {
        (start, end)
    }
}
19 | |
/// A single lexical token produced by the tokenizer.
///
/// Borrowed payloads point into the input text the tokenizer was created
/// with.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line break (`\n`, or `\r\n` folded into one newline).
    Newline,
    /// A comment, starting at its `#` and running to the end of the line.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string, either literal (`'...'`) or basic (`"..."`).
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The string's value; borrowed from the input when no decoding was
        /// needed, owned otherwise.
        val: Cow<'a, str>,
        /// Whether the string used triple-delimiter (multiline) syntax.
        multiline: bool,
    },
}
43 | |
/// Errors the tokenizer can report.
///
/// Every `usize` payload is a byte offset into the input.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not allowed to appear unescaped inside a string.
    InvalidCharInString(usize, char),
    /// A backslash was followed by a character that is not a known escape.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u` / `\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u` / `\U` escape whose numeric value is not a valid `char`.
    InvalidEscapeValue(usize, u32),
    /// A newline inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// End of input was reached inside a string; the offset is where the
    /// string started.
    UnterminatedString(usize),
    /// A string used as a table key contains a newline.
    NewlineInTableKey(usize),
    /// A multiline string was used as a table key.
    MultilineStringKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the expected token should have started.
        at: usize,
        /// Description of what was expected.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
61 | |
62 | #[derive (Clone)] |
63 | pub struct Tokenizer<'a> { |
64 | input: &'a str, |
65 | chars: CrlfFold<'a>, |
66 | } |
67 | |
/// A character cursor wrapping `str::CharIndices`.
///
/// Its `Iterator` implementation yields a `\r\n` pair as a single `'\n'`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator.
    chars: str::CharIndices<'a>,
}
72 | |
/// The value of a string literal while it is being read.
#[derive(Debug)]
enum MaybeString {
    /// Nothing has needed decoding so far; the value is a slice of the input
    /// starting at this byte offset.
    NotEscaped(usize),
    /// The value differs from the source text and is accumulated here.
    Owned(String),
}
78 | |
79 | impl<'a> Tokenizer<'a> { |
80 | pub fn new(input: &'a str) -> Tokenizer<'a> { |
81 | let mut t = Tokenizer { |
82 | input, |
83 | chars: CrlfFold { |
84 | chars: input.char_indices(), |
85 | }, |
86 | }; |
87 | // Eat utf-8 BOM |
88 | t.eatc(' \u{feff}' ); |
89 | t |
90 | } |
91 | |
92 | pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> { |
93 | let (start, token) = match self.one() { |
94 | Some((start, ' \n' )) => (start, Token::Newline), |
95 | Some((start, ' ' | ' \t' )) => (start, self.whitespace_token(start)), |
96 | Some((start, '#' )) => (start, self.comment_token(start)), |
97 | Some((start, '=' )) => (start, Token::Equals), |
98 | Some((start, '.' )) => (start, Token::Period), |
99 | Some((start, ',' )) => (start, Token::Comma), |
100 | Some((start, ':' )) => (start, Token::Colon), |
101 | Some((start, '+' )) => (start, Token::Plus), |
102 | Some((start, '{' )) => (start, Token::LeftBrace), |
103 | Some((start, '}' )) => (start, Token::RightBrace), |
104 | Some((start, '[' )) => (start, Token::LeftBracket), |
105 | Some((start, ']' )) => (start, Token::RightBracket), |
106 | Some((start, ' \'' )) => { |
107 | return self |
108 | .literal_string(start) |
109 | .map(|t| Some((self.step_span(start), t))) |
110 | } |
111 | Some((start, '"' )) => { |
112 | return self |
113 | .basic_string(start) |
114 | .map(|t| Some((self.step_span(start), t))) |
115 | } |
116 | Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)), |
117 | |
118 | Some((start, ch)) => return Err(Error::Unexpected(start, ch)), |
119 | None => return Ok(None), |
120 | }; |
121 | |
122 | let span = self.step_span(start); |
123 | Ok(Some((span, token))) |
124 | } |
125 | |
126 | pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> { |
127 | self.clone().next() |
128 | } |
129 | |
130 | pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> { |
131 | self.eat_spanned(expected).map(|s| s.is_some()) |
132 | } |
133 | |
134 | /// Eat a value, returning it's span if it was consumed. |
135 | pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> { |
136 | let span = match self.peek()? { |
137 | Some((span, ref found)) if expected == *found => span, |
138 | Some(_) | None => return Ok(None), |
139 | }; |
140 | |
141 | drop(self.next()); |
142 | Ok(Some(span)) |
143 | } |
144 | |
145 | pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> { |
146 | // ignore span |
147 | let _ = self.expect_spanned(expected)?; |
148 | Ok(()) |
149 | } |
150 | |
151 | /// Expect the given token returning its span. |
152 | pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> { |
153 | let current = self.current(); |
154 | match self.next()? { |
155 | Some((span, found)) => { |
156 | if expected == found { |
157 | Ok(span) |
158 | } else { |
159 | Err(Error::Wanted { |
160 | at: current, |
161 | expected: expected.describe(), |
162 | found: found.describe(), |
163 | }) |
164 | } |
165 | } |
166 | None => Err(Error::Wanted { |
167 | at: self.input.len(), |
168 | expected: expected.describe(), |
169 | found: "eof" , |
170 | }), |
171 | } |
172 | } |
173 | |
174 | pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> { |
175 | let current = self.current(); |
176 | match self.next()? { |
177 | Some((span, Token::Keylike(k))) => Ok((span, k.into())), |
178 | Some(( |
179 | span, |
180 | Token::String { |
181 | src, |
182 | val, |
183 | multiline, |
184 | }, |
185 | )) => { |
186 | let offset = self.substr_offset(src); |
187 | if multiline { |
188 | return Err(Error::MultilineStringKey(offset)); |
189 | } |
190 | match src.find(' \n' ) { |
191 | None => Ok((span, val)), |
192 | Some(i) => Err(Error::NewlineInTableKey(offset + i)), |
193 | } |
194 | } |
195 | Some((_, other)) => Err(Error::Wanted { |
196 | at: current, |
197 | expected: "a table key" , |
198 | found: other.describe(), |
199 | }), |
200 | None => Err(Error::Wanted { |
201 | at: self.input.len(), |
202 | expected: "a table key" , |
203 | found: "eof" , |
204 | }), |
205 | } |
206 | } |
207 | |
208 | pub fn eat_whitespace(&mut self) { |
209 | while self.eatc(' ' ) || self.eatc(' \t' ) { |
210 | // ... |
211 | } |
212 | } |
213 | |
214 | pub fn eat_comment(&mut self) -> Result<bool, Error> { |
215 | if !self.eatc('#' ) { |
216 | return Ok(false); |
217 | } |
218 | drop(self.comment_token(0)); |
219 | self.eat_newline_or_eof().map(|()| true) |
220 | } |
221 | |
222 | pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> { |
223 | let current = self.current(); |
224 | match self.next()? { |
225 | None | Some((_, Token::Newline)) => Ok(()), |
226 | Some((_, other)) => Err(Error::Wanted { |
227 | at: current, |
228 | expected: "newline" , |
229 | found: other.describe(), |
230 | }), |
231 | } |
232 | } |
233 | |
234 | pub fn skip_to_newline(&mut self) { |
235 | loop { |
236 | match self.one() { |
237 | Some((_, ' \n' )) | None => break, |
238 | _ => {} |
239 | } |
240 | } |
241 | } |
242 | |
243 | fn eatc(&mut self, ch: char) -> bool { |
244 | match self.chars.clone().next() { |
245 | Some((_, ch2)) if ch == ch2 => { |
246 | self.one(); |
247 | true |
248 | } |
249 | _ => false, |
250 | } |
251 | } |
252 | |
253 | pub fn current(&mut self) -> usize { |
254 | match self.chars.clone().next() { |
255 | Some(i) => i.0, |
256 | None => self.input.len(), |
257 | } |
258 | } |
259 | |
260 | fn whitespace_token(&mut self, start: usize) -> Token<'a> { |
261 | while self.eatc(' ' ) || self.eatc(' \t' ) { |
262 | // ... |
263 | } |
264 | Token::Whitespace(&self.input[start..self.current()]) |
265 | } |
266 | |
267 | fn comment_token(&mut self, start: usize) -> Token<'a> { |
268 | while let Some((_, ch)) = self.chars.clone().next() { |
269 | if ch != ' \t' && (ch < ' \u{20}' || ch > ' \u{10ffff}' ) { |
270 | break; |
271 | } |
272 | self.one(); |
273 | } |
274 | Token::Comment(&self.input[start..self.current()]) |
275 | } |
276 | |
277 | fn read_string( |
278 | &mut self, |
279 | delim: char, |
280 | start: usize, |
281 | new_ch: &mut dyn FnMut( |
282 | &mut Tokenizer, |
283 | &mut MaybeString, |
284 | bool, |
285 | usize, |
286 | char, |
287 | ) -> Result<(), Error>, |
288 | ) -> Result<Token<'a>, Error> { |
289 | let mut multiline = false; |
290 | if self.eatc(delim) { |
291 | if self.eatc(delim) { |
292 | multiline = true; |
293 | } else { |
294 | return Ok(Token::String { |
295 | src: &self.input[start..start + 2], |
296 | val: Cow::Borrowed("" ), |
297 | multiline: false, |
298 | }); |
299 | } |
300 | } |
301 | let mut val = MaybeString::NotEscaped(self.current()); |
302 | let mut n = 0; |
303 | loop { |
304 | n += 1; |
305 | match self.one() { |
306 | Some((i, ' \n' )) => { |
307 | if multiline { |
308 | if self.input.as_bytes()[i] == b' \r' { |
309 | val.make_owned(&self.input[..i]); |
310 | } |
311 | if n == 1 { |
312 | val = MaybeString::NotEscaped(self.current()); |
313 | } else { |
314 | val.push(' \n' ); |
315 | } |
316 | } else { |
317 | return Err(Error::NewlineInString(i)); |
318 | } |
319 | } |
320 | Some((mut i, ch)) if ch == delim => { |
321 | if multiline { |
322 | if !self.eatc(delim) { |
323 | val.push(delim); |
324 | continue; |
325 | } |
326 | if !self.eatc(delim) { |
327 | val.push(delim); |
328 | val.push(delim); |
329 | continue; |
330 | } |
331 | if self.eatc(delim) { |
332 | val.push(delim); |
333 | i += 1; |
334 | } |
335 | if self.eatc(delim) { |
336 | val.push(delim); |
337 | i += 1; |
338 | } |
339 | } |
340 | return Ok(Token::String { |
341 | src: &self.input[start..self.current()], |
342 | val: val.into_cow(&self.input[..i]), |
343 | multiline, |
344 | }); |
345 | } |
346 | Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?, |
347 | None => return Err(Error::UnterminatedString(start)), |
348 | } |
349 | } |
350 | } |
351 | |
352 | fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> { |
353 | self.read_string(' \'' , start, &mut |_me, val, _multi, i, ch| { |
354 | if ch == ' \u{09}' || (' \u{20}' <= ch && ch <= ' \u{10ffff}' && ch != ' \u{7f}' ) { |
355 | val.push(ch); |
356 | Ok(()) |
357 | } else { |
358 | Err(Error::InvalidCharInString(i, ch)) |
359 | } |
360 | }) |
361 | } |
362 | |
363 | fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> { |
364 | self.read_string('"' , start, &mut |me, val, multi, i, ch| match ch { |
365 | ' \\' => { |
366 | val.make_owned(&me.input[..i]); |
367 | match me.chars.next() { |
368 | Some((_, '"' )) => val.push('"' ), |
369 | Some((_, ' \\' )) => val.push(' \\' ), |
370 | Some((_, 'b' )) => val.push(' \u{8}' ), |
371 | Some((_, 'f' )) => val.push(' \u{c}' ), |
372 | Some((_, 'n' )) => val.push(' \n' ), |
373 | Some((_, 'r' )) => val.push(' \r' ), |
374 | Some((_, 't' )) => val.push(' \t' ), |
375 | Some((i, c @ ('u' | 'U' ))) => { |
376 | let len = if c == 'u' { 4 } else { 8 }; |
377 | val.push(me.hex(start, i, len)?); |
378 | } |
379 | Some((i, c @ (' ' | ' \t' | ' \n' ))) if multi => { |
380 | if c != ' \n' { |
381 | while let Some((_, ch)) = me.chars.clone().next() { |
382 | match ch { |
383 | ' ' | ' \t' => { |
384 | me.chars.next(); |
385 | continue; |
386 | } |
387 | ' \n' => { |
388 | me.chars.next(); |
389 | break; |
390 | } |
391 | _ => return Err(Error::InvalidEscape(i, c)), |
392 | } |
393 | } |
394 | } |
395 | while let Some((_, ch)) = me.chars.clone().next() { |
396 | match ch { |
397 | ' ' | ' \t' | ' \n' => { |
398 | me.chars.next(); |
399 | } |
400 | _ => break, |
401 | } |
402 | } |
403 | } |
404 | Some((i, c)) => return Err(Error::InvalidEscape(i, c)), |
405 | None => return Err(Error::UnterminatedString(start)), |
406 | } |
407 | Ok(()) |
408 | } |
409 | ch if ch == ' \u{09}' || (' \u{20}' <= ch && ch <= ' \u{10ffff}' && ch != ' \u{7f}' ) => { |
410 | val.push(ch); |
411 | Ok(()) |
412 | } |
413 | _ => Err(Error::InvalidCharInString(i, ch)), |
414 | }) |
415 | } |
416 | |
417 | fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> { |
418 | let mut buf = String::with_capacity(len); |
419 | for _ in 0..len { |
420 | match self.one() { |
421 | Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => buf.push(ch), |
422 | Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)), |
423 | None => return Err(Error::UnterminatedString(start)), |
424 | } |
425 | } |
426 | let val = u32::from_str_radix(&buf, 16).unwrap(); |
427 | match char::from_u32(val) { |
428 | Some(ch) => Ok(ch), |
429 | None => Err(Error::InvalidEscapeValue(i, val)), |
430 | } |
431 | } |
432 | |
433 | fn keylike(&mut self, start: usize) -> Token<'a> { |
434 | while let Some((_, ch)) = self.peek_one() { |
435 | if !is_keylike(ch) { |
436 | break; |
437 | } |
438 | self.one(); |
439 | } |
440 | Token::Keylike(&self.input[start..self.current()]) |
441 | } |
442 | |
443 | pub fn substr_offset(&self, s: &'a str) -> usize { |
444 | assert!(s.len() <= self.input.len()); |
445 | let a = self.input.as_ptr() as usize; |
446 | let b = s.as_ptr() as usize; |
447 | assert!(a <= b); |
448 | b - a |
449 | } |
450 | |
451 | /// Calculate the span of a single character. |
452 | fn step_span(&mut self, start: usize) -> Span { |
453 | let end = match self.peek_one() { |
454 | Some(t) => t.0, |
455 | None => self.input.len(), |
456 | }; |
457 | Span { start, end } |
458 | } |
459 | |
460 | /// Peek one char without consuming it. |
461 | fn peek_one(&mut self) -> Option<(usize, char)> { |
462 | self.chars.clone().next() |
463 | } |
464 | |
465 | /// Take one char. |
466 | pub fn one(&mut self) -> Option<(usize, char)> { |
467 | self.chars.next() |
468 | } |
469 | } |
470 | |
471 | impl<'a> Iterator for CrlfFold<'a> { |
472 | type Item = (usize, char); |
473 | |
474 | fn next(&mut self) -> Option<(usize, char)> { |
475 | self.chars.next().map(|(i: usize, c: char)| { |
476 | if c == ' \r' { |
477 | let mut attempt: CharIndices<'_> = self.chars.clone(); |
478 | if let Some((_, ' \n' )) = attempt.next() { |
479 | self.chars = attempt; |
480 | return (i, ' \n' ); |
481 | } |
482 | } |
483 | (i, c) |
484 | }) |
485 | } |
486 | } |
487 | |
488 | impl MaybeString { |
489 | fn push(&mut self, ch: char) { |
490 | match *self { |
491 | MaybeString::NotEscaped(..) => {} |
492 | MaybeString::Owned(ref mut s: &mut String) => s.push(ch), |
493 | } |
494 | } |
495 | |
496 | fn make_owned(&mut self, input: &str) { |
497 | match *self { |
498 | MaybeString::NotEscaped(start: usize) => { |
499 | *self = MaybeString::Owned(input[start..].to_owned()); |
500 | } |
501 | MaybeString::Owned(..) => {} |
502 | } |
503 | } |
504 | |
505 | fn into_cow(self, input: &str) -> Cow<str> { |
506 | match self { |
507 | MaybeString::NotEscaped(start: usize) => Cow::Borrowed(&input[start..]), |
508 | MaybeString::Owned(s: String) => Cow::Owned(s), |
509 | } |
510 | } |
511 | } |
512 | |
/// Returns `true` if `ch` may appear in a keylike token: an ASCII letter, an
/// ASCII digit, `-` or `_`.
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_')
}
520 | |
521 | impl<'a> Token<'a> { |
522 | pub fn describe(&self) -> &'static str { |
523 | match *self { |
524 | Token::Keylike(_) => "an identifier" , |
525 | Token::Equals => "an equals" , |
526 | Token::Period => "a period" , |
527 | Token::Comment(_) => "a comment" , |
528 | Token::Newline => "a newline" , |
529 | Token::Whitespace(_) => "whitespace" , |
530 | Token::Comma => "a comma" , |
531 | Token::RightBrace => "a right brace" , |
532 | Token::LeftBrace => "a left brace" , |
533 | Token::RightBracket => "a right bracket" , |
534 | Token::LeftBracket => "a left bracket" , |
535 | Token::String { multiline, .. } => { |
536 | if multiline { |
537 | "a multiline string" |
538 | } else { |
539 | "a string" |
540 | } |
541 | } |
542 | Token::Colon => "a colon" , |
543 | Token::Plus => "a plus" , |
544 | } |
545 | } |
546 | } |
547 | |