1use std::borrow::Cow;
2use std::char;
3use std::str;
4use std::string;
5use std::string::String as StdString;
6
7use self::Token::*;
8
/// A span, designating a range of bytes where a token is located.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: usize,
    /// The end of the range (exclusive).
    pub end: usize,
}

/// Converts a [`Span`] into a `(start, end)` tuple.
impl From<Span> for (usize, usize) {
    fn from(Span { start, end }: Span) -> (usize, usize) {
        // Destructure with field shorthand so the bindings are named
        // `start` and `end`, matching the tuple built below.
        (start, end)
    }
}
23
/// A single lexical token produced by the tokenizer.
///
/// Variants carrying a `&'a str` borrow their text from the input string.
#[derive(Eq, PartialEq, Debug)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line ending.
    Newline,
    /// A comment, including its leading `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, ASCII digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string, either `'…'` or `"…"`, possibly triple-quoted.
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The string's value; borrowed from the input when no copy was
        /// needed, owned otherwise.
        val: Cow<'a, str>,
        /// Whether the string was written with triple delimiters.
        multiline: bool,
    },
}
47
/// Errors reported by the tokenizer.
///
/// The `usize` payloads are byte offsets into the input.
#[derive(Eq, PartialEq, Debug)]
pub enum Error {
    /// A character that is not allowed inside a string, with its offset.
    InvalidCharInString(usize, char),
    /// A backslash followed by a character that is not a known escape.
    InvalidEscape(usize, char),
    /// A non-hex-digit character inside a `\u` / `\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u` / `\U` escape whose numeric value is not a valid `char`.
    InvalidEscapeValue(usize, u32),
    /// A newline inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// The input ended inside a string; the offset is where the string began.
    UnterminatedString(usize),
    /// A string used as a table key contains a newline.
    NewlineInTableKey(usize),
    /// A multiline string was used as a table key.
    MultilineStringKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the expected token should have started.
        at: usize,
        /// Description of what was expected.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
65
/// Splits an input string into tokens.
///
/// The type is `Clone`; `peek` relies on this to look ahead without
/// consuming input.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete input being tokenized.
    input: &'a str,
    /// Cursor over the characters of `input` that have not been consumed yet.
    chars: CrlfFold<'a>,
}
71
/// Character cursor used by the tokenizer; its `Iterator` implementation
/// reports a `\r\n` pair as a single `'\n'`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the input.
    chars: str::CharIndices<'a>,
}
76
/// The value of a string token while it is being read.
#[derive(Debug)]
enum MaybeString {
    /// Nothing has required a copy so far; the payload is the byte offset in
    /// the input at which the value starts, so it can be borrowed later.
    NotEscaped(usize),
    /// The value has been copied into an owned buffer that characters are
    /// appended to.
    Owned(string::String),
}
82
83impl<'a> Tokenizer<'a> {
84 pub fn new(input: &'a str) -> Tokenizer<'a> {
85 let mut t = Tokenizer {
86 input,
87 chars: CrlfFold {
88 chars: input.char_indices(),
89 },
90 };
91 // Eat utf-8 BOM
92 t.eatc('\u{feff}');
93 t
94 }
95
96 pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
97 let (start, token) = match self.one() {
98 Some((start, '\n')) => (start, Newline),
99 Some((start, ' ')) => (start, self.whitespace_token(start)),
100 Some((start, '\t')) => (start, self.whitespace_token(start)),
101 Some((start, '#')) => (start, self.comment_token(start)),
102 Some((start, '=')) => (start, Equals),
103 Some((start, '.')) => (start, Period),
104 Some((start, ',')) => (start, Comma),
105 Some((start, ':')) => (start, Colon),
106 Some((start, '+')) => (start, Plus),
107 Some((start, '{')) => (start, LeftBrace),
108 Some((start, '}')) => (start, RightBrace),
109 Some((start, '[')) => (start, LeftBracket),
110 Some((start, ']')) => (start, RightBracket),
111 Some((start, '\'')) => {
112 return self
113 .literal_string(start)
114 .map(|t| Some((self.step_span(start), t)))
115 }
116 Some((start, '"')) => {
117 return self
118 .basic_string(start)
119 .map(|t| Some((self.step_span(start), t)))
120 }
121 Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
122
123 Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
124 None => return Ok(None),
125 };
126
127 let span = self.step_span(start);
128 Ok(Some((span, token)))
129 }
130
131 pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
132 self.clone().next()
133 }
134
135 pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
136 self.eat_spanned(expected).map(|s| s.is_some())
137 }
138
139 /// Eat a value, returning it's span if it was consumed.
140 pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
141 let span = match self.peek()? {
142 Some((span, ref found)) if expected == *found => span,
143 Some(_) => return Ok(None),
144 None => return Ok(None),
145 };
146
147 drop(self.next());
148 Ok(Some(span))
149 }
150
151 pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
152 // ignore span
153 let _ = self.expect_spanned(expected)?;
154 Ok(())
155 }
156
157 /// Expect the given token returning its span.
158 pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
159 let current = self.current();
160 match self.next()? {
161 Some((span, found)) => {
162 if expected == found {
163 Ok(span)
164 } else {
165 Err(Error::Wanted {
166 at: current,
167 expected: expected.describe(),
168 found: found.describe(),
169 })
170 }
171 }
172 None => Err(Error::Wanted {
173 at: self.input.len(),
174 expected: expected.describe(),
175 found: "eof",
176 }),
177 }
178 }
179
180 pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
181 let current = self.current();
182 match self.next()? {
183 Some((span, Token::Keylike(k))) => Ok((span, k.into())),
184 Some((
185 span,
186 Token::String {
187 src,
188 val,
189 multiline,
190 },
191 )) => {
192 let offset = self.substr_offset(src);
193 if multiline {
194 return Err(Error::MultilineStringKey(offset));
195 }
196 match src.find('\n') {
197 None => Ok((span, val)),
198 Some(i) => Err(Error::NewlineInTableKey(offset + i)),
199 }
200 }
201 Some((_, other)) => Err(Error::Wanted {
202 at: current,
203 expected: "a table key",
204 found: other.describe(),
205 }),
206 None => Err(Error::Wanted {
207 at: self.input.len(),
208 expected: "a table key",
209 found: "eof",
210 }),
211 }
212 }
213
214 pub fn eat_whitespace(&mut self) -> Result<(), Error> {
215 while self.eatc(' ') || self.eatc('\t') {
216 // ...
217 }
218 Ok(())
219 }
220
221 pub fn eat_comment(&mut self) -> Result<bool, Error> {
222 if !self.eatc('#') {
223 return Ok(false);
224 }
225 drop(self.comment_token(0));
226 self.eat_newline_or_eof().map(|()| true)
227 }
228
229 pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
230 let current = self.current();
231 match self.next()? {
232 None | Some((_, Token::Newline)) => Ok(()),
233 Some((_, other)) => Err(Error::Wanted {
234 at: current,
235 expected: "newline",
236 found: other.describe(),
237 }),
238 }
239 }
240
241 pub fn skip_to_newline(&mut self) {
242 loop {
243 match self.one() {
244 Some((_, '\n')) | None => break,
245 _ => {}
246 }
247 }
248 }
249
250 fn eatc(&mut self, ch: char) -> bool {
251 match self.chars.clone().next() {
252 Some((_, ch2)) if ch == ch2 => {
253 self.one();
254 true
255 }
256 _ => false,
257 }
258 }
259
260 pub fn current(&mut self) -> usize {
261 self.chars
262 .clone()
263 .next()
264 .map(|i| i.0)
265 .unwrap_or_else(|| self.input.len())
266 }
267
/// Returns the complete input string this tokenizer was created with.
pub fn input(&self) -> &'a str {
    self.input
}
271
272 fn whitespace_token(&mut self, start: usize) -> Token<'a> {
273 while self.eatc(' ') || self.eatc('\t') {
274 // ...
275 }
276 Whitespace(&self.input[start..self.current()])
277 }
278
279 fn comment_token(&mut self, start: usize) -> Token<'a> {
280 while let Some((_, ch)) = self.chars.clone().next() {
281 if ch != '\t' && !('\u{20}'..='\u{10ffff}').contains(&ch) {
282 break;
283 }
284 self.one();
285 }
286 Comment(&self.input[start..self.current()])
287 }
288
/// Reads the remainder of a string and returns it as a `Token::String`.
///
/// `start` is the byte offset of the opening delimiter, which the caller has
/// already consumed, and `delim` is the quote character. `new_ch` is called
/// for every character that is neither a newline nor `delim`; it receives
/// the tokenizer, the value built so far, whether the string is multiline,
/// and the character's byte offset and value, and either records the
/// character or rejects it with an error.
///
/// # Errors
///
/// Returns `Error::NewlineInString` for a newline inside a single-line
/// string, `Error::UnterminatedString` if the input ends before the closing
/// delimiter, and any error returned by `new_ch`.
#[allow(clippy::type_complexity)]
fn read_string(
    &mut self,
    delim: char,
    start: usize,
    new_ch: &mut dyn FnMut(
        &mut Tokenizer<'_>,
        &mut MaybeString,
        bool,
        usize,
        char,
    ) -> Result<(), Error>,
) -> Result<Token<'a>, Error> {
    let mut multiline = false;
    // A second delimiter right away means either an empty string (two
    // delimiters in total) or the opening of a multiline string (three).
    if self.eatc(delim) {
        if self.eatc(delim) {
            multiline = true;
        } else {
            // Empty single-line string: the source is just two delimiters.
            return Ok(String {
                src: &self.input[start..start + 2],
                val: Cow::Borrowed(""),
                multiline: false,
            });
        }
    }
    // Until something forces a copy, the value is tracked only as the
    // offset at which it starts in the input.
    let mut val = MaybeString::NotEscaped(self.current());
    // Number of characters read so far; used to recognize a newline that
    // directly follows the opening delimiters.
    let mut n = 0;
    'outer: loop {
        n += 1;
        match self.one() {
            Some((i, '\n')) => {
                if multiline {
                    // The cursor folds `\r\n` into `'\n'` but reports the
                    // offset of the `\r`. The value must contain only
                    // `'\n'`, so it can no longer be a verbatim slice.
                    if self.input.as_bytes()[i] == b'\r' {
                        val.to_owned(&self.input[..i]);
                    }
                    if n == 1 {
                        // A newline immediately after the opening
                        // delimiters is dropped: restart the value after it.
                        val = MaybeString::NotEscaped(self.current());
                    } else {
                        val.push('\n');
                    }
                    continue;
                } else {
                    return Err(Error::NewlineInString(i));
                }
            }
            Some((mut i, ch)) if ch == delim => {
                if multiline {
                    // One or two delimiters in a row are ordinary content.
                    if !self.eatc(delim) {
                        val.push(delim);
                        continue 'outer;
                    }
                    if !self.eatc(delim) {
                        val.push(delim);
                        val.push(delim);
                        continue 'outer;
                    }
                    // Three delimiters close the string, but up to two
                    // further delimiters mean the first ones of the run
                    // were content. `i` is advanced so that a borrowed
                    // value includes them.
                    if self.eatc(delim) {
                        val.push(delim);
                        i += 1;
                    }
                    if self.eatc(delim) {
                        val.push(delim);
                        i += 1;
                    }
                }
                // A borrowed value ends at `i`, just before the closing
                // delimiter(s).
                return Ok(String {
                    src: &self.input[start..self.current()],
                    val: val.into_cow(&self.input[..i]),
                    multiline,
                });
            }
            Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
            None => return Err(Error::UnterminatedString(start)),
        }
    }
}
365
366 fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
367 self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
368 if ch == '\u{09}' || (('\u{20}'..='\u{10ffff}').contains(&ch) && ch != '\u{7f}') {
369 val.push(ch);
370 Ok(())
371 } else {
372 Err(Error::InvalidCharInString(i, ch))
373 }
374 })
375 }
376
/// Reads a double-quoted string whose opening quote sits at `start`,
/// processing backslash escapes.
///
/// # Errors
///
/// Returns `Error::InvalidEscape` for an unknown escape,
/// `Error::InvalidCharInString` for a disallowed character,
/// `Error::UnterminatedString` if the input ends right after a backslash,
/// and any error from `hex` or `read_string`.
fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
    self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
        '\\' => {
            // An escape makes the value differ from the source text, so
            // switch to an owned buffer holding everything read so far.
            val.to_owned(&me.input[..i]);
            match me.chars.next() {
                Some((_, '"')) => val.push('"'),
                Some((_, '\\')) => val.push('\\'),
                Some((_, 'b')) => val.push('\u{8}'),
                Some((_, 'f')) => val.push('\u{c}'),
                Some((_, 'n')) => val.push('\n'),
                Some((_, 'r')) => val.push('\r'),
                Some((_, 't')) => val.push('\t'),
                Some((i, c @ 'u')) | Some((i, c @ 'U')) => {
                    // `\u` is followed by 4 hex digits, `\U` by 8.
                    let len = if c == 'u' { 4 } else { 8 };
                    val.push(me.hex(start, i, len)?);
                }
                // In a multiline string a backslash followed by whitespace
                // or a newline contributes nothing to the value.
                Some((i, c @ ' ')) | Some((i, c @ '\t')) | Some((i, c @ '\n')) if multi => {
                    if c != '\n' {
                        // Only spaces and tabs may sit between the
                        // backslash and the end of its line.
                        while let Some((_, ch)) = me.chars.clone().next() {
                            match ch {
                                ' ' | '\t' => {
                                    me.chars.next();
                                    continue;
                                }
                                '\n' => {
                                    me.chars.next();
                                    break;
                                }
                                _ => return Err(Error::InvalidEscape(i, c)),
                            }
                        }
                    }
                    // Skip all following spaces, tabs and newlines.
                    while let Some((_, ch)) = me.chars.clone().next() {
                        match ch {
                            ' ' | '\t' | '\n' => {
                                me.chars.next();
                            }
                            _ => break,
                        }
                    }
                }
                Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
                None => return Err(Error::UnterminatedString(start)),
            }
            Ok(())
        }
        // Tab, and everything at or above U+0020 except U+007F, is taken
        // verbatim.
        ch if ch == '\u{09}' || (('\u{20}'..='\u{10ffff}').contains(&ch) && ch != '\u{7f}') => {
            val.push(ch);
            Ok(())
        }
        _ => Err(Error::InvalidCharInString(i, ch)),
    })
}
430
431 fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
432 let mut buf = StdString::with_capacity(len);
433 for _ in 0..len {
434 match self.one() {
435 Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => buf.push(ch),
436 Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
437 None => return Err(Error::UnterminatedString(start)),
438 }
439 }
440 let val = u32::from_str_radix(&buf, 16).unwrap();
441 match char::from_u32(val) {
442 Some(ch) => Ok(ch),
443 None => Err(Error::InvalidEscapeValue(i, val)),
444 }
445 }
446
447 fn keylike(&mut self, start: usize) -> Token<'a> {
448 while let Some((_, ch)) = self.peek_one() {
449 if !is_keylike(ch) {
450 break;
451 }
452 self.one();
453 }
454 Keylike(&self.input[start..self.current()])
455 }
456
457 pub fn substr_offset(&self, s: &'a str) -> usize {
458 assert!(s.len() <= self.input.len());
459 let a = self.input.as_ptr() as usize;
460 let b = s.as_ptr() as usize;
461 assert!(a <= b);
462 b - a
463 }
464
465 /// Calculate the span of a single character.
466 fn step_span(&mut self, start: usize) -> Span {
467 let end = self
468 .peek_one()
469 .map(|t| t.0)
470 .unwrap_or_else(|| self.input.len());
471 Span { start, end }
472 }
473
474 /// Peek one char without consuming it.
475 fn peek_one(&mut self) -> Option<(usize, char)> {
476 self.chars.clone().next()
477 }
478
/// Take one char.
///
/// Consumes and returns the next `(byte offset, char)` pair from the
/// cursor, or `None` at end of input.
pub fn one(&mut self) -> Option<(usize, char)> {
    self.chars.next()
}
483}
484
485impl<'a> Iterator for CrlfFold<'a> {
486 type Item = (usize, char);
487
488 fn next(&mut self) -> Option<(usize, char)> {
489 self.chars.next().map(|(i: usize, c: char)| {
490 if c == '\r' {
491 let mut attempt: CharIndices<'_> = self.chars.clone();
492 if let Some((_, '\n')) = attempt.next() {
493 self.chars = attempt;
494 return (i, '\n');
495 }
496 }
497 (i, c)
498 })
499 }
500}
501
502impl MaybeString {
503 fn push(&mut self, ch: char) {
504 match *self {
505 MaybeString::NotEscaped(..) => {}
506 MaybeString::Owned(ref mut s) => s.push(ch),
507 }
508 }
509
510 #[allow(clippy::wrong_self_convention)]
511 fn to_owned(&mut self, input: &str) {
512 match *self {
513 MaybeString::NotEscaped(start) => {
514 *self = MaybeString::Owned(input[start..].to_owned());
515 }
516 MaybeString::Owned(..) => {}
517 }
518 }
519
520 fn into_cow(self, input: &str) -> Cow<'_, str> {
521 match self {
522 MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
523 MaybeString::Owned(s) => Cow::Owned(s),
524 }
525 }
526}
527
/// Returns `true` for characters that may appear in a `Keylike` token:
/// ASCII letters, ASCII digits, `-` and `_`.
fn is_keylike(ch: char) -> bool {
    match ch {
        'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' => true,
        _ => false,
    }
}
535
536impl<'a> Token<'a> {
537 pub fn describe(&self) -> &'static str {
538 match *self {
539 Token::Keylike(_) => "an identifier",
540 Token::Equals => "an equals",
541 Token::Period => "a period",
542 Token::Comment(_) => "a comment",
543 Token::Newline => "a newline",
544 Token::Whitespace(_) => "whitespace",
545 Token::Comma => "a comma",
546 Token::RightBrace => "a right brace",
547 Token::LeftBrace => "a left brace",
548 Token::RightBracket => "a right bracket",
549 Token::LeftBracket => "a left bracket",
550 Token::String { multiline, .. } => {
551 if multiline {
552 "a multiline string"
553 } else {
554 "a string"
555 }
556 }
557 Token::Colon => "a colon",
558 Token::Plus => "a plus",
559 }
560 }
561}
562
563#[cfg(test)]
564mod tests {
565 use super::{Error, Token, Tokenizer};
566 use std::borrow::Cow;
567
568 fn err(input: &str, err: Error) {
569 let mut t = Tokenizer::new(input);
570 let token = t.next().unwrap_err();
571 assert_eq!(token, err);
572 assert!(t.next().unwrap().is_none());
573 }
574
#[test]
fn literal_strings() {
    /// Checks that `input` tokenizes to exactly one string token spanning
    /// the whole input, with the given value and multiline flag.
    fn t(input: &str, val: &str, multiline: bool) {
        let mut tokenizer = Tokenizer::new(input);
        let (_, token) = tokenizer.next().unwrap().unwrap();
        let expected = Token::String {
            src: input,
            val: Cow::Borrowed(val),
            multiline,
        };
        assert_eq!(token, expected);
        assert!(tokenizer.next().unwrap().is_none());
    }

    // (input, expected value, expected multiline flag)
    let cases: &[(&str, &str, bool)] = &[
        ("''", "", false),
        ("''''''", "", true),
        ("'''\n'''", "", true),
        ("'a'", "a", false),
        ("'\"a'", "\"a", false),
        ("''''a'''", "'a", true),
        ("'''\n'a\n'''", "'a\n", true),
        ("'''a\n'a\r\n'''", "a\n'a\n", true),
    ];
    for &(input, val, multiline) in cases {
        t(input, val, multiline);
    }
}
600
/// Exercises double-quoted strings: plain content, every supported escape,
/// multiline trimming, and the error cases.
#[test]
fn basic_strings() {
    /// Checks that `input` tokenizes to exactly one string token spanning
    /// the whole input, with the given value and multiline flag.
    fn t(input: &str, val: &str, multiline: bool) {
        let mut t = Tokenizer::new(input);
        let (_, token) = t.next().unwrap().unwrap();
        assert_eq!(
            token,
            Token::String {
                src: input,
                val: Cow::Borrowed(val),
                multiline,
            }
        );
        assert!(t.next().unwrap().is_none());
    }

    // Well-formed strings.
    t(r#""""#, "", false);
    t(r#""""""""#, "", true);
    t(r#""a""#, "a", false);
    t(r#""""a""""#, "a", true);
    t(r#""\t""#, "\t", false);
    t(r#""\u0000""#, "\0", false);
    t(r#""\U00000000""#, "\0", false);
    t(r#""\U000A0000""#, "\u{A0000}", false);
    t(r#""\\t""#, "\\t", false);
    t("\"\t\"", "\t", false);
    t("\"\"\"\n\t\"\"\"", "\t", true);
    // A backslash at the end of a line swallows the following whitespace.
    t("\"\"\"\\\n\"\"\"", "", true);
    t(
        "\"\"\"\\\n \t \t \\\r\n \t \n \t \r\n\"\"\"",
        "",
        true,
    );
    t(r#""\r""#, "\r", false);
    t(r#""\n""#, "\n", false);
    t(r#""\b""#, "\u{8}", false);
    t(r#""a\fa""#, "a\u{c}a", false);
    t(r#""\"a""#, "\"a", false);
    t("\"\"\"\na\"\"\"", "a", true);
    t("\"\"\"\n\"\"\"", "", true);
    t(r#""""a\"""b""""#, "a\"\"\"b", true);
    // Malformed strings.
    err(r#""\a"#, Error::InvalidEscape(2, 'a'));
    err("\"\\\n", Error::InvalidEscape(2, '\n'));
    err("\"\\\r\n", Error::InvalidEscape(2, '\n'));
    err("\"\\", Error::UnterminatedString(0));
    err("\"\u{0}", Error::InvalidCharInString(1, '\u{0}'));
    err(r#""\U00""#, Error::InvalidHexEscape(5, '"'));
    err(r#""\U00"#, Error::UnterminatedString(0));
    err(r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800));
    err(r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffff_ffff));
}
652
#[test]
fn keylike() {
    /// Checks that the whole of `input` tokenizes to a single `Keylike`
    /// token.
    fn t(input: &str) {
        let mut tokenizer = Tokenizer::new(input);
        let (_, token) = tokenizer.next().unwrap().unwrap();
        assert_eq!(token, Token::Keylike(input));
        assert!(tokenizer.next().unwrap().is_none());
    }

    let inputs = ["foo", "0bar", "bar0", "1234", "a-b", "a_B", "-_-", "___"];
    for input in inputs.iter() {
        t(input);
    }
}
670
/// Tokenizes inputs that mix several token kinds and checks every span,
/// token, and the source text each span covers.
#[test]
fn all() {
    /// Tokenizes `input` to the end and compares the result with `expected`,
    /// a list of `((start, end), token, text covered by the span)`.
    fn t(input: &str, expected: &[((usize, usize), Token<'_>, &str)]) {
        let mut tokens = Tokenizer::new(input);
        let mut actual: Vec<((usize, usize), Token<'_>, &str)> = Vec::new();
        while let Some((span, token)) = tokens.next().unwrap() {
            actual.push((span.into(), token, &input[span.start..span.end]));
        }
        // Compare pairwise first so a mismatch reports the offending entry,
        // then make sure neither list is longer than the other.
        for (a, b) in actual.iter().zip(expected) {
            assert_eq!(a, b);
        }
        assert_eq!(actual.len(), expected.len());
    }

    t(
        " a ",
        &[
            ((0, 1), Token::Whitespace(" "), " "),
            ((1, 2), Token::Keylike("a"), "a"),
            ((2, 3), Token::Whitespace(" "), " "),
        ],
    );

    t(
        " a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ",
        &[
            ((0, 1), Token::Whitespace(" "), " "),
            ((1, 2), Token::Keylike("a"), "a"),
            ((2, 4), Token::Whitespace("\t "), "\t "),
            ((4, 5), Token::LeftBracket, "["),
            ((5, 6), Token::LeftBracket, "["),
            ((6, 7), Token::RightBracket, "]"),
            ((7, 8), Token::RightBracket, "]"),
            ((8, 11), Token::Whitespace(" \t "), " \t "),
            ((11, 12), Token::LeftBracket, "["),
            ((12, 13), Token::RightBracket, "]"),
            ((13, 14), Token::Whitespace(" "), " "),
            ((14, 15), Token::LeftBrace, "{"),
            ((15, 16), Token::RightBrace, "}"),
            ((16, 17), Token::Whitespace(" "), " "),
            ((17, 18), Token::Comma, ","),
            ((18, 19), Token::Whitespace(" "), " "),
            ((19, 20), Token::Period, "."),
            ((20, 21), Token::Whitespace(" "), " "),
            ((21, 22), Token::Equals, "="),
            ((22, 23), Token::Newline, "\n"),
            ((23, 29), Token::Comment("# foo "), "# foo "),
            // A `\r\n` pair is one `Newline` token spanning both bytes.
            ((29, 31), Token::Newline, "\r\n"),
            ((31, 36), Token::Comment("#foo "), "#foo "),
            ((36, 37), Token::Newline, "\n"),
            ((37, 38), Token::Whitespace(" "), " "),
        ],
    );
}
725
/// Checks assorted inputs that must be rejected on the very first token: a
/// bare carriage return, NUL characters, and broken single-quoted strings.
#[test]
fn bare_cr_bad() {
    let cases = vec![
        ("\r", Error::Unexpected(0, '\r')),
        ("'\n", Error::NewlineInString(1)),
        ("'\u{0}", Error::InvalidCharInString(1, '\u{0}')),
        ("'", Error::UnterminatedString(0)),
        ("\u{0}", Error::Unexpected(0, '\u{0}')),
    ];
    for (input, expected) in cases {
        err(input, expected);
    }
}
734
/// A NUL character ends a comment token and is then rejected as an
/// unexpected character.
#[test]
fn bad_comment() {
    let mut tokenizer = Tokenizer::new("#\u{0}");
    // The first token (the comment up to the NUL) must be read successfully.
    let _comment = tokenizer.next().unwrap().unwrap();
    let second = tokenizer.next();
    assert_eq!(second, Err(Error::Unexpected(1, '\u{0}')));
    let third = tokenizer.next().unwrap();
    assert!(third.is_none());
}
742}
743