1use std::borrow::Cow;
2use std::char;
3use std::str;
4
/// A span, designating a range of bytes where a token is located.
///
/// Both fields are byte offsets; the range is half-open, i.e. `start` is
/// included and `end` is not.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: usize,
    /// The end of the range (exclusive).
    pub end: usize,
}
13
14impl From<Span> for (usize, usize) {
15 fn from(Span { start, end }: Span) -> (usize, usize) {
16 (start, end)
17 }
18}
19
/// A single lexical token produced by the tokenizer.
///
/// Variants that carry a `&'a str` borrow directly from the tokenizer's input.
#[derive(Eq, PartialEq, Debug)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs, exactly as written in the input.
    Whitespace(&'a str),
    /// A line ending (`\n`, or `\r\n` folded into a single newline).
    Newline,
    /// A comment, sliced from the input starting at its `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string (single- or double-quoted, one-line or multiline).
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The string's value; borrowed from the input when possible, owned
        /// when escapes or line-ending folding forced a copy.
        val: Cow<'a, str>,
        /// `true` when the string was written with triple delimiters.
        multiline: bool,
    },
}
43
/// Errors reported while tokenizing.
///
/// Every `usize` payload is a byte offset into the tokenizer's input.
#[derive(Eq, PartialEq, Debug)]
pub enum Error {
    /// A character that is not allowed inside a string, and its offset.
    InvalidCharInString(usize, char),
    /// An unsupported character following a backslash, and its offset.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u`/`\U` escape, and its offset.
    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape whose numeric value is not a valid `char`; carries
    /// the offset of the `u`/`U` and the decoded value.
    InvalidEscapeValue(usize, u32),
    /// A newline found inside a non-multiline string, at the given offset.
    NewlineInString(usize),
    /// A character that cannot start any token, and its offset.
    Unexpected(usize, char),
    /// Input ended before a string was closed; carries the offset at which
    /// the string started.
    UnterminatedString(usize),
    /// A newline found inside a quoted table key, at the given offset.
    NewlineInTableKey(usize),
    /// A multiline string was used as a table key; carries the offset at
    /// which the string starts.
    MultilineStringKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the token was expected.
        at: usize,
        /// Description of what was expected.
        expected: &'static str,
        /// Description of what was found instead (`"eof"` at end of input).
        found: &'static str,
    },
}
61
/// A cursor that splits an input string into [`Token`]s.
///
/// The type is `Clone`; `peek` relies on this to look ahead by tokenizing on
/// a throwaway copy.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete input text; tokens borrow their slices from it.
    input: &'a str,
    /// The remaining characters of `input`, paired with their byte offsets.
    chars: CrlfFold<'a>,
}
67
/// A character iterator over the input whose `Iterator` implementation
/// reports a `\r\n` pair as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator.
    chars: str::CharIndices<'a>,
}
72
/// The value of a string while it is being read: either still a plain slice
/// of the input, or an owned buffer once a copy became necessary.
#[derive(Debug)]
enum MaybeString {
    /// Nothing has required a copy so far; holds the byte offset in the input
    /// at which the value starts.
    NotEscaped(usize),
    /// The value accumulated so far, as an owned string.
    Owned(String),
}
78
79impl<'a> Tokenizer<'a> {
80 pub fn new(input: &'a str) -> Tokenizer<'a> {
81 let mut t = Tokenizer {
82 input,
83 chars: CrlfFold {
84 chars: input.char_indices(),
85 },
86 };
87 // Eat utf-8 BOM
88 t.eatc('\u{feff}');
89 t
90 }
91
92 pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
93 let (start, token) = match self.one() {
94 Some((start, '\n')) => (start, Token::Newline),
95 Some((start, ' ' | '\t')) => (start, self.whitespace_token(start)),
96 Some((start, '#')) => (start, self.comment_token(start)),
97 Some((start, '=')) => (start, Token::Equals),
98 Some((start, '.')) => (start, Token::Period),
99 Some((start, ',')) => (start, Token::Comma),
100 Some((start, ':')) => (start, Token::Colon),
101 Some((start, '+')) => (start, Token::Plus),
102 Some((start, '{')) => (start, Token::LeftBrace),
103 Some((start, '}')) => (start, Token::RightBrace),
104 Some((start, '[')) => (start, Token::LeftBracket),
105 Some((start, ']')) => (start, Token::RightBracket),
106 Some((start, '\'')) => {
107 return self
108 .literal_string(start)
109 .map(|t| Some((self.step_span(start), t)))
110 }
111 Some((start, '"')) => {
112 return self
113 .basic_string(start)
114 .map(|t| Some((self.step_span(start), t)))
115 }
116 Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
117
118 Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
119 None => return Ok(None),
120 };
121
122 let span = self.step_span(start);
123 Ok(Some((span, token)))
124 }
125
126 pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
127 self.clone().next()
128 }
129
130 pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
131 self.eat_spanned(expected).map(|s| s.is_some())
132 }
133
134 /// Eat a value, returning it's span if it was consumed.
135 pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
136 let span = match self.peek()? {
137 Some((span, ref found)) if expected == *found => span,
138 Some(_) | None => return Ok(None),
139 };
140
141 drop(self.next());
142 Ok(Some(span))
143 }
144
145 pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
146 // ignore span
147 let _ = self.expect_spanned(expected)?;
148 Ok(())
149 }
150
151 /// Expect the given token returning its span.
152 pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
153 let current = self.current();
154 match self.next()? {
155 Some((span, found)) => {
156 if expected == found {
157 Ok(span)
158 } else {
159 Err(Error::Wanted {
160 at: current,
161 expected: expected.describe(),
162 found: found.describe(),
163 })
164 }
165 }
166 None => Err(Error::Wanted {
167 at: self.input.len(),
168 expected: expected.describe(),
169 found: "eof",
170 }),
171 }
172 }
173
174 pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
175 let current = self.current();
176 match self.next()? {
177 Some((span, Token::Keylike(k))) => Ok((span, k.into())),
178 Some((
179 span,
180 Token::String {
181 src,
182 val,
183 multiline,
184 },
185 )) => {
186 let offset = self.substr_offset(src);
187 if multiline {
188 return Err(Error::MultilineStringKey(offset));
189 }
190 match src.find('\n') {
191 None => Ok((span, val)),
192 Some(i) => Err(Error::NewlineInTableKey(offset + i)),
193 }
194 }
195 Some((_, other)) => Err(Error::Wanted {
196 at: current,
197 expected: "a table key",
198 found: other.describe(),
199 }),
200 None => Err(Error::Wanted {
201 at: self.input.len(),
202 expected: "a table key",
203 found: "eof",
204 }),
205 }
206 }
207
208 pub fn eat_whitespace(&mut self) {
209 while self.eatc(' ') || self.eatc('\t') {
210 // ...
211 }
212 }
213
214 pub fn eat_comment(&mut self) -> Result<bool, Error> {
215 if !self.eatc('#') {
216 return Ok(false);
217 }
218 drop(self.comment_token(0));
219 self.eat_newline_or_eof().map(|()| true)
220 }
221
222 pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
223 let current = self.current();
224 match self.next()? {
225 None | Some((_, Token::Newline)) => Ok(()),
226 Some((_, other)) => Err(Error::Wanted {
227 at: current,
228 expected: "newline",
229 found: other.describe(),
230 }),
231 }
232 }
233
234 pub fn skip_to_newline(&mut self) {
235 loop {
236 match self.one() {
237 Some((_, '\n')) | None => break,
238 _ => {}
239 }
240 }
241 }
242
243 fn eatc(&mut self, ch: char) -> bool {
244 match self.chars.clone().next() {
245 Some((_, ch2)) if ch == ch2 => {
246 self.one();
247 true
248 }
249 _ => false,
250 }
251 }
252
253 pub fn current(&mut self) -> usize {
254 match self.chars.clone().next() {
255 Some(i) => i.0,
256 None => self.input.len(),
257 }
258 }
259
260 fn whitespace_token(&mut self, start: usize) -> Token<'a> {
261 while self.eatc(' ') || self.eatc('\t') {
262 // ...
263 }
264 Token::Whitespace(&self.input[start..self.current()])
265 }
266
267 fn comment_token(&mut self, start: usize) -> Token<'a> {
268 while let Some((_, ch)) = self.chars.clone().next() {
269 if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
270 break;
271 }
272 self.one();
273 }
274 Token::Comment(&self.input[start..self.current()])
275 }
276
277 fn read_string(
278 &mut self,
279 delim: char,
280 start: usize,
281 new_ch: &mut dyn FnMut(
282 &mut Tokenizer,
283 &mut MaybeString,
284 bool,
285 usize,
286 char,
287 ) -> Result<(), Error>,
288 ) -> Result<Token<'a>, Error> {
289 let mut multiline = false;
290 if self.eatc(delim) {
291 if self.eatc(delim) {
292 multiline = true;
293 } else {
294 return Ok(Token::String {
295 src: &self.input[start..start + 2],
296 val: Cow::Borrowed(""),
297 multiline: false,
298 });
299 }
300 }
301 let mut val = MaybeString::NotEscaped(self.current());
302 let mut n = 0;
303 loop {
304 n += 1;
305 match self.one() {
306 Some((i, '\n')) => {
307 if multiline {
308 if self.input.as_bytes()[i] == b'\r' {
309 val.make_owned(&self.input[..i]);
310 }
311 if n == 1 {
312 val = MaybeString::NotEscaped(self.current());
313 } else {
314 val.push('\n');
315 }
316 } else {
317 return Err(Error::NewlineInString(i));
318 }
319 }
320 Some((mut i, ch)) if ch == delim => {
321 if multiline {
322 if !self.eatc(delim) {
323 val.push(delim);
324 continue;
325 }
326 if !self.eatc(delim) {
327 val.push(delim);
328 val.push(delim);
329 continue;
330 }
331 if self.eatc(delim) {
332 val.push(delim);
333 i += 1;
334 }
335 if self.eatc(delim) {
336 val.push(delim);
337 i += 1;
338 }
339 }
340 return Ok(Token::String {
341 src: &self.input[start..self.current()],
342 val: val.into_cow(&self.input[..i]),
343 multiline,
344 });
345 }
346 Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
347 None => return Err(Error::UnterminatedString(start)),
348 }
349 }
350 }
351
352 fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
353 self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
354 if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') {
355 val.push(ch);
356 Ok(())
357 } else {
358 Err(Error::InvalidCharInString(i, ch))
359 }
360 })
361 }
362
363 fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
364 self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
365 '\\' => {
366 val.make_owned(&me.input[..i]);
367 match me.chars.next() {
368 Some((_, '"')) => val.push('"'),
369 Some((_, '\\')) => val.push('\\'),
370 Some((_, 'b')) => val.push('\u{8}'),
371 Some((_, 'f')) => val.push('\u{c}'),
372 Some((_, 'n')) => val.push('\n'),
373 Some((_, 'r')) => val.push('\r'),
374 Some((_, 't')) => val.push('\t'),
375 Some((i, c @ ('u' | 'U'))) => {
376 let len = if c == 'u' { 4 } else { 8 };
377 val.push(me.hex(start, i, len)?);
378 }
379 Some((i, c @ (' ' | '\t' | '\n'))) if multi => {
380 if c != '\n' {
381 while let Some((_, ch)) = me.chars.clone().next() {
382 match ch {
383 ' ' | '\t' => {
384 me.chars.next();
385 continue;
386 }
387 '\n' => {
388 me.chars.next();
389 break;
390 }
391 _ => return Err(Error::InvalidEscape(i, c)),
392 }
393 }
394 }
395 while let Some((_, ch)) = me.chars.clone().next() {
396 match ch {
397 ' ' | '\t' | '\n' => {
398 me.chars.next();
399 }
400 _ => break,
401 }
402 }
403 }
404 Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
405 None => return Err(Error::UnterminatedString(start)),
406 }
407 Ok(())
408 }
409 ch if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') => {
410 val.push(ch);
411 Ok(())
412 }
413 _ => Err(Error::InvalidCharInString(i, ch)),
414 })
415 }
416
417 fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
418 let mut buf = String::with_capacity(len);
419 for _ in 0..len {
420 match self.one() {
421 Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => buf.push(ch),
422 Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
423 None => return Err(Error::UnterminatedString(start)),
424 }
425 }
426 let val = u32::from_str_radix(&buf, 16).unwrap();
427 match char::from_u32(val) {
428 Some(ch) => Ok(ch),
429 None => Err(Error::InvalidEscapeValue(i, val)),
430 }
431 }
432
433 fn keylike(&mut self, start: usize) -> Token<'a> {
434 while let Some((_, ch)) = self.peek_one() {
435 if !is_keylike(ch) {
436 break;
437 }
438 self.one();
439 }
440 Token::Keylike(&self.input[start..self.current()])
441 }
442
443 pub fn substr_offset(&self, s: &'a str) -> usize {
444 assert!(s.len() <= self.input.len());
445 let a = self.input.as_ptr() as usize;
446 let b = s.as_ptr() as usize;
447 assert!(a <= b);
448 b - a
449 }
450
451 /// Calculate the span of a single character.
452 fn step_span(&mut self, start: usize) -> Span {
453 let end = match self.peek_one() {
454 Some(t) => t.0,
455 None => self.input.len(),
456 };
457 Span { start, end }
458 }
459
460 /// Peek one char without consuming it.
461 fn peek_one(&mut self) -> Option<(usize, char)> {
462 self.chars.clone().next()
463 }
464
465 /// Take one char.
466 pub fn one(&mut self) -> Option<(usize, char)> {
467 self.chars.next()
468 }
469}
470
471impl<'a> Iterator for CrlfFold<'a> {
472 type Item = (usize, char);
473
474 fn next(&mut self) -> Option<(usize, char)> {
475 self.chars.next().map(|(i, c)| {
476 if c == '\r' {
477 let mut attempt = self.chars.clone();
478 if let Some((_, '\n')) = attempt.next() {
479 self.chars = attempt;
480 return (i, '\n');
481 }
482 }
483 (i, c)
484 })
485 }
486}
487
488impl MaybeString {
489 fn push(&mut self, ch: char) {
490 match *self {
491 MaybeString::NotEscaped(..) => {}
492 MaybeString::Owned(ref mut s) => s.push(ch),
493 }
494 }
495
496 fn make_owned(&mut self, input: &str) {
497 match *self {
498 MaybeString::NotEscaped(start) => {
499 *self = MaybeString::Owned(input[start..].to_owned());
500 }
501 MaybeString::Owned(..) => {}
502 }
503 }
504
505 fn into_cow(self, input: &str) -> Cow<str> {
506 match self {
507 MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
508 MaybeString::Owned(s) => Cow::Owned(s),
509 }
510 }
511}
512
/// Returns `true` for characters that may appear in a key-like token:
/// ASCII letters, ASCII digits, `-` and `_`.
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_')
}
520
521impl<'a> Token<'a> {
522 pub fn describe(&self) -> &'static str {
523 match *self {
524 Token::Keylike(_) => "an identifier",
525 Token::Equals => "an equals",
526 Token::Period => "a period",
527 Token::Comment(_) => "a comment",
528 Token::Newline => "a newline",
529 Token::Whitespace(_) => "whitespace",
530 Token::Comma => "a comma",
531 Token::RightBrace => "a right brace",
532 Token::LeftBrace => "a left brace",
533 Token::RightBracket => "a right bracket",
534 Token::LeftBracket => "a left bracket",
535 Token::String { multiline, .. } => {
536 if multiline {
537 "a multiline string"
538 } else {
539 "a string"
540 }
541 }
542 Token::Colon => "a colon",
543 Token::Plus => "a plus",
544 }
545 }
546}
547