1 | use anyhow::{bail, Result}; |
2 | use std::char; |
3 | use std::fmt; |
4 | use std::str; |
5 | use unicode_xid::UnicodeXID; |
6 | |
7 | use self::Token::*; |
8 | |
9 | #[derive (Clone)] |
10 | pub struct Tokenizer<'a> { |
11 | input: &'a str, |
12 | span_offset: u32, |
13 | chars: CrlfFold<'a>, |
14 | require_f32_f64: bool, |
15 | } |
16 | |
/// Wrapper around a `(byte index, char)` iterator; its `Iterator` impl reports
/// a `\r\n` pair as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying, unfolded character iterator.
    chars: str::CharIndices<'a>,
}
21 | |
/// A half-open byte range `[start, end)` identifying where a token is located.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Span {
    /// First byte of the range.
    pub start: u32,
    /// One past the last byte of the range (exclusive).
    pub end: u32,
}
30 | |
/// The kinds of token the lexer can produce.
///
/// A token carries no text of its own; the text is recovered from the
/// accompanying `Span`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Token {
    // Trivia (skipped by `Tokenizer::next`, returned by `Tokenizer::next_raw`).
    Whitespace,
    Comment,

    // Punctuation.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,

    // Identifiers: plain, and `%`-prefixed.
    Id,
    ExplicitId,

    // A run of ASCII digits.
    Integer,

    // More keywords.
    Include,
    With,
}
103 | |
/// Errors produced while lexing or validating identifiers.
///
/// The `u32` carried by each variant (and the `at` field) is the position
/// reported for the error, in the same coordinate space as `Span`.
// NOTE(review): `dead_code` is presumably allowed because not every variant is
// constructed here (nothing in this file builds `InvalidEscape`) — confirm.
#[derive(Debug, Eq, PartialEq)]
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character it may not contain.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty `-`-separated part.
    IdPartEmpty(u32),
    /// An invalid escape was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/* ...` comment was still open at end of input.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        /// Where the unexpected token (or end of input) is located.
        at: u32,
        /// Description of the token that was required.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
118 | |
/// Value used for `require_f32_f64` by `Tokenizer::new` when the caller passes
/// `None` and the `WIT_REQUIRE_F32_F64` environment variable cannot be read.
// NB: keep in sync with `crates/wit-component/src/printing.rs`.
const REQUIRE_F32_F64_BY_DEFAULT: bool = true;
121 | |
122 | impl<'a> Tokenizer<'a> { |
123 | pub fn new( |
124 | input: &'a str, |
125 | span_offset: u32, |
126 | require_f32_f64: Option<bool>, |
127 | ) -> Result<Tokenizer<'a>> { |
128 | detect_invalid_input(input)?; |
129 | |
130 | let mut t = Tokenizer { |
131 | input, |
132 | span_offset, |
133 | chars: CrlfFold { |
134 | chars: input.char_indices(), |
135 | }, |
136 | require_f32_f64: require_f32_f64.unwrap_or_else(|| { |
137 | match std::env::var("WIT_REQUIRE_F32_F64" ) { |
138 | Ok(s) => s == "1" , |
139 | Err(_) => REQUIRE_F32_F64_BY_DEFAULT, |
140 | } |
141 | }), |
142 | }; |
143 | // Eat utf-8 BOM |
144 | t.eatc(' \u{feff}' ); |
145 | Ok(t) |
146 | } |
147 | |
148 | pub fn expect_semicolon(&mut self) -> Result<()> { |
149 | self.expect(Token::Semicolon)?; |
150 | Ok(()) |
151 | } |
152 | |
153 | pub fn get_span(&self, span: Span) -> &'a str { |
154 | let start = usize::try_from(span.start - self.span_offset).unwrap(); |
155 | let end = usize::try_from(span.end - self.span_offset).unwrap(); |
156 | &self.input[start..end] |
157 | } |
158 | |
159 | pub fn parse_id(&self, span: Span) -> Result<&'a str> { |
160 | let ret = self.get_span(span); |
161 | validate_id(span.start, &ret)?; |
162 | Ok(ret) |
163 | } |
164 | |
165 | pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> { |
166 | let token = self.get_span(span); |
167 | let id_part = token.strip_prefix('%' ).unwrap(); |
168 | validate_id(span.start, id_part)?; |
169 | Ok(id_part) |
170 | } |
171 | |
172 | pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> { |
173 | loop { |
174 | match self.next_raw()? { |
175 | Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {} |
176 | other => break Ok(other), |
177 | } |
178 | } |
179 | } |
180 | |
181 | /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an |
182 | /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more |
183 | /// tokens available. |
184 | pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> { |
185 | let (str_start, ch) = match self.chars.next() { |
186 | Some(pair) => pair, |
187 | None => return Ok(None), |
188 | }; |
189 | let start = self.span_offset + u32::try_from(str_start).unwrap(); |
190 | let token = match ch { |
191 | ' \n' | ' \t' | ' ' => { |
192 | // Eat all contiguous whitespace tokens |
193 | while self.eatc(' ' ) || self.eatc(' \t' ) || self.eatc(' \n' ) {} |
194 | Whitespace |
195 | } |
196 | '/' => { |
197 | // Eat a line comment if it's `//...` |
198 | if self.eatc('/' ) { |
199 | for (_, ch) in &mut self.chars { |
200 | if ch == ' \n' { |
201 | break; |
202 | } |
203 | } |
204 | Comment |
205 | // eat a block comment if it's `/*...` |
206 | } else if self.eatc('*' ) { |
207 | let mut depth = 1; |
208 | while depth > 0 { |
209 | let (_, ch) = match self.chars.next() { |
210 | Some(pair) => pair, |
211 | None => return Err(Error::UnterminatedComment(start)), |
212 | }; |
213 | match ch { |
214 | '/' if self.eatc('*' ) => depth += 1, |
215 | '*' if self.eatc('/' ) => depth -= 1, |
216 | _ => {} |
217 | } |
218 | } |
219 | Comment |
220 | } else { |
221 | Slash |
222 | } |
223 | } |
224 | '=' => Equals, |
225 | ',' => Comma, |
226 | ':' => Colon, |
227 | '.' => Period, |
228 | ';' => Semicolon, |
229 | '(' => LeftParen, |
230 | ')' => RightParen, |
231 | '{' => LeftBrace, |
232 | '}' => RightBrace, |
233 | '<' => LessThan, |
234 | '>' => GreaterThan, |
235 | '*' => Star, |
236 | '@' => At, |
237 | '-' => { |
238 | if self.eatc('>' ) { |
239 | RArrow |
240 | } else { |
241 | Minus |
242 | } |
243 | } |
244 | '+' => Plus, |
245 | '%' => { |
246 | let mut iter = self.chars.clone(); |
247 | if let Some((_, ch)) = iter.next() { |
248 | if is_keylike_start(ch) { |
249 | self.chars = iter.clone(); |
250 | while let Some((_, ch)) = iter.next() { |
251 | if !is_keylike_continue(ch) { |
252 | break; |
253 | } |
254 | self.chars = iter.clone(); |
255 | } |
256 | } |
257 | } |
258 | ExplicitId |
259 | } |
260 | ch if is_keylike_start(ch) => { |
261 | let remaining = self.chars.chars.as_str().len(); |
262 | let mut iter = self.chars.clone(); |
263 | while let Some((_, ch)) = iter.next() { |
264 | if !is_keylike_continue(ch) { |
265 | break; |
266 | } |
267 | self.chars = iter.clone(); |
268 | } |
269 | let str_end = |
270 | str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len()); |
271 | match &self.input[str_start..str_end] { |
272 | "use" => Use, |
273 | "type" => Type, |
274 | "func" => Func, |
275 | "u8" => U8, |
276 | "u16" => U16, |
277 | "u32" => U32, |
278 | "u64" => U64, |
279 | "s8" => S8, |
280 | "s16" => S16, |
281 | "s32" => S32, |
282 | "s64" => S64, |
283 | "f32" => F32, |
284 | "f64" => F64, |
285 | "float32" if !self.require_f32_f64 => F32, |
286 | "float64" if !self.require_f32_f64 => F64, |
287 | "char" => Char, |
288 | "resource" => Resource, |
289 | "own" => Own, |
290 | "borrow" => Borrow, |
291 | "record" => Record, |
292 | "flags" => Flags, |
293 | "variant" => Variant, |
294 | "enum" => Enum, |
295 | "bool" => Bool, |
296 | "string" => String_, |
297 | "option" => Option_, |
298 | "result" => Result_, |
299 | "future" => Future, |
300 | "stream" => Stream, |
301 | "error-context" => ErrorContext, |
302 | "list" => List, |
303 | "_" => Underscore, |
304 | "as" => As, |
305 | "from" => From_, |
306 | "static" => Static, |
307 | "interface" => Interface, |
308 | "tuple" => Tuple, |
309 | "world" => World, |
310 | "import" => Import, |
311 | "export" => Export, |
312 | "package" => Package, |
313 | "constructor" => Constructor, |
314 | "include" => Include, |
315 | "with" => With, |
316 | _ => Id, |
317 | } |
318 | } |
319 | |
320 | ch if ch.is_ascii_digit() => { |
321 | let mut iter = self.chars.clone(); |
322 | while let Some((_, ch)) = iter.next() { |
323 | if !ch.is_ascii_digit() { |
324 | break; |
325 | } |
326 | self.chars = iter.clone(); |
327 | } |
328 | |
329 | Integer |
330 | } |
331 | |
332 | ch => return Err(Error::Unexpected(start, ch)), |
333 | }; |
334 | let end = match self.chars.clone().next() { |
335 | Some((i, _)) => i, |
336 | None => self.input.len(), |
337 | }; |
338 | |
339 | let end = self.span_offset + u32::try_from(end).unwrap(); |
340 | Ok(Some((Span { start, end }, token))) |
341 | } |
342 | |
343 | pub fn eat(&mut self, expected: Token) -> Result<bool, Error> { |
344 | let mut other = self.clone(); |
345 | match other.next()? { |
346 | Some((_span, found)) if expected == found => { |
347 | *self = other; |
348 | Ok(true) |
349 | } |
350 | Some(_) => Ok(false), |
351 | None => Ok(false), |
352 | } |
353 | } |
354 | |
355 | pub fn expect(&mut self, expected: Token) -> Result<Span, Error> { |
356 | match self.next()? { |
357 | Some((span, found)) => { |
358 | if expected == found { |
359 | Ok(span) |
360 | } else { |
361 | Err(Error::Wanted { |
362 | at: span.start, |
363 | expected: expected.describe(), |
364 | found: found.describe(), |
365 | }) |
366 | } |
367 | } |
368 | None => Err(Error::Wanted { |
369 | at: self.span_offset + u32::try_from(self.input.len()).unwrap(), |
370 | expected: expected.describe(), |
371 | found: "eof" , |
372 | }), |
373 | } |
374 | } |
375 | |
376 | fn eatc(&mut self, ch: char) -> bool { |
377 | let mut iter = self.chars.clone(); |
378 | match iter.next() { |
379 | Some((_, ch2)) if ch == ch2 => { |
380 | self.chars = iter; |
381 | true |
382 | } |
383 | _ => false, |
384 | } |
385 | } |
386 | |
387 | pub fn eof_span(&self) -> Span { |
388 | let end = self.span_offset + u32::try_from(self.input.len()).unwrap(); |
389 | Span { start: end, end } |
390 | } |
391 | } |
392 | |
393 | impl<'a> Iterator for CrlfFold<'a> { |
394 | type Item = (usize, char); |
395 | |
396 | fn next(&mut self) -> Option<(usize, char)> { |
397 | self.chars.next().map(|(i: usize, c: char)| { |
398 | if c == ' \r' { |
399 | let mut attempt: CharIndices<'a> = self.chars.clone(); |
400 | if let Some((_, ' \n' )) = attempt.next() { |
401 | self.chars = attempt; |
402 | return (i, ' \n' ); |
403 | } |
404 | } |
405 | (i, c) |
406 | }) |
407 | } |
408 | } |
409 | |
410 | fn detect_invalid_input(input: &str) -> Result<()> { |
411 | // Disallow specific codepoints. |
412 | let mut line = 1; |
413 | for ch in input.chars() { |
414 | match ch { |
415 | ' \n' => line += 1, |
416 | ' \r' | ' \t' => {} |
417 | |
418 | // Bidirectional override codepoints can be used to craft source code that |
419 | // appears to have a different meaning than its actual meaning. See |
420 | // [CVE-2021-42574] for background and motivation. |
421 | // |
422 | // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574 |
423 | ' \u{202a}' | ' \u{202b}' | ' \u{202c}' | ' \u{202d}' | ' \u{202e}' | ' \u{2066}' |
424 | | ' \u{2067}' | ' \u{2068}' | ' \u{2069}' => { |
425 | bail!( |
426 | "Input contains bidirectional override codepoint {:?} at line {}" , |
427 | ch.escape_unicode(), |
428 | line |
429 | ); |
430 | } |
431 | |
432 | // Disallow several characters which are deprecated or discouraged in Unicode. |
433 | // |
434 | // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs. |
435 | // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks. |
436 | // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels. |
437 | // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see |
438 | // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged. |
439 | ' \u{149}' | ' \u{673}' | ' \u{f77}' | ' \u{f79}' | ' \u{17a3}' | ' \u{17a4}' |
440 | | ' \u{17b4}' | ' \u{17b5}' => { |
441 | bail!( |
442 | "Codepoint {:?} at line {} is discouraged by Unicode" , |
443 | ch.escape_unicode(), |
444 | line |
445 | ); |
446 | } |
447 | |
448 | // Disallow control codes other than the ones explicitly recognized above, |
449 | // so that viewing a wit file on a terminal doesn't have surprising side |
450 | // effects or appear to have a different meaning than its actual meaning. |
451 | ch if ch.is_control() => { |
452 | bail!("Control code ' {}' at line {}" , ch.escape_unicode(), line); |
453 | } |
454 | |
455 | _ => {} |
456 | } |
457 | } |
458 | |
459 | Ok(()) |
460 | } |
461 | |
462 | fn is_keylike_start(ch: char) -> bool { |
463 | // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars, |
464 | // but we'll diagnose that after we've lexed the full string. |
465 | UnicodeXID::is_xid_start(self:ch) || ch == '_' || ch == '-' |
466 | } |
467 | |
468 | fn is_keylike_continue(ch: char) -> bool { |
469 | // Lex any XID continue (which includes `_`) or '-'. |
470 | UnicodeXID::is_xid_continue(self:ch) || ch == '-' |
471 | } |
472 | |
473 | pub fn validate_id(start: u32, id: &str) -> Result<(), Error> { |
474 | // IDs must have at least one part. |
475 | if id.is_empty() { |
476 | return Err(Error::IdPartEmpty(start)); |
477 | } |
478 | |
479 | // Ids consist of parts separated by '-'s. |
480 | for part in id.split('-' ) { |
481 | // Parts must be non-empty and contain either all ASCII lowercase or |
482 | // all ASCII uppercase. |
483 | let upper = match part.chars().next() { |
484 | None => return Err(Error::IdPartEmpty(start)), |
485 | Some(first) => { |
486 | if first.is_ascii_lowercase() { |
487 | false |
488 | } else if first.is_ascii_uppercase() { |
489 | true |
490 | } else { |
491 | return Err(Error::InvalidCharInId(start, first)); |
492 | } |
493 | } |
494 | }; |
495 | |
496 | for ch in part.chars() { |
497 | if ch.is_ascii_digit() { |
498 | // Digits are accepted in both uppercase and lowercase segments. |
499 | } else if upper { |
500 | if !ch.is_ascii_uppercase() { |
501 | return Err(Error::InvalidCharInId(start, ch)); |
502 | } |
503 | } else if !ch.is_ascii_lowercase() { |
504 | return Err(Error::InvalidCharInId(start, ch)); |
505 | } |
506 | } |
507 | } |
508 | |
509 | Ok(()) |
510 | } |
511 | |
512 | impl Token { |
513 | pub fn describe(&self) -> &'static str { |
514 | match self { |
515 | Whitespace => "whitespace" , |
516 | Comment => "a comment" , |
517 | Equals => "'='" , |
518 | Comma => "','" , |
519 | Colon => "':'" , |
520 | Period => "'.'" , |
521 | Semicolon => "';'" , |
522 | LeftParen => "'('" , |
523 | RightParen => "')'" , |
524 | LeftBrace => "'{'" , |
525 | RightBrace => "'}'" , |
526 | LessThan => "'<'" , |
527 | GreaterThan => "'>'" , |
528 | Use => "keyword `use`" , |
529 | Type => "keyword `type`" , |
530 | Func => "keyword `func`" , |
531 | U8 => "keyword `u8`" , |
532 | U16 => "keyword `u16`" , |
533 | U32 => "keyword `u32`" , |
534 | U64 => "keyword `u64`" , |
535 | S8 => "keyword `s8`" , |
536 | S16 => "keyword `s16`" , |
537 | S32 => "keyword `s32`" , |
538 | S64 => "keyword `s64`" , |
539 | F32 => "keyword `f32`" , |
540 | F64 => "keyword `f64`" , |
541 | Char => "keyword `char`" , |
542 | Own => "keyword `own`" , |
543 | Borrow => "keyword `borrow`" , |
544 | Resource => "keyword `resource`" , |
545 | Record => "keyword `record`" , |
546 | Flags => "keyword `flags`" , |
547 | Variant => "keyword `variant`" , |
548 | Enum => "keyword `enum`" , |
549 | Bool => "keyword `bool`" , |
550 | String_ => "keyword `string`" , |
551 | Option_ => "keyword `option`" , |
552 | Result_ => "keyword `result`" , |
553 | Future => "keyword `future`" , |
554 | Stream => "keyword `stream`" , |
555 | ErrorContext => "keyword `error-context`" , |
556 | List => "keyword `list`" , |
557 | Underscore => "keyword `_`" , |
558 | Id => "an identifier" , |
559 | ExplicitId => "an '%' identifier" , |
560 | RArrow => "`->`" , |
561 | Star => "`*`" , |
562 | At => "`@`" , |
563 | Slash => "`/`" , |
564 | Plus => "`+`" , |
565 | Minus => "`-`" , |
566 | As => "keyword `as`" , |
567 | From_ => "keyword `from`" , |
568 | Static => "keyword `static`" , |
569 | Interface => "keyword `interface`" , |
570 | Tuple => "keyword `tuple`" , |
571 | Import => "keyword `import`" , |
572 | Export => "keyword `export`" , |
573 | World => "keyword `world`" , |
574 | Package => "keyword `package`" , |
575 | Constructor => "keyword `constructor`" , |
576 | Integer => "an integer" , |
577 | Include => "keyword `include`" , |
578 | With => "keyword `with`" , |
579 | } |
580 | } |
581 | } |
582 | |
// Marks the lexer `Error` as a standard error type; no trait methods are
// overridden.
impl std::error::Error for Error {}
584 | |
585 | impl fmt::Display for Error { |
586 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
587 | match self { |
588 | Error::Unexpected(_, ch: &char) => write!(f, "unexpected character {:?}" , ch), |
589 | Error::UnterminatedComment(_) => write!(f, "unterminated block comment" ), |
590 | Error::Wanted { |
591 | expected: &&'static str, found: &&'static str, .. |
592 | } => write!(f, "expected {}, found {}" , expected, found), |
593 | Error::InvalidCharInId(_, ch: &char) => write!(f, "invalid character in identifier {:?}" , ch), |
594 | Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s" ), |
595 | Error::InvalidEscape(_, ch: &char) => write!(f, "invalid escape in string {:?}" , ch), |
596 | } |
597 | } |
598 | } |
599 | |
/// Checks `validate_id` against lists of identifiers that must be accepted
/// and that must be rejected.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for id in accepted.iter() {
        validate_id(0, id).unwrap();
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "apple-0",
        "()()",
        "",
        "*",
        "apple \u{5f3}pear",
        "apple \u{200c}pear",
        "apple \u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
    ];
    for id in rejected.iter() {
        assert!(validate_id(0, id).is_err());
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, " \u{212b}").is_err(), "non-ascii");
}
657 | |
/// Checks the token kinds produced for a range of inputs, and that inputs
/// containing disallowed codepoints are rejected.
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely, returning only the kinds of the significant tokens.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0, None)?;
        let mut kinds = Vec::new();
        while let Some((_, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    let lexed: &[(&str, &[Token])] = &[
        ("", &[]),
        ("_", &[Token::Underscore]),
        ("apple", &[Token::Id]),
        ("apple-pear", &[Token::Id]),
        ("apple--pear", &[Token::Id]),
        ("apple-Pear", &[Token::Id]),
        ("apple-pear-grape", &[Token::Id]),
        ("apple pear", &[Token::Id, Token::Id]),
        ("_a_p_p_l_e_", &[Token::Id]),
        ("garçon", &[Token::Id]),
        ("hühnervögel", &[Token::Id]),
        ("москва", &[Token::Id]),
        ("東京", &[Token::Id]),
        ("garçon-hühnervögel-москва-東京", &[Token::Id]),
        ("a0", &[Token::Id]),
        ("a", &[Token::Id]),
        ("%a", &[Token::ExplicitId]),
        ("%a-a", &[Token::ExplicitId]),
        ("%bool", &[Token::ExplicitId]),
        ("%", &[Token::ExplicitId]),
        ("APPLE", &[Token::Id]),
        ("APPLE-PEAR", &[Token::Id]),
        ("APPLE-PEAR-GRAPE", &[Token::Id]),
        ("apple-PEAR-grape", &[Token::Id]),
        ("APPLE-pear-GRAPE", &[Token::Id]),
        ("ENOENT", &[Token::Id]),
        ("is-XML", &[Token::Id]),
        ("func", &[Token::Func]),
        (
            "a: func()",
            &[
                Token::Id,
                Token::Colon,
                Token::Func,
                Token::LeftParen,
                Token::RightParen,
            ],
        ),
        ("resource", &[Token::Resource]),
        ("own", &[Token::Own]),
        (
            "own<some-id>",
            &[Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
        ),
        ("borrow", &[Token::Borrow]),
        (
            "borrow<some-id>",
            &[
                Token::Borrow,
                Token::LessThan,
                Token::Id,
                Token::GreaterThan,
            ],
        ),
    ];
    for &(src, expected) in lexed {
        assert_eq!(collect(src).unwrap(), expected);
    }

    let refused: &[(&str, &str)] = &[
        (" \u{149}", "strongly discouraged"),
        (" \u{673}", "strongly discouraged"),
        (" \u{17a3}", "strongly discouraged"),
        (" \u{17a4}", "strongly discouraged"),
        (" \u{202a}", "bidirectional override"),
        (" \u{2068}", "bidirectional override"),
        (" \u{0}", "control code"),
        (" \u{b}", "control code"),
        (" \u{c}", "control code"),
        (" \u{85}", "control code"),
    ];
    for &(src, why) in refused {
        assert!(collect(src).is_err(), "{}", why);
    }
}
742 | |