1use anyhow::{bail, Result};
2use std::char;
3use std::fmt;
4use std::str;
5use unicode_xid::UnicodeXID;
6
7use self::Token::*;
8
/// Lexer over WIT source text that produces `(Span, Token)` pairs.
///
/// The type is `Clone` so that a copy can be advanced speculatively and then
/// either committed or discarded (this is how `eat` performs lookahead).
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete text being tokenized; token text is sliced out of it.
    input: &'a str,
    /// Value added to every byte index into `input` when building a `Span`.
    span_offset: u32,
    /// Cursor over the not-yet-consumed characters, with `\r\n` folded to `\n`.
    chars: CrlfFold<'a>,
    /// When `false`, `float32`/`float64` are also lexed as the `F32`/`F64`
    /// keywords; when `true` those spellings lex as plain identifiers.
    require_f32_f64: bool,
}
16
/// Character iterator that reports a `\r\n` pair as a single `'\n'`.
///
/// Items are `(byte index, char)`; for a folded pair the index is that of the
/// `\r`. A `\r` that is not followed by `\n` is yielded unchanged.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// Underlying cursor over the source text.
    chars: str::CharIndices<'a>,
}
21
/// A span, designating a range of bytes where a token is located.
///
/// Spans produced by `Tokenizer` are byte indices into its input with the
/// tokenizer's `span_offset` added to both ends.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: u32,
    /// The end of the range (exclusive).
    pub end: u32,
}
30
/// The kind of a lexed token.
///
/// A `Token` carries no text of its own; the matching `Span` locates the
/// token in the input.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia: produced by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    /// A run of spaces, tabs and newlines.
    Whitespace,
    /// A `//` line comment or a (possibly nested) `/* ... */` block comment.
    Comment,

    // Punctuation.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    /// The two-character arrow `->`.
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords. Each is lexed from the lowercase spelling of its name
    // (`ErrorContext` from `error-context`, `Underscore` from `_`).
    // NOTE(review): the trailing `_` on some variants presumably avoids
    // clashing with std names under the `use self::Token::*` glob — confirm.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    /// `f32`, and also `float32` when the tokenizer does not require `f32`.
    F32,
    /// `f64`, and also `float64` when the tokenizer does not require `f64`.
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,

    /// Any key-like word that is not a keyword. The text is not validated by
    /// the lexer; see `validate_id`.
    Id,
    /// A `%` optionally followed by a key-like word.
    ExplicitId,

    /// A run of ASCII digits.
    Integer,

    // More keywords.
    Include,
    With,
}
103
/// Errors produced while lexing or while validating identifiers.
///
/// The leading `u32` of each tuple variant (and `at` in `Wanted`) is the
/// position the error is reported at, in the same offset space as `Span`.
#[derive(Eq, PartialEq, Debug)]
// NOTE(review): presumably needed because not every variant is constructed
// (`InvalidEscape` is not built anywhere in this file) — confirm.
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains the given character where it is not permitted.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty `-`-separated part.
    IdPartEmpty(u32),
    /// Displayed as "invalid escape in string"; not produced by this file.
    InvalidEscape(u32, char),
    /// The given character cannot start any token.
    Unexpected(u32, char),
    /// A block comment was still open at end of input; the position is where
    /// the comment began.
    UnterminatedComment(u32),
    /// `Tokenizer::expect` saw a different token (or end of input) than the
    /// one requested.
    Wanted {
        /// Start of the token that was found, or the end of input.
        at: u32,
        /// Description of the token that was requested.
        expected: &'static str,
        /// Description of the token that was found, or `"eof"`.
        found: &'static str,
    },
}
118
/// Value used for the tokenizer's `require_f32_f64` setting when
/// `Tokenizer::new` is given `None` and the `WIT_REQUIRE_F32_F64`
/// environment variable cannot be read.
// NB: keep in sync with `crates/wit-component/src/printing.rs`.
const REQUIRE_F32_F64_BY_DEFAULT: bool = true;
121
122impl<'a> Tokenizer<'a> {
123 pub fn new(
124 input: &'a str,
125 span_offset: u32,
126 require_f32_f64: Option<bool>,
127 ) -> Result<Tokenizer<'a>> {
128 detect_invalid_input(input)?;
129
130 let mut t = Tokenizer {
131 input,
132 span_offset,
133 chars: CrlfFold {
134 chars: input.char_indices(),
135 },
136 require_f32_f64: require_f32_f64.unwrap_or_else(|| {
137 match std::env::var("WIT_REQUIRE_F32_F64") {
138 Ok(s) => s == "1",
139 Err(_) => REQUIRE_F32_F64_BY_DEFAULT,
140 }
141 }),
142 };
143 // Eat utf-8 BOM
144 t.eatc('\u{feff}');
145 Ok(t)
146 }
147
148 pub fn expect_semicolon(&mut self) -> Result<()> {
149 self.expect(Token::Semicolon)?;
150 Ok(())
151 }
152
153 pub fn get_span(&self, span: Span) -> &'a str {
154 let start = usize::try_from(span.start - self.span_offset).unwrap();
155 let end = usize::try_from(span.end - self.span_offset).unwrap();
156 &self.input[start..end]
157 }
158
159 pub fn parse_id(&self, span: Span) -> Result<&'a str> {
160 let ret = self.get_span(span);
161 validate_id(span.start, &ret)?;
162 Ok(ret)
163 }
164
165 pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
166 let token = self.get_span(span);
167 let id_part = token.strip_prefix('%').unwrap();
168 validate_id(span.start, id_part)?;
169 Ok(id_part)
170 }
171
172 pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
173 loop {
174 match self.next_raw()? {
175 Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
176 other => break Ok(other),
177 }
178 }
179 }
180
181 /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
182 /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
183 /// tokens available.
184 pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
185 let (str_start, ch) = match self.chars.next() {
186 Some(pair) => pair,
187 None => return Ok(None),
188 };
189 let start = self.span_offset + u32::try_from(str_start).unwrap();
190 let token = match ch {
191 '\n' | '\t' | ' ' => {
192 // Eat all contiguous whitespace tokens
193 while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
194 Whitespace
195 }
196 '/' => {
197 // Eat a line comment if it's `//...`
198 if self.eatc('/') {
199 for (_, ch) in &mut self.chars {
200 if ch == '\n' {
201 break;
202 }
203 }
204 Comment
205 // eat a block comment if it's `/*...`
206 } else if self.eatc('*') {
207 let mut depth = 1;
208 while depth > 0 {
209 let (_, ch) = match self.chars.next() {
210 Some(pair) => pair,
211 None => return Err(Error::UnterminatedComment(start)),
212 };
213 match ch {
214 '/' if self.eatc('*') => depth += 1,
215 '*' if self.eatc('/') => depth -= 1,
216 _ => {}
217 }
218 }
219 Comment
220 } else {
221 Slash
222 }
223 }
224 '=' => Equals,
225 ',' => Comma,
226 ':' => Colon,
227 '.' => Period,
228 ';' => Semicolon,
229 '(' => LeftParen,
230 ')' => RightParen,
231 '{' => LeftBrace,
232 '}' => RightBrace,
233 '<' => LessThan,
234 '>' => GreaterThan,
235 '*' => Star,
236 '@' => At,
237 '-' => {
238 if self.eatc('>') {
239 RArrow
240 } else {
241 Minus
242 }
243 }
244 '+' => Plus,
245 '%' => {
246 let mut iter = self.chars.clone();
247 if let Some((_, ch)) = iter.next() {
248 if is_keylike_start(ch) {
249 self.chars = iter.clone();
250 while let Some((_, ch)) = iter.next() {
251 if !is_keylike_continue(ch) {
252 break;
253 }
254 self.chars = iter.clone();
255 }
256 }
257 }
258 ExplicitId
259 }
260 ch if is_keylike_start(ch) => {
261 let remaining = self.chars.chars.as_str().len();
262 let mut iter = self.chars.clone();
263 while let Some((_, ch)) = iter.next() {
264 if !is_keylike_continue(ch) {
265 break;
266 }
267 self.chars = iter.clone();
268 }
269 let str_end =
270 str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
271 match &self.input[str_start..str_end] {
272 "use" => Use,
273 "type" => Type,
274 "func" => Func,
275 "u8" => U8,
276 "u16" => U16,
277 "u32" => U32,
278 "u64" => U64,
279 "s8" => S8,
280 "s16" => S16,
281 "s32" => S32,
282 "s64" => S64,
283 "f32" => F32,
284 "f64" => F64,
285 "float32" if !self.require_f32_f64 => F32,
286 "float64" if !self.require_f32_f64 => F64,
287 "char" => Char,
288 "resource" => Resource,
289 "own" => Own,
290 "borrow" => Borrow,
291 "record" => Record,
292 "flags" => Flags,
293 "variant" => Variant,
294 "enum" => Enum,
295 "bool" => Bool,
296 "string" => String_,
297 "option" => Option_,
298 "result" => Result_,
299 "future" => Future,
300 "stream" => Stream,
301 "error-context" => ErrorContext,
302 "list" => List,
303 "_" => Underscore,
304 "as" => As,
305 "from" => From_,
306 "static" => Static,
307 "interface" => Interface,
308 "tuple" => Tuple,
309 "world" => World,
310 "import" => Import,
311 "export" => Export,
312 "package" => Package,
313 "constructor" => Constructor,
314 "include" => Include,
315 "with" => With,
316 _ => Id,
317 }
318 }
319
320 ch if ch.is_ascii_digit() => {
321 let mut iter = self.chars.clone();
322 while let Some((_, ch)) = iter.next() {
323 if !ch.is_ascii_digit() {
324 break;
325 }
326 self.chars = iter.clone();
327 }
328
329 Integer
330 }
331
332 ch => return Err(Error::Unexpected(start, ch)),
333 };
334 let end = match self.chars.clone().next() {
335 Some((i, _)) => i,
336 None => self.input.len(),
337 };
338
339 let end = self.span_offset + u32::try_from(end).unwrap();
340 Ok(Some((Span { start, end }, token)))
341 }
342
343 pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
344 let mut other = self.clone();
345 match other.next()? {
346 Some((_span, found)) if expected == found => {
347 *self = other;
348 Ok(true)
349 }
350 Some(_) => Ok(false),
351 None => Ok(false),
352 }
353 }
354
355 pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
356 match self.next()? {
357 Some((span, found)) => {
358 if expected == found {
359 Ok(span)
360 } else {
361 Err(Error::Wanted {
362 at: span.start,
363 expected: expected.describe(),
364 found: found.describe(),
365 })
366 }
367 }
368 None => Err(Error::Wanted {
369 at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
370 expected: expected.describe(),
371 found: "eof",
372 }),
373 }
374 }
375
376 fn eatc(&mut self, ch: char) -> bool {
377 let mut iter = self.chars.clone();
378 match iter.next() {
379 Some((_, ch2)) if ch == ch2 => {
380 self.chars = iter;
381 true
382 }
383 _ => false,
384 }
385 }
386
387 pub fn eof_span(&self) -> Span {
388 let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
389 Span { start: end, end }
390 }
391}
392
393impl<'a> Iterator for CrlfFold<'a> {
394 type Item = (usize, char);
395
396 fn next(&mut self) -> Option<(usize, char)> {
397 self.chars.next().map(|(i: usize, c: char)| {
398 if c == '\r' {
399 let mut attempt: CharIndices<'a> = self.chars.clone();
400 if let Some((_, '\n')) = attempt.next() {
401 self.chars = attempt;
402 return (i, '\n');
403 }
404 }
405 (i, c)
406 })
407 }
408}
409
410fn detect_invalid_input(input: &str) -> Result<()> {
411 // Disallow specific codepoints.
412 let mut line = 1;
413 for ch in input.chars() {
414 match ch {
415 '\n' => line += 1,
416 '\r' | '\t' => {}
417
418 // Bidirectional override codepoints can be used to craft source code that
419 // appears to have a different meaning than its actual meaning. See
420 // [CVE-2021-42574] for background and motivation.
421 //
422 // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
423 '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
424 | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
425 bail!(
426 "Input contains bidirectional override codepoint {:?} at line {}",
427 ch.escape_unicode(),
428 line
429 );
430 }
431
432 // Disallow several characters which are deprecated or discouraged in Unicode.
433 //
434 // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
435 // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
436 // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
437 // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
438 // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
439 '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
440 | '\u{17b4}' | '\u{17b5}' => {
441 bail!(
442 "Codepoint {:?} at line {} is discouraged by Unicode",
443 ch.escape_unicode(),
444 line
445 );
446 }
447
448 // Disallow control codes other than the ones explicitly recognized above,
449 // so that viewing a wit file on a terminal doesn't have surprising side
450 // effects or appear to have a different meaning than its actual meaning.
451 ch if ch.is_control() => {
452 bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
453 }
454
455 _ => {}
456 }
457 }
458
459 Ok(())
460}
461
462fn is_keylike_start(ch: char) -> bool {
463 // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
464 // but we'll diagnose that after we've lexed the full string.
465 UnicodeXID::is_xid_start(self:ch) || ch == '_' || ch == '-'
466}
467
468fn is_keylike_continue(ch: char) -> bool {
469 // Lex any XID continue (which includes `_`) or '-'.
470 UnicodeXID::is_xid_continue(self:ch) || ch == '-'
471}
472
473pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
474 // IDs must have at least one part.
475 if id.is_empty() {
476 return Err(Error::IdPartEmpty(start));
477 }
478
479 // Ids consist of parts separated by '-'s.
480 for part in id.split('-') {
481 // Parts must be non-empty and contain either all ASCII lowercase or
482 // all ASCII uppercase.
483 let upper = match part.chars().next() {
484 None => return Err(Error::IdPartEmpty(start)),
485 Some(first) => {
486 if first.is_ascii_lowercase() {
487 false
488 } else if first.is_ascii_uppercase() {
489 true
490 } else {
491 return Err(Error::InvalidCharInId(start, first));
492 }
493 }
494 };
495
496 for ch in part.chars() {
497 if ch.is_ascii_digit() {
498 // Digits are accepted in both uppercase and lowercase segments.
499 } else if upper {
500 if !ch.is_ascii_uppercase() {
501 return Err(Error::InvalidCharInId(start, ch));
502 }
503 } else if !ch.is_ascii_lowercase() {
504 return Err(Error::InvalidCharInId(start, ch));
505 }
506 }
507 }
508
509 Ok(())
510}
511
512impl Token {
513 pub fn describe(&self) -> &'static str {
514 match self {
515 Whitespace => "whitespace",
516 Comment => "a comment",
517 Equals => "'='",
518 Comma => "','",
519 Colon => "':'",
520 Period => "'.'",
521 Semicolon => "';'",
522 LeftParen => "'('",
523 RightParen => "')'",
524 LeftBrace => "'{'",
525 RightBrace => "'}'",
526 LessThan => "'<'",
527 GreaterThan => "'>'",
528 Use => "keyword `use`",
529 Type => "keyword `type`",
530 Func => "keyword `func`",
531 U8 => "keyword `u8`",
532 U16 => "keyword `u16`",
533 U32 => "keyword `u32`",
534 U64 => "keyword `u64`",
535 S8 => "keyword `s8`",
536 S16 => "keyword `s16`",
537 S32 => "keyword `s32`",
538 S64 => "keyword `s64`",
539 F32 => "keyword `f32`",
540 F64 => "keyword `f64`",
541 Char => "keyword `char`",
542 Own => "keyword `own`",
543 Borrow => "keyword `borrow`",
544 Resource => "keyword `resource`",
545 Record => "keyword `record`",
546 Flags => "keyword `flags`",
547 Variant => "keyword `variant`",
548 Enum => "keyword `enum`",
549 Bool => "keyword `bool`",
550 String_ => "keyword `string`",
551 Option_ => "keyword `option`",
552 Result_ => "keyword `result`",
553 Future => "keyword `future`",
554 Stream => "keyword `stream`",
555 ErrorContext => "keyword `error-context`",
556 List => "keyword `list`",
557 Underscore => "keyword `_`",
558 Id => "an identifier",
559 ExplicitId => "an '%' identifier",
560 RArrow => "`->`",
561 Star => "`*`",
562 At => "`@`",
563 Slash => "`/`",
564 Plus => "`+`",
565 Minus => "`-`",
566 As => "keyword `as`",
567 From_ => "keyword `from`",
568 Static => "keyword `static`",
569 Interface => "keyword `interface`",
570 Tuple => "keyword `tuple`",
571 Import => "keyword `import`",
572 Export => "keyword `export`",
573 World => "keyword `world`",
574 Package => "keyword `package`",
575 Constructor => "keyword `constructor`",
576 Integer => "an integer",
577 Include => "keyword `include`",
578 With => "keyword `with`",
579 }
580 }
581}
582
583impl std::error::Error for Error {}
584
585impl fmt::Display for Error {
586 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587 match self {
588 Error::Unexpected(_, ch: &char) => write!(f, "unexpected character {:?}", ch),
589 Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
590 Error::Wanted {
591 expected: &&'static str, found: &&'static str, ..
592 } => write!(f, "expected {}, found {}", expected, found),
593 Error::InvalidCharInId(_, ch: &char) => write!(f, "invalid character in identifier {:?}", ch),
594 Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
595 Error::InvalidEscape(_, ch: &char) => write!(f, "invalid escape in string {:?}", ch),
596 }
597 }
598}
599
/// Exercises `validate_id` with identifiers that must be accepted and
/// identifiers that must be rejected.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for &id in &accepted {
        validate_id(0, id).unwrap();
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "apple-0",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
    ];
    for &id in &rejected {
        assert!(validate_id(0, id).is_err());
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
657
/// Checks the token kinds produced for a range of inputs, and that inputs
/// containing disallowed codepoints are rejected.
#[test]
fn test_tokenizer() {
    // Lexes `s` to the end and returns the kinds of all significant tokens.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0, None)?;
        let mut kinds = Vec::new();
        while let Some((_, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    let cases: Vec<(&str, Vec<Token>)> = vec![
        ("", vec![]),
        ("_", vec![Token::Underscore]),
        ("apple", vec![Token::Id]),
        ("apple-pear", vec![Token::Id]),
        ("apple--pear", vec![Token::Id]),
        ("apple-Pear", vec![Token::Id]),
        ("apple-pear-grape", vec![Token::Id]),
        ("apple pear", vec![Token::Id, Token::Id]),
        ("_a_p_p_l_e_", vec![Token::Id]),
        ("garçon", vec![Token::Id]),
        ("hühnervögel", vec![Token::Id]),
        ("москва", vec![Token::Id]),
        ("東京", vec![Token::Id]),
        ("garçon-hühnervögel-москва-東京", vec![Token::Id]),
        ("a0", vec![Token::Id]),
        ("a", vec![Token::Id]),
        ("%a", vec![Token::ExplicitId]),
        ("%a-a", vec![Token::ExplicitId]),
        ("%bool", vec![Token::ExplicitId]),
        ("%", vec![Token::ExplicitId]),
        ("APPLE", vec![Token::Id]),
        ("APPLE-PEAR", vec![Token::Id]),
        ("APPLE-PEAR-GRAPE", vec![Token::Id]),
        ("apple-PEAR-grape", vec![Token::Id]),
        ("APPLE-pear-GRAPE", vec![Token::Id]),
        ("ENOENT", vec![Token::Id]),
        ("is-XML", vec![Token::Id]),
        ("func", vec![Token::Func]),
        (
            "a: func()",
            vec![
                Token::Id,
                Token::Colon,
                Token::Func,
                Token::LeftParen,
                Token::RightParen,
            ],
        ),
        ("resource", vec![Token::Resource]),
        ("own", vec![Token::Own]),
        (
            "own<some-id>",
            vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
        ),
        ("borrow", vec![Token::Borrow]),
        (
            "borrow<some-id>",
            vec![
                Token::Borrow,
                Token::LessThan,
                Token::Id,
                Token::GreaterThan,
            ],
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(collect(input).unwrap(), expected);
    }

    let rejected = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for &(input, reason) in &rejected {
        assert!(collect(input).is_err(), "{}", reason);
    }
}
742