1 | //! Types supporting the UTF-8 parser |
2 | |
/// Action the caller should take for the byte it just fed to the parser.
///
/// The discriminants are explicit and contiguous (`0` through `7`).
/// `PartialEq`/`Eq` are derived so an action can be compared directly, the
/// same way `State` can.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; the sequence is invalid.
    InvalidSequence = 0,
    /// Received a valid 7-bit ASCII byte which can be emitted directly.
    EmitByte = 1,
    /// Set the bottom (last) continuation byte.
    SetByte1 = 2,
    /// Set the 2nd-from-last continuation byte.
    SetByte2 = 3,
    /// Set the 2nd-from-last byte, which is the leading byte of a two byte
    /// sequence.
    SetByte2Top = 4,
    /// Set the 3rd-from-last continuation byte.
    SetByte3 = 5,
    /// Set the 3rd-from-last byte, which is the leading byte of a three byte
    /// sequence.
    SetByte3Top = 6,
    /// Set the top byte of a four byte sequence.
    SetByte4 = 7,
}
23 | |
/// Parser states.
///
/// Besides the generic "N tail bytes remaining" states, the lead bytes `E0`,
/// `ED`, `F0` and `F4` each get a dedicated state, because the byte that
/// follows them is held to a narrower range than an ordinary tail byte.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
// Variant names embed the lead byte in lowercase hex (e.g. `U3_2_e0`), which
// is not camel case.
#[allow(non_camel_case_types)]
pub enum State {
    /// Not inside a sequence; any byte may arrive.
    Ground = 0,
    /// Three tail bytes are still expected.
    Tail3 = 1,
    /// Two tail bytes are still expected.
    Tail2 = 2,
    /// One tail byte is still expected.
    Tail1 = 3,
    /// Inside a three byte sequence whose lead byte was `E0`.
    U3_2_e0 = 4,
    /// Inside a three byte sequence whose lead byte was `ED`.
    U3_2_ed = 5,
    /// Inside a four byte sequence whose lead byte was `F0`.
    Utf8_4_3_f0 = 6,
    /// Inside a four byte sequence whose lead byte was `F4`.
    Utf8_4_3_f4 = 7,
}
48 | |
49 | impl Default for State { |
50 | fn default() -> State { |
51 | State::Ground |
52 | } |
53 | } |
54 | |
55 | impl State { |
56 | /// Advance the parser state. |
57 | /// |
58 | /// This takes the current state and input byte into consideration, to determine the next state |
59 | /// and any action that should be taken. |
60 | #[inline ] |
61 | pub fn advance(self, byte: u8) -> (State, Action) { |
62 | match self { |
63 | State::Ground => match byte { |
64 | 0x00..=0x7f => (State::Ground, Action::EmitByte), |
65 | 0xc2..=0xdf => (State::Tail1, Action::SetByte2Top), |
66 | 0xe0 => (State::U3_2_e0, Action::SetByte3Top), |
67 | 0xe1..=0xec => (State::Tail2, Action::SetByte3Top), |
68 | 0xed => (State::U3_2_ed, Action::SetByte3Top), |
69 | 0xee..=0xef => (State::Tail2, Action::SetByte3Top), |
70 | 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), |
71 | 0xf1..=0xf3 => (State::Tail3, Action::SetByte4), |
72 | 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), |
73 | _ => (State::Ground, Action::InvalidSequence), |
74 | }, |
75 | State::U3_2_e0 => match byte { |
76 | 0xa0..=0xbf => (State::Tail1, Action::SetByte2), |
77 | _ => (State::Ground, Action::InvalidSequence), |
78 | }, |
79 | State::U3_2_ed => match byte { |
80 | 0x80..=0x9f => (State::Tail1, Action::SetByte2), |
81 | _ => (State::Ground, Action::InvalidSequence), |
82 | }, |
83 | State::Utf8_4_3_f0 => match byte { |
84 | 0x90..=0xbf => (State::Tail2, Action::SetByte3), |
85 | _ => (State::Ground, Action::InvalidSequence), |
86 | }, |
87 | State::Utf8_4_3_f4 => match byte { |
88 | 0x80..=0x8f => (State::Tail2, Action::SetByte3), |
89 | _ => (State::Ground, Action::InvalidSequence), |
90 | }, |
91 | State::Tail3 => match byte { |
92 | 0x80..=0xbf => (State::Tail2, Action::SetByte3), |
93 | _ => (State::Ground, Action::InvalidSequence), |
94 | }, |
95 | State::Tail2 => match byte { |
96 | 0x80..=0xbf => (State::Tail1, Action::SetByte2), |
97 | _ => (State::Ground, Action::InvalidSequence), |
98 | }, |
99 | State::Tail1 => match byte { |
100 | 0x80..=0xbf => (State::Ground, Action::SetByte1), |
101 | _ => (State::Ground, Action::InvalidSequence), |
102 | }, |
103 | } |
104 | } |
105 | } |
106 | |