| 1 | //! Types supporting the UTF-8 parser |
| 2 | |
/// Action the caller should take after [`State::advance`] has consumed a byte.
///
/// The `SetByteN` variants count positions from the *end* of an encoded
/// sequence: `SetByte1` is the final byte, `SetByte2` the one before it, and so
/// on. A `Top` suffix marks the leading byte of a sequence of that length (a
/// four byte sequence has no non-leading fourth-from-last byte, so `SetByte4`
/// needs no suffix).
///
/// `PartialEq`/`Eq` are derived so actions can be compared directly, matching
/// the traits available on [`State`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; the sequence is invalid.
    InvalidSequence = 0,
    /// Received a valid 7-bit ASCII byte which can be emitted directly.
    EmitByte = 1,
    /// Set the last continuation byte; no further bytes are expected.
    SetByte1 = 2,
    /// Set the 2nd-from-last byte, which is a continuation byte of a longer
    /// sequence.
    SetByte2 = 3,
    /// Set the 2nd-from-last byte, which is the leading byte of a two byte
    /// sequence.
    SetByte2Top = 4,
    /// Set the 3rd-from-last byte, which is a continuation byte of a four
    /// byte sequence.
    SetByte3 = 5,
    /// Set the 3rd-from-last byte, which is the leading byte of a three byte
    /// sequence.
    SetByte3Top = 6,
    /// Set the leading byte of a four byte sequence.
    SetByte4 = 7,
}
| 23 | |
/// States of the UTF-8 decoding state machine.
///
/// Besides the ground state and the generic "N tail bytes remaining" states,
/// the lead bytes `E0`, `ED`, `F0` and `F4` each get a dedicated state, because
/// the byte that follows them is restricted to a narrower range than an
/// ordinary tail byte (`80..=BF`).
// The lower-case suffixes spell out the lead byte each state stands for, which
// is why the variant names are not plain CamelCase.
#[allow(non_camel_case_types)]
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; any byte may arrive next.
    #[default]
    Ground = 0,
    /// Three tail bytes are still expected.
    Tail3 = 1,
    /// Two tail bytes are still expected.
    Tail2 = 2,
    /// One tail byte is still expected.
    Tail1 = 3,
    /// Three byte sequence that started with `E0`; the next byte must be in
    /// `A0..=BF`.
    U3_2_e0 = 4,
    /// Three byte sequence that started with `ED`; the next byte must be in
    /// `80..=9F`.
    U3_2_ed = 5,
    /// Four byte sequence that started with `F0`; the next byte must be in
    /// `90..=BF`.
    Utf8_4_3_f0 = 6,
    /// Four byte sequence that started with `F4`; the next byte must be in
    /// `80..=8F`.
    Utf8_4_3_f4 = 7,
}
| 49 | |
| 50 | impl State { |
| 51 | /// Advance the parser state. |
| 52 | /// |
| 53 | /// This takes the current state and input byte into consideration, to determine the next state |
| 54 | /// and any action that should be taken. |
| 55 | #[inline ] |
| 56 | pub fn advance(self, byte: u8) -> (State, Action) { |
| 57 | match self { |
| 58 | State::Ground => match byte { |
| 59 | 0x00..=0x7f => (State::Ground, Action::EmitByte), |
| 60 | 0xc2..=0xdf => (State::Tail1, Action::SetByte2Top), |
| 61 | 0xe0 => (State::U3_2_e0, Action::SetByte3Top), |
| 62 | 0xe1..=0xec => (State::Tail2, Action::SetByte3Top), |
| 63 | 0xed => (State::U3_2_ed, Action::SetByte3Top), |
| 64 | 0xee..=0xef => (State::Tail2, Action::SetByte3Top), |
| 65 | 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), |
| 66 | 0xf1..=0xf3 => (State::Tail3, Action::SetByte4), |
| 67 | 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), |
| 68 | _ => (State::Ground, Action::InvalidSequence), |
| 69 | }, |
| 70 | State::U3_2_e0 => match byte { |
| 71 | 0xa0..=0xbf => (State::Tail1, Action::SetByte2), |
| 72 | _ => (State::Ground, Action::InvalidSequence), |
| 73 | }, |
| 74 | State::U3_2_ed => match byte { |
| 75 | 0x80..=0x9f => (State::Tail1, Action::SetByte2), |
| 76 | _ => (State::Ground, Action::InvalidSequence), |
| 77 | }, |
| 78 | State::Utf8_4_3_f0 => match byte { |
| 79 | 0x90..=0xbf => (State::Tail2, Action::SetByte3), |
| 80 | _ => (State::Ground, Action::InvalidSequence), |
| 81 | }, |
| 82 | State::Utf8_4_3_f4 => match byte { |
| 83 | 0x80..=0x8f => (State::Tail2, Action::SetByte3), |
| 84 | _ => (State::Ground, Action::InvalidSequence), |
| 85 | }, |
| 86 | State::Tail3 => match byte { |
| 87 | 0x80..=0xbf => (State::Tail2, Action::SetByte3), |
| 88 | _ => (State::Ground, Action::InvalidSequence), |
| 89 | }, |
| 90 | State::Tail2 => match byte { |
| 91 | 0x80..=0xbf => (State::Tail1, Action::SetByte2), |
| 92 | _ => (State::Ground, Action::InvalidSequence), |
| 93 | }, |
| 94 | State::Tail1 => match byte { |
| 95 | 0x80..=0xbf => (State::Ground, Action::SetByte1), |
| 96 | _ => (State::Ground, Action::InvalidSequence), |
| 97 | }, |
| 98 | } |
| 99 | } |
| 100 | } |
| 101 | |