| 1 | //! A table-driven UTF-8 Parser |
| 2 | //! |
| 3 | //! This module implements a table-driven UTF-8 parser which should |
| 4 | //! theoretically contain the minimal number of branches (1). The only branch is |
| 5 | //! on the `Action` returned from unpacking a transition. |
| 6 | #![deny (clippy::all, clippy::if_not_else, clippy::enum_glob_use)] |
| 7 | #![cfg_attr (all(feature = "nightly" , test), feature(test))] |
| 8 | #![no_std ] |
| 9 | |
| 10 | use core::char; |
| 11 | |
| 12 | mod types; |
| 13 | |
| 14 | use types::{Action, State}; |
| 15 | |
/// Sink for the events a `Parser` produces while it is fed bytes.
///
/// The parser calls back into the implementor once for every character it
/// finishes decoding and once for every invalid sequence it reports.
pub trait Receiver {
    /// Invoked with each successfully decoded character.
    fn codepoint(&mut self, c: char);

    /// Invoked when the parser detects an invalid byte sequence.
    fn invalid_sequence(&mut self);
}
| 24 | |
| 25 | /// A parser for Utf8 Characters |
| 26 | /// |
| 27 | /// Repeatedly call `advance` with bytes to emit Utf8 characters |
| 28 | #[derive (Clone, Default, PartialEq, Eq, Debug)] |
| 29 | pub struct Parser { |
| 30 | point: u32, |
| 31 | state: State, |
| 32 | } |
| 33 | |
/// Selects the low six bits of a byte; continuation bytes are ANDed with this
/// before being merged into the code point under construction.
const CONTINUATION_MASK: u8 = 0x3F;
| 36 | |
| 37 | impl Parser { |
| 38 | /// Create a new Parser |
| 39 | pub fn new() -> Parser { |
| 40 | Parser { point: 0, state: State::Ground } |
| 41 | } |
| 42 | |
| 43 | /// Advance the parser |
| 44 | /// |
| 45 | /// The provider receiver will be called whenever a codepoint is completed or an invalid |
| 46 | /// sequence is detected. |
| 47 | pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) |
| 48 | where |
| 49 | R: Receiver, |
| 50 | { |
| 51 | let (state, action) = self.state.advance(byte); |
| 52 | self.perform_action(receiver, byte, action); |
| 53 | self.state = state; |
| 54 | } |
| 55 | |
| 56 | fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) |
| 57 | where |
| 58 | R: Receiver, |
| 59 | { |
| 60 | match action { |
| 61 | Action::InvalidSequence => { |
| 62 | self.point = 0; |
| 63 | receiver.invalid_sequence(); |
| 64 | }, |
| 65 | Action::EmitByte => { |
| 66 | receiver.codepoint(byte as char); |
| 67 | }, |
| 68 | Action::SetByte1 => { |
| 69 | let point = self.point | ((byte & CONTINUATION_MASK) as u32); |
| 70 | let c = unsafe { char::from_u32_unchecked(point) }; |
| 71 | self.point = 0; |
| 72 | |
| 73 | receiver.codepoint(c); |
| 74 | }, |
| 75 | Action::SetByte2 => { |
| 76 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; |
| 77 | }, |
| 78 | Action::SetByte2Top => { |
| 79 | self.point |= ((byte & 0b0001_1111) as u32) << 6; |
| 80 | }, |
| 81 | Action::SetByte3 => { |
| 82 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; |
| 83 | }, |
| 84 | Action::SetByte3Top => { |
| 85 | self.point |= ((byte & 0b0000_1111) as u32) << 12; |
| 86 | }, |
| 87 | Action::SetByte4 => { |
| 88 | self.point |= ((byte & 0b0000_0111) as u32) << 18; |
| 89 | }, |
| 90 | } |
| 91 | } |
| 92 | } |
| 93 | |
/// Benchmarks comparing this parser with `std`'s UTF-8 decoding.
///
/// Only compiled for test builds with the `nightly` feature enabled, since it
/// uses the unstable `test` crate.
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    /// Fixture text embedded at compile time and fed to both benchmarks.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    /// Receiver that discards events; decoded characters go through
    /// `black_box`.
    impl Receiver for () {
        fn codepoint(&mut self, decoded: char) {
            black_box(decoded);
        }

        fn invalid_sequence(&mut self) {}
    }

    /// Runs every byte of the fixture through `Parser::advance`.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            for &byte in UTF8_DEMO {
                parser.advance(&mut (), byte);
            }
        })
    }

    /// Baseline: validates the fixture with `std::str::from_utf8` and walks
    /// its `chars()`.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            for decoded in text.chars() {
                black_box(decoded);
            }
        });
    }
}
| 133 | |