//! A table-driven UTF-8 parser
//!
//! This module implements a table-driven UTF-8 parser which should
//! theoretically contain the minimal number of branches (1). The only branch is
//! on the `Action` returned from unpacking a transition.
|---|
| 6 | #![ deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)] | 
|---|
| 7 | #![ cfg_attr(all(feature = "nightly", test), feature(test))] | 
|---|
| 8 | #![ no_std] | 
|---|
| 9 |  | 
|---|
| 10 | use core::char; | 
|---|
| 11 |  | 
|---|
| 12 | mod types; | 
|---|
| 13 |  | 
|---|
| 14 | use types::{Action, State}; | 
|---|
| 15 |  | 
|---|
/// Sink for the events produced by the parser.
///
/// An implementor is handed every successfully decoded character and is told
/// about every invalid sequence the parser detects.
pub trait Receiver {
    /// Invoked with each codepoint that has been parsed successfully.
    fn codepoint(&mut self, c: char);

    /// Invoked when an invalid sequence is detected.
    fn invalid_sequence(&mut self);
}
|---|
| 24 |  | 
|---|
| 25 | /// A parser for Utf8 Characters | 
|---|
| 26 | /// | 
|---|
| 27 | /// Repeatedly call `advance` with bytes to emit Utf8 characters | 
|---|
| 28 | #[ derive(Clone, Default, PartialEq, Eq, Debug)] | 
|---|
| 29 | pub struct Parser { | 
|---|
| 30 | point: u32, | 
|---|
| 31 | state: State, | 
|---|
| 32 | } | 
|---|
| 33 |  | 
|---|
/// Mask applied to continuation bytes: keeps the low six bits and clears the
/// top two.
const CONTINUATION_MASK: u8 = 0x3F;
|---|
| 36 |  | 
|---|
| 37 | impl Parser { | 
|---|
| 38 | /// Create a new Parser | 
|---|
| 39 | pub fn new() -> Parser { | 
|---|
| 40 | Parser { point: 0, state: State::Ground } | 
|---|
| 41 | } | 
|---|
| 42 |  | 
|---|
| 43 | /// Advance the parser | 
|---|
| 44 | /// | 
|---|
| 45 | /// The provider receiver will be called whenever a codepoint is completed or an invalid | 
|---|
| 46 | /// sequence is detected. | 
|---|
| 47 | pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) | 
|---|
| 48 | where | 
|---|
| 49 | R: Receiver, | 
|---|
| 50 | { | 
|---|
| 51 | let (state, action) = self.state.advance(byte); | 
|---|
| 52 | self.perform_action(receiver, byte, action); | 
|---|
| 53 | self.state = state; | 
|---|
| 54 | } | 
|---|
| 55 |  | 
|---|
| 56 | fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) | 
|---|
| 57 | where | 
|---|
| 58 | R: Receiver, | 
|---|
| 59 | { | 
|---|
| 60 | match action { | 
|---|
| 61 | Action::InvalidSequence => { | 
|---|
| 62 | self.point = 0; | 
|---|
| 63 | receiver.invalid_sequence(); | 
|---|
| 64 | }, | 
|---|
| 65 | Action::EmitByte => { | 
|---|
| 66 | receiver.codepoint(byte as char); | 
|---|
| 67 | }, | 
|---|
| 68 | Action::SetByte1 => { | 
|---|
| 69 | let point = self.point | ((byte & CONTINUATION_MASK) as u32); | 
|---|
| 70 | let c = unsafe { char::from_u32_unchecked(point) }; | 
|---|
| 71 | self.point = 0; | 
|---|
| 72 |  | 
|---|
| 73 | receiver.codepoint(c); | 
|---|
| 74 | }, | 
|---|
| 75 | Action::SetByte2 => { | 
|---|
| 76 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; | 
|---|
| 77 | }, | 
|---|
| 78 | Action::SetByte2Top => { | 
|---|
| 79 | self.point |= ((byte & 0b0001_1111) as u32) << 6; | 
|---|
| 80 | }, | 
|---|
| 81 | Action::SetByte3 => { | 
|---|
| 82 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; | 
|---|
| 83 | }, | 
|---|
| 84 | Action::SetByte3Top => { | 
|---|
| 85 | self.point |= ((byte & 0b0000_1111) as u32) << 12; | 
|---|
| 86 | }, | 
|---|
| 87 | Action::SetByte4 => { | 
|---|
| 88 | self.point |= ((byte & 0b0000_0111) as u32) << 18; | 
|---|
| 89 | }, | 
|---|
| 90 | } | 
|---|
| 91 | } | 
|---|
| 92 | } | 
|---|
| 93 |  | 
|---|
/// Benchmarks comparing this parser with the standard library's UTF-8
/// decoding. Only built for tests with the `nightly` feature enabled.
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    /// Input shared by both benchmarks, embedded at compile time.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    /// Receiver that only passes each character to `black_box` and ignores
    /// invalid sequences.
    impl Receiver for () {
        fn codepoint(&mut self, c: char) {
            black_box(c);
        }

        fn invalid_sequence(&mut self) {}
    }

    /// Feeds the whole demo input through one `Parser`, a byte at a time.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            UTF8_DEMO.iter().for_each(|&byte| parser.advance(&mut (), byte));
        });
    }

    /// Baseline: validates the demo input with `std::str::from_utf8` and walks
    /// its characters. The `unwrap` panics if the input is not valid UTF-8.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            text.chars().for_each(|c| {
                black_box(c);
            });
        });
    }
}
|---|
| 133 |  | 
|---|