1 | //! A table-driven UTF-8 Parser |
2 | //! |
3 | //! This module implements a table-driven UTF-8 parser which should |
4 | //! theoretically contain the minimal number of branches (1). The only branch is |
5 | //! on the `Action` returned from unpacking a transition. |
6 | #![deny (clippy::all, clippy::if_not_else, clippy::enum_glob_use)] |
7 | #![cfg_attr (all(feature = "nightly" , test), feature(test))] |
8 | #![no_std ] |
9 | |
10 | use core::char; |
11 | |
12 | mod types; |
13 | |
14 | use types::{Action, State}; |
15 | |
/// Sink for the events a parser produces while consuming bytes.
///
/// Implementors get one callback per decoded codepoint and one callback per
/// invalid sequence that is reported.
pub trait Receiver {
    /// Invoked with each codepoint that has been decoded successfully.
    fn codepoint(&mut self, c: char);

    /// Invoked each time an invalid byte sequence is detected.
    fn invalid_sequence(&mut self);
}
24 | |
25 | /// A parser for Utf8 Characters |
26 | /// |
27 | /// Repeatedly call `advance` with bytes to emit Utf8 characters |
28 | #[derive (Clone, Default, PartialEq, Eq, Debug)] |
29 | pub struct Parser { |
30 | point: u32, |
31 | state: State, |
32 | } |
33 | |
/// Mask that keeps the low six bits of a continuation byte, i.e. the bits
/// that are merged into the codepoint.
const CONTINUATION_MASK: u8 = 0x3F;
36 | |
37 | impl Parser { |
38 | /// Create a new Parser |
39 | pub fn new() -> Parser { |
40 | Parser { point: 0, state: State::Ground } |
41 | } |
42 | |
43 | /// Advance the parser |
44 | /// |
45 | /// The provider receiver will be called whenever a codepoint is completed or an invalid |
46 | /// sequence is detected. |
47 | pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) |
48 | where |
49 | R: Receiver, |
50 | { |
51 | let (state, action) = self.state.advance(byte); |
52 | self.perform_action(receiver, byte, action); |
53 | self.state = state; |
54 | } |
55 | |
56 | fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) |
57 | where |
58 | R: Receiver, |
59 | { |
60 | match action { |
61 | Action::InvalidSequence => { |
62 | self.point = 0; |
63 | receiver.invalid_sequence(); |
64 | }, |
65 | Action::EmitByte => { |
66 | receiver.codepoint(byte as char); |
67 | }, |
68 | Action::SetByte1 => { |
69 | let point = self.point | ((byte & CONTINUATION_MASK) as u32); |
70 | let c = unsafe { char::from_u32_unchecked(point) }; |
71 | self.point = 0; |
72 | |
73 | receiver.codepoint(c); |
74 | }, |
75 | Action::SetByte2 => { |
76 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; |
77 | }, |
78 | Action::SetByte2Top => { |
79 | self.point |= ((byte & 0b0001_1111) as u32) << 6; |
80 | }, |
81 | Action::SetByte3 => { |
82 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; |
83 | }, |
84 | Action::SetByte3Top => { |
85 | self.point |= ((byte & 0b0000_1111) as u32) << 12; |
86 | }, |
87 | Action::SetByte4 => { |
88 | self.point |= ((byte & 0b0000_0111) as u32) << 18; |
89 | }, |
90 | } |
91 | } |
92 | } |
93 | |
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    // Input fed to both benchmarks below.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    // Receiver that keeps nothing: codepoints go through `black_box` and
    // invalid sequences are ignored.
    impl Receiver for () {
        fn codepoint(&mut self, c: char) {
            black_box(c);
        }

        fn invalid_sequence(&mut self) {}
    }

    // Runs every byte of the demo input through this crate's `Parser`.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            UTF8_DEMO.iter().for_each(|&byte| parser.advance(&mut (), byte));
        })
    }

    // Comparison run: `std::str::from_utf8` followed by iterating `chars()`.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            std::str::from_utf8(UTF8_DEMO).unwrap().chars().for_each(|c| {
                black_box(c);
            });
        });
    }
}
133 | |