1//! A table-driven UTF-8 Parser
2//!
3//! This module implements a table-driven UTF-8 parser which should
4//! theoretically contain the minimal number of branches (1). The only branch is
5//! on the `Action` returned from unpacking a transition.
6#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)]
7#![cfg_attr(all(feature = "nightly", test), feature(test))]
8#![no_std]
9
10use core::char;
11
12mod types;
13
14use types::{Action, State};
15
/// Sink for the events produced while decoding a byte stream.
///
/// A type implementing this trait is passed to `Parser::advance`, which reports every decoded
/// character and every rejected byte sequence through it.
pub trait Receiver {
    /// Invoked with each character that has been decoded successfully.
    fn codepoint(&mut self, c: char);

    /// Invoked when the parser detects an invalid sequence.
    fn invalid_sequence(&mut self);
}
24
25/// A parser for Utf8 Characters
26///
27/// Repeatedly call `advance` with bytes to emit Utf8 characters
28#[derive(Clone, Default, PartialEq, Eq, Debug)]
29pub struct Parser {
30 point: u32,
31 state: State,
32}
33
/// Mask applied to continuation bytes: keeps the low six bits (`0b0011_1111`) and clears the
/// top two.
const CONTINUATION_MASK: u8 = 0x3F;
36
37impl Parser {
38 /// Create a new Parser
39 pub fn new() -> Parser {
40 Parser { point: 0, state: State::Ground }
41 }
42
43 /// Advance the parser
44 ///
45 /// The provider receiver will be called whenever a codepoint is completed or an invalid
46 /// sequence is detected.
47 pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
48 where
49 R: Receiver,
50 {
51 let (state, action) = self.state.advance(byte);
52 self.perform_action(receiver, byte, action);
53 self.state = state;
54 }
55
56 fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
57 where
58 R: Receiver,
59 {
60 match action {
61 Action::InvalidSequence => {
62 self.point = 0;
63 receiver.invalid_sequence();
64 },
65 Action::EmitByte => {
66 receiver.codepoint(byte as char);
67 },
68 Action::SetByte1 => {
69 let point = self.point | ((byte & CONTINUATION_MASK) as u32);
70 let c = unsafe { char::from_u32_unchecked(point) };
71 self.point = 0;
72
73 receiver.codepoint(c);
74 },
75 Action::SetByte2 => {
76 self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
77 },
78 Action::SetByte2Top => {
79 self.point |= ((byte & 0b0001_1111) as u32) << 6;
80 },
81 Action::SetByte3 => {
82 self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
83 },
84 Action::SetByte3Top => {
85 self.point |= ((byte & 0b0000_1111) as u32) << 12;
86 },
87 Action::SetByte4 => {
88 self.point |= ((byte & 0b0000_0111) as u32) << 18;
89 },
90 }
91 }
92}
93
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    // Input decoded by both benchmarks below.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    // Receiver for benchmarking: each codepoint goes through `black_box`, and invalid
    // sequences are ignored.
    impl Receiver for () {
        fn codepoint(&mut self, c: char) {
            black_box(c);
        }

        fn invalid_sequence(&mut self) {}
    }

    // Pushes the demo input through this crate's parser one byte at a time.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            for &byte in UTF8_DEMO {
                parser.advance(&mut (), byte);
            }
        });
    }

    // Same input through `std::str::from_utf8` followed by `chars()`, for comparison.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            for c in text.chars() {
                black_box(c);
            }
        });
    }
}
133