lib.rs source code [crates/utf8parse/src/lib.rs]

1	//! A table-driven UTF-8 Parser
2	//!
3	//! This module implements a table-driven UTF-8 parser which should
4	//! theoretically contain the minimal number of branches (1). The only branch is
5	//! on the `Action` returned from unpacking a transition.
6	#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)]
7	#![cfg_attr(all(feature = "nightly", test), feature(test))]
8	#![no_std]
9
10	use core::char;
11
12	mod types;
13
14	use types::{Action, State};
15
/// Sink for the events a [`Parser`] produces while it consumes bytes.
pub trait Receiver {
    /// Invoked once for every codepoint that was decoded successfully.
    fn codepoint(&mut self, c: char);

    /// Invoked whenever the parser detects an invalid byte sequence.
    fn invalid_sequence(&mut self);
}
24
25	/// A parser for Utf8 Characters
26	///
27	/// Repeatedly call `advance` with bytes to emit Utf8 characters
28	#[derive(Clone, Default, PartialEq, Eq, Debug)]
29	pub struct Parser {
30	point: u32,
31	state: State,
32	}
33
/// Continuation bytes are masked with this value.
///
/// It keeps the low six bits of the byte, which are the bits a continuation
/// byte contributes to the codepoint.
const CONTINUATION_MASK: u8 = 0b0011_1111;
36
37	impl Parser {
38	/// Create a new Parser
39	pub fn new() -> Parser {
40	Parser { point: `0`, state: State::Ground }
41	}
42
43	/// Advance the parser
44	///
45	/// The provider receiver will be called whenever a codepoint is completed or an invalid
46	/// sequence is detected.
47	pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
48	where
49	R: Receiver,
50	{
51	let (state, action) = self.state.advance(byte);
52	self.perform_action(receiver, byte, action);
53	self.state = state;
54	}
55
56	fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
57	where
58	R: Receiver,
59	{
60	match action {
61	Action::InvalidSequence => {
62	self.point = `0`;
63	receiver.invalid_sequence();
64	},
65	Action::EmitByte => {
66	receiver.codepoint(byte as char);
67	},
68	Action::SetByte1 => {
69	let point = self.point \| ((byte & CONTINUATION_MASK) as u32);
70	let c = unsafe { char::from_u32_unchecked(point) };
71	self.point = `0`;
72
73	receiver.codepoint(c);
74	},
75	Action::SetByte2 => {
76	self.point \|= ((byte & CONTINUATION_MASK) as u32) << `6`;
77	},
78	Action::SetByte2Top => {
79	self.point \|= ((byte & `0b0001_1111`) as u32) << `6`;
80	},
81	Action::SetByte3 => {
82	self.point \|= ((byte & CONTINUATION_MASK) as u32) << `12`;
83	},
84	Action::SetByte3Top => {
85	self.point \|= ((byte & `0b0000_1111`) as u32) << `12`;
86	},
87	Action::SetByte4 => {
88	self.point \|= ((byte & `0b0000_0111`) as u32) << `18`;
89	},
90	}
91	}
92	}
93
/// Benchmarks comparing this parser with the standard library's UTF-8 decoding.
///
/// Only compiled for test builds with the `nightly` feature enabled, because
/// they use the `test` crate.
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use super::{Parser, Receiver};

    use self::test::{black_box, Bencher};

    /// Input fed to both benchmarks.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    /// Receiver that passes each codepoint to `black_box` and ignores invalid sequences.
    impl Receiver for () {
        fn codepoint(&mut self, c: char) {
            black_box(c);
        }

        fn invalid_sequence(&mut self) {}
    }

    /// Runs the whole demo input through `Parser::advance`, one byte at a time.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            for byte in UTF8_DEMO {
                parser.advance(&mut (), *byte);
            }
        })
    }

    /// Iterates the characters of the same input via `std::str::from_utf8`.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            for c in std::str::from_utf8(UTF8_DEMO).unwrap().chars() {
                black_box(c);
            }
        });
    }
}
133