utf8.rs source code [crates/regex_lite/src/utf8.rs]

1	/// Returns true if and only if the given byte is considered a word character.
2	/// This only applies to ASCII.
3	pub(crate) fn is_word_byte(b: u8) -> bool {
4	const fn mkwordset() -> [bool; `256`] {
5	// FIXME: Use as_usize() once const functions in traits are stable.
6	let mut set = [`false`; `256`];
7	set[b'_' as usize] = `true`;
8
9	let mut byte = b'0';
10	while byte <= b'9' {
11	set[byte as usize] = `true`;
12	byte += `1`;
13	}
14	byte = b'A';
15	while byte <= b'Z' {
16	set[byte as usize] = `true`;
17	byte += `1`;
18	}
19	byte = b'a';
20	while byte <= b'z' {
21	set[byte as usize] = `true`;
22	byte += `1`;
23	}
24	set
25	}
26	const WORD: [bool; `256`] = mkwordset();
27	WORD[b as usize]
28	}
29
30	/// The accept state index. When we enter this state, we know we've found a
31	/// valid Unicode scalar value.
32	const ACCEPT: usize = `12`;
33	/// The reject state index. When we enter this state, we know that we've found
34	/// invalid UTF-8.
35	const REJECT: usize = `0`;
36
37	/// Like `decode`, but automatically converts the `None` case to the
38	/// replacement codepoint.
39	pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
40	match decode(slice) {
41	(Some(ch: char), size: usize) => (ch, size),
42	(None, size: usize) => ('`\u{FFFD}`', size),
43	}
44	}
45
46	/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
47	///
48	/// When successful, the corresponding Unicode scalar value is returned along
49	/// with the number of bytes it was encoded with. The number of bytes consumed
50	/// for a successful decode is always between 1 and 4, inclusive.
51	///
52	/// When unsuccessful, `None` is returned along with the number of bytes that
53	/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
54	/// the number of bytes consumed is always between 0 and 3, inclusive, where
55	/// 0 is only returned when `slice` is empty.
56	pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
57	let slice = slice.as_ref();
58	match slice.get(`0`) {
59	None => return (None, `0`),
60	Some(&b) if b <= `0x7F` => return (Some(b as char), `1`),
61	_ => {}
62	}
63
64	let (mut state, mut cp, mut i) = (ACCEPT, `0`, `0`);
65	while i < slice.len() {
66	decode_step(&mut state, &mut cp, slice[i]);
67	i += `1`;
68
69	if state == ACCEPT {
70	// OK since `decode_step` guarantees that `cp` is a valid Unicode
71	// scalar value in an ACCEPT state.
72	//
73	// We don't have to use safe code here, but do so because perf
74	// isn't our primary objective in regex-lite.
75	let ch = char::from_u32(cp).unwrap();
76	return (Some(ch), i);
77	} else if state == REJECT {
78	// At this point, we always want to advance at least one byte.
79	return (None, core::cmp::max(`1`, i.saturating_sub(`1`)));
80	}
81	}
82	(None, i)
83	}
84
85	/// Transitions to the next state and updates `cp` while it does.
86	fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
87	// Splits the space of all bytes into equivalence classes, such that
88	// any byte in the same class can never discriminate between whether a
89	// particular sequence is valid UTF-8 or not.
90	#[rustfmt::skip]
91	const CLASSES: [u8; `256`] = [
92	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
93	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
94	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
95	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`, `0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
96	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,`9`,
97	`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`, `7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,`7`,
98	`8`,`8`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`, `2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,
99	`10`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`4`,`3`,`3`, `11`,`6`,`6`,`6`,`5`,`8`,`8`,`8`,`8`,`8`,`8`,`8`,`8`,`8`,`8`,`8`,
100	];
101
102	// A state machine taken from `bstr` which was in turn adapted from:
103	// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
104	#[rustfmt::skip]
105	const STATES_FORWARD: &'static [u8] = &[
106	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
107	`12`, `0`, `24`, `36`, `60`, `96`, `84`, `0`, `0`, `0`, `48`, `72`,
108	`0`, `12`, `0`, `0`, `0`, `0`, `0`, `12`, `0`, `12`, `0`, `0`,
109	`0`, `24`, `0`, `0`, `0`, `0`, `0`, `24`, `0`, `24`, `0`, `0`,
110	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `24`, `0`, `0`, `0`, `0`,
111	`0`, `24`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `24`, `0`, `0`,
112	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `36`, `0`, `36`, `0`, `0`,
113	`0`, `36`, `0`, `0`, `0`, `0`, `0`, `36`, `0`, `36`, `0`, `0`,
114	`0`, `36`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
115	];
116
117	let class = CLASSES[usize::from(b)];
118	if *state == ACCEPT {
119	cp = (`0xFF` >> class) & (b as u32*);
120	} else {
121	cp = (b as u32* & `0b111111`) \| (*cp << `6`);
122	}
123	state = usize::from(STATES_FORWARD[state + usize::from(class)]);
124	}
125
126	#[cfg(test)]
127	mod tests {
128	use alloc::{vec, vec::Vec};
129
130	use super::*;
131
132	#[test]
133	fn decode_valid() {
134	fn d(mut s: &str) -> Vec<char> {
135	let mut chars = vec![];
136	while !s.is_empty() {
137	let (ch, size) = decode(s.as_bytes());
138	s = &s[size..];
139	chars.push(ch.unwrap());
140	}
141	chars
142	}
143
144	assert_eq!(vec!['☃'], d("☃"));
145	assert_eq!(vec!['☃', '☃'], d("☃☃"));
146	assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
147	assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
148	assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
149	}
150
151	#[test]
152	fn decode_invalid() {
153	let (ch, size) = decode(b"");
154	assert_eq!(None, ch);
155	assert_eq!(`0`, size);
156
157	let (ch, size) = decode(b"`\xFF`");
158	assert_eq!(None, ch);
159	assert_eq!(`1`, size);
160
161	let (ch, size) = decode(b"`\xCE\xF0`");
162	assert_eq!(None, ch);
163	assert_eq!(`1`, size);
164
165	let (ch, size) = decode(b"`\xE2\x98\xF0`");
166	assert_eq!(None, ch);
167	assert_eq!(`2`, size);
168
169	let (ch, size) = decode(b"`\xF0\x9D\x9D`");
170	assert_eq!(None, ch);
171	assert_eq!(`3`, size);
172
173	let (ch, size) = decode(b"`\xF0\x9D\x9D\xF0`");
174	assert_eq!(None, ch);
175	assert_eq!(`3`, size);
176
177	let (ch, size) = decode(b"`\xF0\x82\x82\xAC`");
178	assert_eq!(None, ch);
179	assert_eq!(`1`, size);
180
181	let (ch, size) = decode(b"`\xED\xA0\x80`");
182	assert_eq!(None, ch);
183	assert_eq!(`1`, size);
184
185	let (ch, size) = decode(b"`\xCE`a");
186	assert_eq!(None, ch);
187	assert_eq!(`1`, size);
188
189	let (ch, size) = decode(b"`\xE2\x98`a");
190	assert_eq!(None, ch);
191	assert_eq!(`2`, size);
192
193	let (ch, size) = decode(b"`\xF0\x9D\x9C`a");
194	assert_eq!(None, ch);
195	assert_eq!(`3`, size);
196	}
197
198	#[test]
199	fn decode_lossily() {
200	let (ch, size) = decode_lossy(b"");
201	assert_eq!('`\u{FFFD}`', ch);
202	assert_eq!(`0`, size);
203
204	let (ch, size) = decode_lossy(b"`\xFF`");
205	assert_eq!('`\u{FFFD}`', ch);
206	assert_eq!(`1`, size);
207
208	let (ch, size) = decode_lossy(b"`\xCE\xF0`");
209	assert_eq!('`\u{FFFD}`', ch);
210	assert_eq!(`1`, size);
211
212	let (ch, size) = decode_lossy(b"`\xE2\x98\xF0`");
213	assert_eq!('`\u{FFFD}`', ch);
214	assert_eq!(`2`, size);
215
216	let (ch, size) = decode_lossy(b"`\xF0\x9D\x9D\xF0`");
217	assert_eq!('`\u{FFFD}`', ch);
218	assert_eq!(`3`, size);
219
220	let (ch, size) = decode_lossy(b"`\xF0\x82\x82\xAC`");
221	assert_eq!('`\u{FFFD}`', ch);
222	assert_eq!(`1`, size);
223
224	let (ch, size) = decode_lossy(b"`\xED\xA0\x80`");
225	assert_eq!('`\u{FFFD}`', ch);
226	assert_eq!(`1`, size);
227
228	let (ch, size) = decode_lossy(b"`\xCE`a");
229	assert_eq!('`\u{FFFD}`', ch);
230	assert_eq!(`1`, size);
231
232	let (ch, size) = decode_lossy(b"`\xE2\x98`a");
233	assert_eq!('`\u{FFFD}`', ch);
234	assert_eq!(`2`, size);
235
236	let (ch, size) = decode_lossy(b"`\xF0\x9D\x9C`a");
237	assert_eq!('`\u{FFFD}`', ch);
238	assert_eq!(`3`, size);
239	}
240	}
241