utf8.rs source code [crates/regex-automata-0.4.9/src/util/utf8.rs]

/*!
Utilities for dealing with UTF-8.

This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/
7
/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// A word byte is `_`, an ASCII digit (`0-9`) or an ASCII letter (`A-Z` or
/// `a-z`). Every other byte, including every non-ASCII byte, is not a word
/// byte.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    // Builds the 256-entry membership table. This runs at compile time, so
    // only `while` loops are used (`for` is not permitted in a `const fn`).
    const fn mkwordset() -> [bool; 256] {
        // FIXME: Use as_usize() once const functions in traits are stable.
        let mut set = [false; 256];
        set[b'_' as usize] = true;

        let mut byte = b'0';
        while byte <= b'9' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'A';
        while byte <= b'Z' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'a';
        while byte <= b'z' {
            set[byte as usize] = true;
            byte += 1;
        }
        set
    }
    // Evaluated once by the compiler; the call below is a plain table lookup.
    const WORD: [bool; 256] = mkwordset();
    WORD[b as usize]
}
42
43	/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
44	///
45	/// If no valid encoding of a codepoint exists at the beginning of the given
46	/// byte slice, then the first byte is returned instead.
47	///
48	/// This returns `None` if and only if `bytes` is empty.
49	///
50	/// This never panics.
51	///
52	/// WARNING: This is not designed for performance. If you're looking for a
53	/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
54	/// crate, then please file an issue and discuss your use case.
55	#[cfg_attr(feature = "perf-inline", inline(always))]
56	pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
57	if bytes.is_empty() {
58	return None;
59	}
60	let len: usize = match len(byte:bytes[`0`]) {
61	None => return Some(Err(bytes[`0`])),
62	Some(len: usize) if len > bytes.len() => return Some(Err(bytes[`0`])),
63	Some(`1`) => return Some(Ok(char::from(bytes[`0`]))),
64	Some(len: usize) => len,
65	};
66	match core::str::from_utf8(&bytes[..len]) {
67	Ok(s: &str) => Some(Ok(s.chars().next().unwrap())),
68	Err(_) => Some(Err(bytes[`0`])),
69	}
70	}
71
72	/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
73	///
74	/// If no valid encoding of a codepoint exists at the end of the given byte
75	/// slice, then the last byte is returned instead.
76	///
77	/// This returns `None` if and only if `bytes` is empty.
78	#[cfg_attr(feature = "perf-inline", inline(always))]
79	pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
80	if bytes.is_empty() {
81	return None;
82	}
83	let mut start: usize = bytes.len() - `1`;
84	let limit: usize = bytes.len().saturating_sub(`4`);
85	while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
86	start -= `1`;
87	}
88	match decode(&bytes[start..]) {
89	None => None,
90	Some(Ok(ch: char)) => Some(Ok(ch)),
91	Some(Err(_)) => Some(Err(bytes[bytes.len() - `1`])),
92	}
93	}
94
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`. That covers continuation bytes (`0x80..=0xBF`), the bytes that
/// could only start an overlong two-byte encoding (`0xC0`, `0xC1`) and the
/// bytes that could only start a codepoint above U+10FFFF or a sequence
/// longer than four bytes (`0xF5..=0xFF`).
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    match byte {
        0x00..=0x7F => Some(1),
        0xC2..=0xDF => Some(2),
        0xE0..=0xEF => Some(3),
        0xF0..=0xF4 => Some(4),
        // Continuation bytes and bytes that never appear in valid UTF-8.
        _ => None,
    }
}
116
/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// The offset `bytes.len()` is always a boundary; any offset beyond it never
/// is.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // The position at the end of the bytes always represents an empty
        // string, which is a valid boundary. But anything after that doesn't
        // make much sense to call a valid boundary.
        None => i == bytes.len(),
        // Other than ASCII (where the most significant bit is never set),
        // valid starting bytes always have their most significant two bits
        // set, whereas continuation bytes never have their second most
        // significant bit set. Therefore, this only returns true when bytes[i]
        // corresponds to a byte that begins a valid UTF-8 encoding of a
        // Unicode scalar value.
        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
    }
}
138
/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
///
/// Equivalently: returns false exactly for continuation bytes, i.e. bytes of
/// the form `10xxxxxx`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}
165
166	/*
167	/// Returns the smallest possible index of the next valid UTF-8 sequence
168	/// starting after `i`.
169	///
170	/// For all inputs, including invalid UTF-8 and any value of `i`, the return
171	/// value is guaranteed to be greater than `i`. (If there is no value greater
172	/// than `i` that fits in `usize`, then this panics.)
173	///
174	/// Generally speaking, this should only be called on `text` when it is
175	/// permitted to assume that it is valid UTF-8 and where either `i >=
176	/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
177	///
178	/// NOTE: This method was used in a previous conception of iterators where we
179	/// specifically tried to skip over empty matches that split a codepoint by
180	/// simply requiring that our next search begin at the beginning of codepoint.
181	/// But we ended up changing that technique to always advance by 1 byte and
182	/// then filter out matches that split a codepoint after-the-fact. Thus, we no
183	/// longer use this method. But I've kept it around in case we want to switch
184	/// back to this approach. Its guarantees are a little subtle, so I'd prefer
185	/// not to rebuild it from whole cloth.
186	pub(crate) fn next(text: &[u8], i: usize) -> usize {
187	let b = match text.get(i) {
188	None => return i.checked_add(1).unwrap(),
189	Some(&b) => b,
190	};
191	// For cases where we see an invalid UTF-8 byte, there isn't much we can do
192	// other than just start at the next byte.
193	let inc = len(b).unwrap_or(1);
194	i.checked_add(inc).unwrap()
195	}
196	*/
197