mod.rs source code [crates/regex_automata/src/util/mod.rs]

1	/!*
2	TODO
3	*/
4
5	use core::{ascii, fmt, str};
6
7	#[cfg(feature = "alloc")]
8	use alloc::vec::Vec;
9
10	pub mod alphabet;
11	pub(crate) mod bytes;
12	#[cfg(feature = "alloc")]
13	pub(crate) mod determinize;
14	pub mod id;
15	#[cfg(feature = "alloc")]
16	pub(crate) mod lazy;
17	pub(crate) mod matchtypes;
18	pub mod prefilter;
19	#[cfg(feature = "alloc")]
20	pub(crate) mod sparse_set;
21	pub(crate) mod start;
22	#[cfg(feature = "alloc")]
23	pub(crate) mod syntax;
24
25	/// The offset, in bytes, that a match is delayed by in the DFAs generated by
26	/// this crate. (This includes lazy DFAs.)
27	///
28	/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
29	/// and $. In particular, both of these operators may require the
30	/// identification of the end of input in order to confirm a match. Not only
31	/// does this mean that all matches must therefore be delayed by a single byte,
32	/// but that a special EOI value is added to the alphabet of all DFAs. (Which
33	/// means that even though the alphabet of a DFA is typically all byte values,
34	/// the actual maximum alphabet size is 257 due to the extra EOI value.)
35	///
36	/// Since we delay matches by only 1 byte, this can't fully support a
37	/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
38	/// DFAs in this crate do not support it. (It's not as simple as just
39	/// increasing the match offset to do it---otherwise we would---but building
40	/// the full Unicode-aware word boundary detection into an automaton is quite
41	/// tricky.)
42	pub(crate) const MATCH_OFFSET: usize = `1`;
43
44	/// A type that wraps a single byte with a convenient fmt::Debug impl that
45	/// escapes the byte.
46	pub(crate) struct DebugByte(pub u8);
47
48	impl fmt::Debug for DebugByte {
49	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
50	// 10 bytes is enough to cover any output from ascii::escape_default.
51	let mut bytes: [u8; 10] = [`0u8`; `10`];
52	let mut len: usize = `0`;
53	for (i: usize, mut b: u8) in ascii::escape_default(self.0).enumerate() {
54	// capitalize \xab to \xAB
55	if i >= `2` && b'a' <= b && b <= b'f' {
56	b -= `32`;
57	}
58	bytes[len] = b;
59	len += `1`;
60	}
61	write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
62	}
63	}
64
65	/// Returns the smallest possible index of the next valid UTF-8 sequence
66	/// starting after `i`.
67	///
68	/// For all inputs, including invalid UTF-8 and any value of `i`, the return
69	/// value is guaranteed to be greater than `i`.
70	///
71	/// Generally speaking, this should only be called on `text` when it is
72	/// permitted to assume that it is valid UTF-8 and where either `i >=
73	/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
74	#[inline(always)]
75	pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
76	let b: u8 = match text.get(index:i) {
77	None => return i.checked_add(`1`).unwrap(),
78	Some(&b: u8) => b,
79	};
80	// For cases where we see an invalid UTF-8 byte, there isn't much we can do
81	// other than just start at the next byte.
82	let inc: usize = utf8_len(b).unwrap_or(default:`1`);
83	i.checked_add(inc).unwrap()
84	}
85
86	/// Returns true if and only if the given byte is considered a word character.
87	/// This only applies to ASCII.
88	///
89	/// This was copied from regex-syntax so that we can use it to determine the
90	/// starting DFA state while searching without depending on regex-syntax. The
91	/// definition is never going to change, so there's no maintenance/bit-rot
92	/// hazard here.
93	#[inline(always)]
94	pub(crate) fn is_word_byte(b: u8) -> bool {
95	match b {
96	b'_' \| b'0'..=b'9' \| b'a'..=b'z' \| b'A'..=b'Z' => `true`,
97	_ => `false`,
98	}
99	}
100
101	/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
102	///
103	/// If no valid encoding of a codepoint exists at the beginning of the given
104	/// byte slice, then the first byte is returned instead.
105	///
106	/// This returns `None` if and only if `bytes` is empty.
107	#[inline(always)]
108	pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
109	if bytes.is_empty() {
110	return None;
111	}
112	let len: usize = match utf8_len(byte:bytes[`0`]) {
113	None => return Some(Err(bytes[`0`])),
114	Some(len: usize) if len > bytes.len() => return Some(Err(bytes[`0`])),
115	Some(`1`) => return Some(Ok(bytes[`0`] as char)),
116	Some(len: usize) => len,
117	};
118	match str::from_utf8(&bytes[..len]) {
119	Ok(s: &str) => Some(Ok(s.chars().next().unwrap())),
120	Err(_) => Some(Err(bytes[`0`])),
121	}
122	}
123
124	/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
125	///
126	/// If no valid encoding of a codepoint exists at the end of the given byte
127	/// slice, then the last byte is returned instead.
128	///
129	/// This returns `None` if and only if `bytes` is empty.
130	#[inline(always)]
131	pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
132	if bytes.is_empty() {
133	return None;
134	}
135	let mut start: usize = bytes.len() - `1`;
136	let limit: usize = bytes.len().saturating_sub(`4`);
137	while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
138	start -= `1`;
139	}
140	match decode_utf8(&bytes[start..]) {
141	None => None,
142	Some(Ok(ch: char)) => Some(Ok(ch)),
143	Some(Err(_)) => Some(Err(bytes[bytes.len() - `1`])),
144	}
145	}
146
147	/// Given a UTF-8 leading byte, this returns the total number of code units
148	/// in the following encoded codepoint.
149	///
150	/// If the given byte is not a valid UTF-8 leading byte, then this returns
151	/// `None`.
152	#[inline(always)]
153	fn utf8_len(byte: u8) -> Option<usize> {
154	if byte <= `0x7F` {
155	return Some(`1`);
156	} else if byte & `0b1100_0000` == `0b1000_0000` {
157	return None;
158	} else if byte <= `0b1101_1111` {
159	Some(`2`)
160	} else if byte <= `0b1110_1111` {
161	Some(`3`)
162	} else if byte <= `0b1111_0111` {
163	Some(`4`)
164	} else {
165	None
166	}
167	}
168
169	/// Returns true if and only if the given byte is either a valid leading UTF-8
170	/// byte, or is otherwise an invalid byte that can never appear anywhere in a
171	/// valid UTF-8 sequence.
172	#[inline(always)]
173	fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
174	// In the ASCII case, the most significant bit is never set. The leading
175	// byte of a 2/3/4-byte sequence always has the top two most significant
176	// bits set. For bytes that can never appear anywhere in valid UTF-8, this
177	// also returns true, since every such byte has its two most significant
178	// bits set:
179	//
180	// \xC0 :: 11000000
181	// \xC1 :: 11000001
182	// \xF5 :: 11110101
183	// \xF6 :: 11110110
184	// \xF7 :: 11110111
185	// \xF8 :: 11111000
186	// \xF9 :: 11111001
187	// \xFA :: 11111010
188	// \xFB :: 11111011
189	// \xFC :: 11111100
190	// \xFD :: 11111101
191	// \xFE :: 11111110
192	// \xFF :: 11111111
193	(b & `0b1100_0000`) != `0b1000_0000`
194	}
195
196	#[cfg(feature = "alloc")]
197	#[inline(always)]
198	pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
199	use core::{ptr, sync::atomic::AtomicPtr};
200
201	use crate::{
202	dfa::{
203	dense::{self, DFA},
204	Automaton,
205	},
206	util::lazy,
207	};
208
209	static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
210
211	let dfa = lazy::get_or_init(&WORD, \|\| {
212	// TODO: Should we use a lazy DFA here instead? It does complicate
213	// things somewhat, since we then need a mutable cache, which probably
214	// means a thread local.
215	dense::Builder::new()
216	.configure(dense::Config::new().anchored(`true`))
217	.build(r"\w")
218	.unwrap()
219	});
220	// This is OK since '\w' contains no look-around.
221	let mut sid = dfa.universal_start_state();
222	while at < bytes.len() {
223	let byte = bytes[at];
224	sid = dfa.next_state(sid, byte);
225	at += `1`;
226	if dfa.is_special_state(sid) {
227	if dfa.is_match_state(sid) {
228	return `true`;
229	} else if dfa.is_dead_state(sid) {
230	return `false`;
231	}
232	}
233	}
234	dfa.is_match_state(dfa.next_eoi_state(sid))
235	}
236
237	#[cfg(feature = "alloc")]
238	#[inline(always)]
239	pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
240	use core::{ptr, sync::atomic::AtomicPtr};
241
242	use crate::{
243	dfa::{
244	dense::{self, DFA},
245	Automaton,
246	},
247	nfa::thompson::NFA,
248	};
249
250	static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
251
252	let dfa = lazy::get_or_init(&WORD, \|\| {
253	dense::Builder::new()
254	.configure(dense::Config::new().anchored(`true`))
255	.thompson(NFA::config().reverse(`true`).shrink(`true`))
256	.build(r"\w")
257	.unwrap()
258	});
259
260	// This is OK since '\w' contains no look-around.
261	let mut sid = dfa.universal_start_state();
262	while at > `0` {
263	at -= `1`;
264	let byte = bytes[at];
265	sid = dfa.next_state(sid, byte);
266	if dfa.is_special_state(sid) {
267	if dfa.is_match_state(sid) {
268	return `true`;
269	} else if dfa.is_dead_state(sid) {
270	return `false`;
271	}
272	}
273	}
274	dfa.is_match_state(dfa.next_eoi_state(sid))
275	}
276