prog.rs source code [crates/regex-1.8.4/src/prog.rs]

1	use std::cmp::Ordering;
2	use std::collections::HashMap;
3	use std::fmt;
4	use std::mem;
5	use std::ops::Deref;
6	use std::slice;
7	use std::sync::Arc;
8
9	use crate::input::Char;
10	use crate::literal::LiteralSearcher;
11
12	/// `InstPtr` represents the index of an instruction in a regex program.
13	pub type InstPtr = usize;
14
15	/// Program is a sequence of instructions and various facts about thos
16	/// instructions.
17	#[derive(Clone)]
18	pub struct Program {
19	/// A sequence of instructions that represents an NFA.
20	pub insts: Vec<Inst>,
21	/// Pointers to each Match instruction in the sequence.
22	///
23	/// This is always length 1 unless this program represents a regex set.
24	pub matches: Vec<InstPtr>,
25	/// The ordered sequence of all capture groups extracted from the AST.
26	/// Unnamed groups are `None`.
27	pub captures: Vec<Option<String>>,
28	/// Pointers to all named capture groups into `captures`.
29	pub capture_name_idx: Arc<HashMap<String, usize>>,
30	/// If the number of capture groups is the same for all possible matches,
31	/// then this is that number.
32	pub static_captures_len: Option<usize>,
33	/// A pointer to the start instruction. This can vary depending on how
34	/// the program was compiled. For example, programs for use with the DFA
35	/// engine have a `.?` inserted at the beginning of unanchored regular*
36	/// expressions. The actual starting point of the program is after the
37	/// `.?`.*
38	pub start: InstPtr,
39	/// A set of equivalence classes for discriminating bytes in the compiled
40	/// program.
41	pub byte_classes: Vec<u8>,
42	/// When true, this program can only match valid UTF-8.
43	pub only_utf8: bool,
44	/// When true, this program uses byte range instructions instead of Unicode
45	/// range instructions.
46	pub is_bytes: bool,
47	/// When true, the program is compiled for DFA matching. For example, this
48	/// implies `is_bytes` and also inserts a preceding `.?` for unanchored*
49	/// regexes.
50	pub is_dfa: bool,
51	/// When true, the program matches text in reverse (for use only in the
52	/// DFA).
53	pub is_reverse: bool,
54	/// Whether the regex must match from the start of the input.
55	pub is_anchored_start: bool,
56	/// Whether the regex must match at the end of the input.
57	pub is_anchored_end: bool,
58	/// Whether this program contains a Unicode word boundary instruction.
59	pub has_unicode_word_boundary: bool,
60	/// A possibly empty machine for very quickly matching prefix literals.
61	pub prefixes: LiteralSearcher,
62	/// A limit on the size of the cache that the DFA is allowed to use while
63	/// matching.
64	///
65	/// The cache limit specifies approximately how much space we're willing to
66	/// give to the state cache. Once the state cache exceeds the size, it is
67	/// wiped and all states must be re-computed.
68	///
69	/// Note that this value does not impact correctness. It can be set to 0
70	/// and the DFA will run just fine. (It will only ever store exactly one
71	/// state in the cache, and will likely run very slowly, but it will work.)
72	///
73	/// Also note that this limit is per thread of execution. That is,
74	/// if the same regex is used to search text across multiple threads
75	/// simultaneously, then the DFA cache is not shared. Instead, copies are
76	/// made.
77	pub dfa_size_limit: usize,
78	}
79
80	impl Program {
81	/// Creates an empty instruction sequence. Fields are given default
82	/// values.
83	pub fn new() -> Self {
84	Program {
85	insts: vec![],
86	matches: vec![],
87	captures: vec![],
88	capture_name_idx: Arc::new(HashMap::new()),
89	static_captures_len: None,
90	start: `0`,
91	byte_classes: vec![`0`; `256`],
92	only_utf8: `true`,
93	is_bytes: `false`,
94	is_dfa: `false`,
95	is_reverse: `false`,
96	is_anchored_start: `false`,
97	is_anchored_end: `false`,
98	has_unicode_word_boundary: `false`,
99	prefixes: LiteralSearcher::empty(),
100	dfa_size_limit: `2` * (`1` << `20`),
101	}
102	}
103
104	/// If pc is an index to a no-op instruction (like Save), then return the
105	/// next pc that is not a no-op instruction.
106	pub fn skip(&self, mut pc: usize) -> usize {
107	loop {
108	match self[pc] {
109	Inst::Save(ref i) => pc = i.goto,
110	_ => return pc,
111	}
112	}
113	}
114
115	/// Return true if and only if an execution engine at instruction `pc` will
116	/// always lead to a match.
117	pub fn leads_to_match(&self, pc: usize) -> bool {
118	if self.matches.len() > `1` {
119	// If we have a regex set, then we have more than one ending
120	// state, so leading to one of those states is generally
121	// meaningless.
122	return `false`;
123	}
124	match self[self.skip(pc)] {
125	Inst::Match(_) => `true`,
126	_ => `false`,
127	}
128	}
129
130	/// Returns true if the current configuration demands that an implicit
131	/// `.?` be prepended to the instruction sequence.*
132	pub fn needs_dotstar(&self) -> bool {
133	self.is_dfa && !self.is_reverse && !self.is_anchored_start
134	}
135
136	/// Returns true if this program uses Byte instructions instead of
137	/// Char/Range instructions.
138	pub fn uses_bytes(&self) -> bool {
139	self.is_bytes \|\| self.is_dfa
140	}
141
142	/// Returns true if this program exclusively matches valid UTF-8 bytes.
143	///
144	/// That is, if an invalid UTF-8 byte is seen, then no match is possible.
145	pub fn only_utf8(&self) -> bool {
146	self.only_utf8
147	}
148
149	/// Return the approximate heap usage of this instruction sequence in
150	/// bytes.
151	pub fn approximate_size(&self) -> usize {
152	// The only instruction that uses heap space is Ranges (for
153	// Unicode codepoint programs) to store non-overlapping codepoint
154	// ranges. To keep this operation constant time, we ignore them.
155	(self.len() * mem::size_of::<Inst>())
156	+ (self.matches.len() * mem::size_of::<InstPtr>())
157	+ (self.captures.len() * mem::size_of::<Option<String>>())
158	+ (self.capture_name_idx.len()
159	* (mem::size_of::<String>() + mem::size_of::<usize>()))
160	+ (self.byte_classes.len() * mem::size_of::<u8>())
161	+ self.prefixes.approximate_size()
162	}
163	}
164
165	impl Deref for Program {
166	type Target = [Inst];
167
168	#[cfg_attr(feature = "perf-inline", inline(always))]
169	fn deref(&self) -> &Self::Target {
170	&*self.insts
171	}
172	}
173
174	impl fmt::Debug for Program {
175	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
176	use self::Inst::*;
177
178	fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
179	if goto == cur + `1` {
180	fmtd
181	} else {
182	format!("{} (goto: {})", fmtd, goto)
183	}
184	}
185
186	fn visible_byte(b: u8) -> String {
187	use std::ascii::escape_default;
188	let escaped = escape_default(b).collect::<Vec<u8>>();
189	String::from_utf8_lossy(&escaped).into_owned()
190	}
191
192	for (pc, inst) in self.iter().enumerate() {
193	match *inst {
194	Match(slot) => write!(f, "{:`04`} Match({:?})", pc, slot)?,
195	Save(ref inst) => {
196	let s = format!("{:`04`} Save({})", pc, inst.slot);
197	write!(f, "{}", with_goto(pc, inst.goto, s))?;
198	}
199	Split(ref inst) => {
200	write!(
201	f,
202	"{:`04`} Split({}, {})",
203	pc, inst.goto1, inst.goto2
204	)?;
205	}
206	EmptyLook(ref inst) => {
207	let s = format!("{:?}", inst.look);
208	write!(f, "{:`04`} {}", pc, with_goto(pc, inst.goto, s))?;
209	}
210	Char(ref inst) => {
211	let s = format!("{:?}", inst.c);
212	write!(f, "{:`04`} {}", pc, with_goto(pc, inst.goto, s))?;
213	}
214	Ranges(ref inst) => {
215	let ranges = inst
216	.ranges
217	.iter()
218	.map(\|r\| format!("{:?}-{:?}", r.0, r.1))
219	.collect::<Vec<String>>()
220	.join(", ");
221	write!(
222	f,
223	"{:`04`} {}",
224	pc,
225	with_goto(pc, inst.goto, ranges)
226	)?;
227	}
228	Bytes(ref inst) => {
229	let s = format!(
230	"Bytes({}, {})",
231	visible_byte(inst.start),
232	visible_byte(inst.end)
233	);
234	write!(f, "{:`04`} {}", pc, with_goto(pc, inst.goto, s))?;
235	}
236	}
237	if pc == self.start {
238	write!(f, " (start)")?;
239	}
240	writeln!(f)?;
241	}
242	Ok(())
243	}
244	}
245
246	impl<'a> IntoIterator for &'a Program {
247	type Item = &'a Inst;
248	type IntoIter = slice::Iter<'a, Inst>;
249	fn into_iter(self) -> Self::IntoIter {
250	self.iter()
251	}
252	}
253
254	/// Inst is an instruction code in a Regex program.
255	///
256	/// Regrettably, a regex program either contains Unicode codepoint
257	/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
258	/// A regex program can never contain both.
259	///
260	/// It would be worth investigating splitting this into two distinct types and
261	/// then figuring out how to make the matching engines polymorphic over those
262	/// types without sacrificing performance.
263	///
264	/// Other than the benefit of moving invariants into the type system, another
265	/// benefit is the decreased size. If we remove the `Char` and `Ranges`
266	/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
267	/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
268	/// variant.) Given that byte based machines are typically much bigger than
269	/// their Unicode analogues (because they can decode UTF-8 directly), this ends
270	/// up being a pretty significant savings.
271	#[derive(Clone, Debug)]
272	pub enum Inst {
273	/// Match indicates that the program has reached a match state.
274	///
275	/// The number in the match corresponds to the Nth logical regular
276	/// expression in this program. This index is always 0 for normal regex
277	/// programs. Values greater than 0 appear when compiling regex sets, and
278	/// each match instruction gets its own unique value. The value corresponds
279	/// to the Nth regex in the set.
280	Match(usize),
281	/// Save causes the program to save the current location of the input in
282	/// the slot indicated by InstSave.
283	Save(InstSave),
284	/// Split causes the program to diverge to one of two paths in the
285	/// program, preferring goto1 in InstSplit.
286	Split(InstSplit),
287	/// EmptyLook represents a zero-width assertion in a regex program. A
288	/// zero-width assertion does not consume any of the input text.
289	EmptyLook(InstEmptyLook),
290	/// Char requires the regex program to match the character in InstChar at
291	/// the current position in the input.
292	Char(InstChar),
293	/// Ranges requires the regex program to match the character at the current
294	/// position in the input with one of the ranges specified in InstRanges.
295	Ranges(InstRanges),
296	/// Bytes is like Ranges, except it expresses a single byte range. It is
297	/// used in conjunction with Split instructions to implement multi-byte
298	/// character classes.
299	Bytes(InstBytes),
300	}
301
302	impl Inst {
303	/// Returns true if and only if this is a match instruction.
304	pub fn is_match(&self) -> bool {
305	match *self {
306	Inst::Match(_) => `true`,
307	_ => `false`,
308	}
309	}
310	}
311
312	/// Representation of the Save instruction.
313	#[derive(Clone, Debug)]
314	pub struct InstSave {
315	/// The next location to execute in the program.
316	pub goto: InstPtr,
317	/// The capture slot (there are two slots for every capture in a regex,
318	/// including the zeroth capture for the entire match).
319	pub slot: usize,
320	}
321
322	/// Representation of the Split instruction.
323	#[derive(Clone, Debug)]
324	pub struct InstSplit {
325	/// The first instruction to try. A match resulting from following goto1
326	/// has precedence over a match resulting from following goto2.
327	pub goto1: InstPtr,
328	/// The second instruction to try. A match resulting from following goto1
329	/// has precedence over a match resulting from following goto2.
330	pub goto2: InstPtr,
331	}
332
333	/// Representation of the `EmptyLook` instruction.
334	#[derive(Clone, Debug)]
335	pub struct InstEmptyLook {
336	/// The next location to execute in the program if this instruction
337	/// succeeds.
338	pub goto: InstPtr,
339	/// The type of zero-width assertion to check.
340	pub look: EmptyLook,
341	}
342
343	/// The set of zero-width match instructions.
344	#[derive(Clone, Copy, Debug, PartialEq, Eq)]
345	pub enum EmptyLook {
346	/// Start of line or input.
347	StartLine,
348	/// End of line or input.
349	EndLine,
350	/// Start of input.
351	StartText,
352	/// End of input.
353	EndText,
354	/// Word character on one side and non-word character on other.
355	WordBoundary,
356	/// Word character on both sides or non-word character on both sides.
357	NotWordBoundary,
358	/// ASCII word boundary.
359	WordBoundaryAscii,
360	/// Not ASCII word boundary.
361	NotWordBoundaryAscii,
362	}
363
364	/// Representation of the Char instruction.
365	#[derive(Clone, Debug)]
366	pub struct InstChar {
367	/// The next location to execute in the program if this instruction
368	/// succeeds.
369	pub goto: InstPtr,
370	/// The character to test.
371	pub c: char,
372	}
373
374	/// Representation of the Ranges instruction.
375	#[derive(Clone, Debug)]
376	pub struct InstRanges {
377	/// The next location to execute in the program if this instruction
378	/// succeeds.
379	pub goto: InstPtr,
380	/// The set of Unicode scalar value ranges to test.
381	pub ranges: Box<[(char, char)]>,
382	}
383
384	impl InstRanges {
385	/// Tests whether the given input character matches this instruction.
386	pub fn matches(&self, c: Char) -> bool {
387	// This speeds up the `match_class_unicode` benchmark by checking
388	// some common cases quickly without binary search. e.g., Matching
389	// a Unicode class on predominantly ASCII text.
390	for r in self.ranges.iter().take(`4`) {
391	if c < r.0 {
392	return `false`;
393	}
394	if c <= r.1 {
395	return `true`;
396	}
397	}
398	self.ranges
399	.binary_search_by(\|r\| {
400	if r.1 < c {
401	Ordering::Less
402	} else if r.0 > c {
403	Ordering::Greater
404	} else {
405	Ordering::Equal
406	}
407	})
408	.is_ok()
409	}
410
411	/// Return the number of distinct characters represented by all of the
412	/// ranges.
413	pub fn num_chars(&self) -> usize {
414	self.ranges
415	.iter()
416	.map(\|&(s, e)\| `1` + (e as u32) - (s as u32))
417	.sum::<u32>() as usize
418	}
419	}
420
421	/// Representation of the Bytes instruction.
422	#[derive(Clone, Debug)]
423	pub struct InstBytes {
424	/// The next location to execute in the program if this instruction
425	/// succeeds.
426	pub goto: InstPtr,
427	/// The start (inclusive) of this byte range.
428	pub start: u8,
429	/// The end (inclusive) of this byte range.
430	pub end: u8,
431	}
432
433	impl InstBytes {
434	/// Returns true if and only if the given byte is in this range.
435	pub fn matches(&self, byte: u8) -> bool {
436	self.start <= byte && byte <= self.end
437	}
438	}
439
440	#[cfg(test)]
441	mod test {
442	#[test]
443	#[cfg(target_pointer_width = "64")]
444	fn test_size_of_inst() {
445	use std::mem::size_of;
446
447	use super::Inst;
448
449	assert_eq!(`32`, size_of::<Inst>());
450	}
451	}
452