compile.rs source code [crates/regex-1.8.4/src/compile.rs]

1	use std::collections::HashMap;
2	use std::fmt;
3	use std::iter;
4	use std::result;
5	use std::sync::Arc;
6
7	use regex_syntax::hir::{self, Hir, Look};
8	use regex_syntax::is_word_byte;
9	use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11	use crate::prog::{
12	EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13	InstSave, InstSplit, Program,
14	};
15
16	use crate::Error;
17
18	type Result = result::Result<Patch, Error>;
19	type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
21	#[derive(Debug)]
22	struct Patch {
23	hole: Hole,
24	entry: InstPtr,
25	}
26
27	/// A compiler translates a regular expression AST to a sequence of
28	/// instructions. The sequence of instructions represents an NFA.
29	// `Compiler` is only public via the `internal` module, so avoid deriving
30	// `Debug`.
31	#[allow(missing_debug_implementations)]
32	pub struct Compiler {
33	insts: Vec<MaybeInst>,
34	compiled: Program,
35	capture_name_idx: HashMap<String, usize>,
36	num_exprs: usize,
37	size_limit: usize,
38	suffix_cache: SuffixCache,
39	utf8_seqs: Option<Utf8Sequences>,
40	byte_classes: ByteClassSet,
41	// This keeps track of extra bytes allocated while compiling the regex
42	// program. Currently, this corresponds to two things. First is the heap
43	// memory allocated by Unicode character classes ('InstRanges'). Second is
44	// a "fake" amount of memory used by empty sub-expressions, so that enough
45	// empty sub-expressions will ultimately trigger the compiler to bail
46	// because of a size limit restriction. (That empty sub-expressions don't
47	// add to heap memory usage is more-or-less an implementation detail.) In
48	// the second case, if we don't bail, then an excessively large repetition
49	// on an empty sub-expression can result in the compiler using a very large
50	// amount of CPU time.
51	extra_inst_bytes: usize,
52	}
53
54	impl Compiler {
55	/// Create a new regular expression compiler.
56	///
57	/// Various options can be set before calling `compile` on an expression.
58	pub fn new() -> Self {
59	Compiler {
60	insts: vec![],
61	compiled: Program::new(),
62	capture_name_idx: HashMap::new(),
63	num_exprs: `0`,
64	size_limit: `10` * (`1` << `20`),
65	suffix_cache: SuffixCache::new(`1000`),
66	utf8_seqs: Some(Utf8Sequences::new('`\x00`', '`\x00`')),
67	byte_classes: ByteClassSet::new(),
68	extra_inst_bytes: `0`,
69	}
70	}
71
72	/// The size of the resulting program is limited by size_limit. If
73	/// the program approximately exceeds the given size (in bytes), then
74	/// compilation will stop and return an error.
75	pub fn size_limit(mut self, size_limit: usize) -> Self {
76	self.size_limit = size_limit;
77	self
78	}
79
80	/// If bytes is true, then the program is compiled as a byte based
81	/// automaton, which incorporates UTF-8 decoding into the machine. If it's
82	/// false, then the automaton is Unicode scalar value based, e.g., an
83	/// engine utilizing such an automaton is responsible for UTF-8 decoding.
84	///
85	/// The specific invariant is that when returning a byte based machine,
86	/// the neither the `Char` nor `Ranges` instructions are produced.
87	/// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88	/// instruction is never produced.
89	///
90	/// Note that `dfa(true)` implies `bytes(true)`.
91	pub fn bytes(mut self, yes: bool) -> Self {
92	self.compiled.is_bytes = yes;
93	self
94	}
95
96	/// When disabled, the program compiled may match arbitrary bytes.
97	///
98	/// When enabled (the default), all compiled programs exclusively match
99	/// valid UTF-8 bytes.
100	pub fn only_utf8(mut self, yes: bool) -> Self {
101	self.compiled.only_utf8 = yes;
102	self
103	}
104
105	/// When set, the machine returned is suitable for use in the DFA matching
106	/// engine.
107	///
108	/// In particular, this ensures that if the regex is not anchored in the
109	/// beginning, then a preceding `.?` is included in the program. (The NFA*
110	/// based engines handle the preceding `.?` explicitly, which is difficult*
111	/// or impossible in the DFA engine.)
112	pub fn dfa(mut self, yes: bool) -> Self {
113	self.compiled.is_dfa = yes;
114	self
115	}
116
117	/// When set, the machine returned is suitable for matching text in
118	/// reverse. In particular, all concatenations are flipped.
119	pub fn reverse(mut self, yes: bool) -> Self {
120	self.compiled.is_reverse = yes;
121	self
122	}
123
124	/// Compile a regular expression given its AST.
125	///
126	/// The compiler is guaranteed to succeed unless the program exceeds the
127	/// specified size limit. If the size limit is exceeded, then compilation
128	/// stops and returns an error.
129	pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130	debug_assert!(!exprs.is_empty());
131	self.num_exprs = exprs.len();
132	if exprs.len() == `1` {
133	self.compile_one(&exprs[`0`])
134	} else {
135	self.compile_many(exprs)
136	}
137	}
138
139	fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140	if self.compiled.only_utf8
141	&& expr.properties().look_set().contains(Look::WordAsciiNegate)
142	{
143	return Err(Error::Syntax(
144	"ASCII-only `\\`B is not allowed in Unicode regexes \
145	because it may result in invalid UTF-8 matches"
146	.to_string(),
147	));
148	}
149	// If we're compiling a forward DFA and we aren't anchored, then
150	// add a `.?` before the first capture group.*
151	// Other matching engines handle this by baking the logic into the
152	// matching engine itself.
153	let mut dotstar_patch = Patch { hole: Hole::None, entry: `0` };
154	self.compiled.is_anchored_start =
155	expr.properties().look_set_prefix().contains(Look::Start);
156	self.compiled.is_anchored_end =
157	expr.properties().look_set_suffix().contains(Look::End);
158	if self.compiled.needs_dotstar() {
159	dotstar_patch = self.c_dotstar()?;
160	self.compiled.start = dotstar_patch.entry;
161	}
162	self.compiled.captures = vec![None];
163	let patch =
164	self.c_capture(`0`, expr)?.unwrap_or_else(\|\| self.next_inst());
165	if self.compiled.needs_dotstar() {
166	self.fill(dotstar_patch.hole, patch.entry);
167	} else {
168	self.compiled.start = patch.entry;
169	}
170	self.fill_to_next(patch.hole);
171	self.compiled.matches = vec![self.insts.len()];
172	self.push_compiled(Inst::Match(`0`));
173	self.compiled.static_captures_len =
174	expr.properties().static_explicit_captures_len();
175	self.compile_finish()
176	}
177
178	fn compile_many(
179	mut self,
180	exprs: &[Hir],
181	) -> result::Result<Program, Error> {
182	debug_assert!(exprs.len() > `1`);
183
184	self.compiled.is_anchored_start = exprs
185	.iter()
186	.all(\|e\| e.properties().look_set_prefix().contains(Look::Start));
187	self.compiled.is_anchored_end = exprs
188	.iter()
189	.all(\|e\| e.properties().look_set_suffix().contains(Look::End));
190	let mut dotstar_patch = Patch { hole: Hole::None, entry: `0` };
191	if self.compiled.needs_dotstar() {
192	dotstar_patch = self.c_dotstar()?;
193	self.compiled.start = dotstar_patch.entry;
194	} else {
195	self.compiled.start = `0`; // first instruction is always split
196	}
197	self.fill_to_next(dotstar_patch.hole);
198
199	let mut prev_hole = Hole::None;
200	for (i, expr) in exprs[`0`..exprs.len() - `1`].iter().enumerate() {
201	self.fill_to_next(prev_hole);
202	let split = self.push_split_hole();
203	let Patch { hole, entry } =
204	self.c_capture(`0`, expr)?.unwrap_or_else(\|\| self.next_inst());
205	self.fill_to_next(hole);
206	self.compiled.matches.push(self.insts.len());
207	self.push_compiled(Inst::Match(i));
208	prev_hole = self.fill_split(split, Some(entry), None);
209	}
210	let i = exprs.len() - `1`;
211	let Patch { hole, entry } =
212	self.c_capture(`0`, &exprs[i])?.unwrap_or_else(\|\| self.next_inst());
213	self.fill(prev_hole, entry);
214	self.fill_to_next(hole);
215	self.compiled.matches.push(self.insts.len());
216	self.push_compiled(Inst::Match(i));
217	self.compile_finish()
218	}
219
220	fn compile_finish(mut self) -> result::Result<Program, Error> {
221	self.compiled.insts =
222	self.insts.into_iter().map(\|inst\| inst.unwrap()).collect();
223	self.compiled.byte_classes = self.byte_classes.byte_classes();
224	self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
225	Ok(self.compiled)
226	}
227
228	/// Compile expr into self.insts, returning a patch on success,
229	/// or an error if we run out of memory.
230	///
231	/// All of the c_ methods of the compiler share the contract outlined*
232	/// here.
233	///
234	/// The main thing that a c_ method does is mutate `self.insts`*
235	/// to add a list of mostly compiled instructions required to execute
236	/// the given expression. `self.insts` contains MaybeInsts rather than
237	/// Insts because there is some backpatching required.
238	///
239	/// The `Patch` value returned by each c_ method provides metadata*
240	/// about the compiled instructions emitted to `self.insts`. The
241	/// `entry` member of the patch refers to the first instruction
242	/// (the entry point), while the `hole` member contains zero or
243	/// more offsets to partial instructions that need to be backpatched.
244	/// The c_ routine can't know where its list of instructions are going to*
245	/// jump to after execution, so it is up to the caller to patch
246	/// these jumps to point to the right place. So compiling some
247	/// expression, e, we would end up with a situation that looked like:
248	///
249	/// ```text
250	/// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
251	/// ^ ^ ^
252	/// \| \ /
253	/// entry \ /
254	/// hole
255	/// ```
256	///
257	/// To compile two expressions, e1 and e2, concatenated together we
258	/// would do:
259	///
260	/// ```ignore
261	/// let patch1 = self.c(e1);
262	/// let patch2 = self.c(e2);
263	/// ```
264	///
265	/// while leaves us with a situation that looks like
266	///
267	/// ```text
268	/// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
269	/// ^ ^ ^ ^
270	/// \| \| \| \|
271	/// entry1 hole1 entry2 hole2
272	/// ```
273	///
274	/// Then to merge the two patches together into one we would backpatch
275	/// hole1 with entry2 and return a new patch that enters at entry1
276	/// and has hole2 for a hole. In fact, if you look at the c_concat
277	/// method you will see that it does exactly this, though it handles
278	/// a list of expressions rather than just the two that we use for
279	/// an example.
280	///
281	/// Ok(None) is returned when an expression is compiled to no
282	/// instruction, and so no patch.entry value makes sense.
283	fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
284	use crate::prog;
285	use regex_syntax::hir::HirKind::*;
286
287	self.check_size()?;
288	match *expr.kind() {
289	Empty => self.c_empty(),
290	Literal(hir::Literal(ref bytes)) => {
291	if self.compiled.is_reverse {
292	let mut bytes = bytes.to_vec();
293	bytes.reverse();
294	self.c_literal(&bytes)
295	} else {
296	self.c_literal(bytes)
297	}
298	}
299	Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
300	Class(hir::Class::Bytes(ref cls)) => {
301	if self.compiled.uses_bytes() {
302	self.c_class_bytes(cls.ranges())
303	} else {
304	assert!(cls.is_ascii());
305	let mut char_ranges = vec![];
306	for r in cls.iter() {
307	let (s, e) = (r.start() as char, r.end() as char);
308	char_ranges.push(hir::ClassUnicodeRange::new(s, e));
309	}
310	self.c_class(&char_ranges)
311	}
312	}
313	Look(ref look) => match *look {
314	hir::Look::Start if self.compiled.is_reverse => {
315	self.c_empty_look(prog::EmptyLook::EndText)
316	}
317	hir::Look::Start => {
318	self.c_empty_look(prog::EmptyLook::StartText)
319	}
320	hir::Look::End if self.compiled.is_reverse => {
321	self.c_empty_look(prog::EmptyLook::StartText)
322	}
323	hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
324	hir::Look::StartLF if self.compiled.is_reverse => {
325	self.byte_classes.set_range(b'`\n`', b'`\n`');
326	self.c_empty_look(prog::EmptyLook::EndLine)
327	}
328	hir::Look::StartLF => {
329	self.byte_classes.set_range(b'`\n`', b'`\n`');
330	self.c_empty_look(prog::EmptyLook::StartLine)
331	}
332	hir::Look::EndLF if self.compiled.is_reverse => {
333	self.byte_classes.set_range(b'`\n`', b'`\n`');
334	self.c_empty_look(prog::EmptyLook::StartLine)
335	}
336	hir::Look::EndLF => {
337	self.byte_classes.set_range(b'`\n`', b'`\n`');
338	self.c_empty_look(prog::EmptyLook::EndLine)
339	}
340	hir::Look::StartCRLF \| hir::Look::EndCRLF => {
341	return Err(Error::Syntax(
342	"CRLF-aware line anchors are not supported yet"
343	.to_string(),
344	));
345	}
346	hir::Look::WordAscii => {
347	self.byte_classes.set_word_boundary();
348	self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
349	}
350	hir::Look::WordAsciiNegate => {
351	self.byte_classes.set_word_boundary();
352	self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
353	}
354	hir::Look::WordUnicode => {
355	if !cfg!(feature = "unicode-perl") {
356	return Err(Error::Syntax(
357	"Unicode word boundaries are unavailable when \
358	the unicode-perl feature is disabled"
359	.to_string(),
360	));
361	}
362	self.compiled.has_unicode_word_boundary = `true`;
363	self.byte_classes.set_word_boundary();
364	// We also make sure that all ASCII bytes are in a different
365	// class from non-ASCII bytes. Otherwise, it's possible for
366	// ASCII bytes to get lumped into the same class as non-ASCII
367	// bytes. This in turn may cause the lazy DFA to falsely start
368	// when it sees an ASCII byte that maps to a byte class with
369	// non-ASCII bytes. This ensures that never happens.
370	self.byte_classes.set_range(`0`, `0x7F`);
371	self.c_empty_look(prog::EmptyLook::WordBoundary)
372	}
373	hir::Look::WordUnicodeNegate => {
374	if !cfg!(feature = "unicode-perl") {
375	return Err(Error::Syntax(
376	"Unicode word boundaries are unavailable when \
377	the unicode-perl feature is disabled"
378	.to_string(),
379	));
380	}
381	self.compiled.has_unicode_word_boundary = `true`;
382	self.byte_classes.set_word_boundary();
383	// See comments above for why we set the ASCII range here.
384	self.byte_classes.set_range(`0`, `0x7F`);
385	self.c_empty_look(prog::EmptyLook::NotWordBoundary)
386	}
387	},
388	Capture(hir::Capture { index, ref name, ref sub }) => {
389	if index as usize >= self.compiled.captures.len() {
390	let name = match *name {
391	None => None,
392	Some(ref boxed_str) => Some(boxed_str.to_string()),
393	};
394	self.compiled.captures.push(name.clone());
395	if let Some(name) = name {
396	self.capture_name_idx.insert(name, index as usize);
397	}
398	}
399	self.c_capture(`2` * index as usize, sub)
400	}
401	Concat(ref es) => {
402	if self.compiled.is_reverse {
403	self.c_concat(es.iter().rev())
404	} else {
405	self.c_concat(es)
406	}
407	}
408	Alternation(ref es) => self.c_alternate(&**es),
409	Repetition(ref rep) => self.c_repeat(rep),
410	}
411	}
412
413	fn c_empty(&mut self) -> ResultOrEmpty {
414	// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
415	// See: CVE-2022-24713
416	//
417	// Since 'empty' sub-expressions don't increase the size of
418	// the actual compiled object, we "fake" an increase in its
419	// size so that our 'check_size_limit' routine will eventually
420	// stop compilation if there are too many empty sub-expressions
421	// (e.g., via a large repetition).
422	self.extra_inst_bytes += std::mem::size_of::<Inst>();
423	Ok(None)
424	}
425
426	fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
427	if self.num_exprs > `1` \|\| self.compiled.is_dfa {
428	// Don't ever compile Save instructions for regex sets because
429	// they are never used. They are also never used in DFA programs
430	// because DFAs can't handle captures.
431	self.c(expr)
432	} else {
433	let entry = self.insts.len();
434	let hole = self.push_hole(InstHole::Save { slot: first_slot });
435	let patch = self.c(expr)?.unwrap_or_else(\|\| self.next_inst());
436	self.fill(hole, patch.entry);
437	self.fill_to_next(patch.hole);
438	let hole = self.push_hole(InstHole::Save { slot: first_slot + `1` });
439	Ok(Some(Patch { hole, entry }))
440	}
441	}
442
443	fn c_dotstar(&mut self) -> Result {
444	let hir = if self.compiled.only_utf8() {
445	Hir::dot(hir::Dot::AnyChar)
446	} else {
447	Hir::dot(hir::Dot::AnyByte)
448	};
449	Ok(self
450	.c(&Hir::repetition(hir::Repetition {
451	min: `0`,
452	max: None,
453	greedy: `false`,
454	sub: Box::new(hir),
455	}))?
456	.unwrap())
457	}
458
459	fn c_char(&mut self, c: char) -> ResultOrEmpty {
460	if self.compiled.uses_bytes() {
461	if c.is_ascii() {
462	let b = c as u8;
463	let hole =
464	self.push_hole(InstHole::Bytes { start: b, end: b });
465	self.byte_classes.set_range(b, b);
466	Ok(Some(Patch { hole, entry: self.insts.len() - `1` }))
467	} else {
468	self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
469	}
470	} else {
471	let hole = self.push_hole(InstHole::Char { c });
472	Ok(Some(Patch { hole, entry: self.insts.len() - `1` }))
473	}
474	}
475
476	fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
477	use std::mem::size_of;
478
479	if ranges.is_empty() {
480	return Err(Error::Syntax(
481	"empty character classes are not allowed".to_string(),
482	));
483	}
484	if self.compiled.uses_bytes() {
485	Ok(Some(CompileClass { c: self, ranges }.compile()?))
486	} else {
487	let ranges: Vec<(char, char)> =
488	ranges.iter().map(\|r\| (r.start(), r.end())).collect();
489	let hole = if ranges.len() == `1` && ranges[`0`].0 == ranges[`0`].1 {
490	self.push_hole(InstHole::Char { c: ranges[`0`].0 })
491	} else {
492	self.extra_inst_bytes +=
493	ranges.len() * (size_of::<char>() * `2`);
494	self.push_hole(InstHole::Ranges { ranges })
495	};
496	Ok(Some(Patch { hole, entry: self.insts.len() - `1` }))
497	}
498	}
499
500	fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
501	self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
502	}
503
504	fn c_class_bytes(
505	&mut self,
506	ranges: &[hir::ClassBytesRange],
507	) -> ResultOrEmpty {
508	if ranges.is_empty() {
509	return Err(Error::Syntax(
510	"empty character classes are not allowed".to_string(),
511	));
512	}
513
514	let first_split_entry = self.insts.len();
515	let mut holes = vec![];
516	let mut prev_hole = Hole::None;
517	for r in &ranges[`0`..ranges.len() - `1`] {
518	self.fill_to_next(prev_hole);
519	let split = self.push_split_hole();
520	let next = self.insts.len();
521	self.byte_classes.set_range(r.start(), r.end());
522	holes.push(self.push_hole(InstHole::Bytes {
523	start: r.start(),
524	end: r.end(),
525	}));
526	prev_hole = self.fill_split(split, Some(next), None);
527	}
528	let next = self.insts.len();
529	let r = &ranges[ranges.len() - `1`];
530	self.byte_classes.set_range(r.start(), r.end());
531	holes.push(
532	self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
533	);
534	self.fill(prev_hole, next);
535	Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
536	}
537
538	fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
539	let hole = self.push_hole(InstHole::EmptyLook { look });
540	Ok(Some(Patch { hole, entry: self.insts.len() - `1` }))
541	}
542
543	fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
544	match core::str::from_utf8(bytes) {
545	Ok(string) => {
546	let mut it = string.chars();
547	let Patch { mut hole, entry } = loop {
548	match it.next() {
549	None => return self.c_empty(),
550	Some(ch) => {
551	if let Some(p) = self.c_char(ch)? {
552	break p;
553	}
554	}
555	}
556	};
557	for ch in it {
558	if let Some(p) = self.c_char(ch)? {
559	self.fill(hole, p.entry);
560	hole = p.hole;
561	}
562	}
563	Ok(Some(Patch { hole, entry }))
564	}
565	Err(_) => {
566	assert!(self.compiled.uses_bytes());
567	let mut it = bytes.iter().copied();
568	let Patch { mut hole, entry } = loop {
569	match it.next() {
570	None => return self.c_empty(),
571	Some(byte) => {
572	if let Some(p) = self.c_byte(byte)? {
573	break p;
574	}
575	}
576	}
577	};
578	for byte in it {
579	if let Some(p) = self.c_byte(byte)? {
580	self.fill(hole, p.entry);
581	hole = p.hole;
582	}
583	}
584	Ok(Some(Patch { hole, entry }))
585	}
586	}
587	}
588
589	fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
590	where
591	I: IntoIterator<Item = &'a Hir>,
592	{
593	let mut exprs = exprs.into_iter();
594	let Patch { mut hole, entry } = loop {
595	match exprs.next() {
596	None => return self.c_empty(),
597	Some(e) => {
598	if let Some(p) = self.c(e)? {
599	break p;
600	}
601	}
602	}
603	};
604	for e in exprs {
605	if let Some(p) = self.c(e)? {
606	self.fill(hole, p.entry);
607	hole = p.hole;
608	}
609	}
610	Ok(Some(Patch { hole, entry }))
611	}
612
613	fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
614	debug_assert!(
615	exprs.len() >= `2`,
616	"alternates must have at least 2 exprs"
617	);
618
619	// Initial entry point is always the first split.
620	let first_split_entry = self.insts.len();
621
622	// Save up all of the holes from each alternate. They will all get
623	// patched to point to the same location.
624	let mut holes = vec![];
625
626	// true indicates that the hole is a split where we want to fill
627	// the second branch.
628	let mut prev_hole = (Hole::None, `false`);
629	for e in &exprs[`0`..exprs.len() - `1`] {
630	if prev_hole.1 {
631	let next = self.insts.len();
632	self.fill_split(prev_hole.0, None, Some(next));
633	} else {
634	self.fill_to_next(prev_hole.0);
635	}
636	let split = self.push_split_hole();
637	if let Some(Patch { hole, entry }) = self.c(e)? {
638	holes.push(hole);
639	prev_hole = (self.fill_split(split, Some(entry), None), `false`);
640	} else {
641	let (split1, split2) = split.dup_one();
642	holes.push(split1);
643	prev_hole = (split2, `true`);
644	}
645	}
646	if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - `1`])? {
647	holes.push(hole);
648	if prev_hole.1 {
649	self.fill_split(prev_hole.0, None, Some(entry));
650	} else {
651	self.fill(prev_hole.0, entry);
652	}
653	} else {
654	// We ignore prev_hole.1. When it's true, it means we have two
655	// empty branches both pushing prev_hole.0 into holes, so both
656	// branches will go to the same place anyway.
657	holes.push(prev_hole.0);
658	}
659	Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
660	}
661
662	fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
663	match (rep.min, rep.max) {
664	(`0`, Some(`1`)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
665	(`0`, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
666	(`1`, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
667	(min, None) => {
668	self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
669	}
670	(min, Some(max)) => {
671	self.c_repeat_range(&rep.sub, rep.greedy, min, max)
672	}
673	}
674	}
675
676	fn c_repeat_zero_or_one(
677	&mut self,
678	expr: &Hir,
679	greedy: bool,
680	) -> ResultOrEmpty {
681	let split_entry = self.insts.len();
682	let split = self.push_split_hole();
683	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
684	Some(p) => p,
685	None => return self.pop_split_hole(),
686	};
687	let split_hole = if greedy {
688	self.fill_split(split, Some(entry_rep), None)
689	} else {
690	self.fill_split(split, None, Some(entry_rep))
691	};
692	let holes = vec![hole_rep, split_hole];
693	Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
694	}
695
696	fn c_repeat_zero_or_more(
697	&mut self,
698	expr: &Hir,
699	greedy: bool,
700	) -> ResultOrEmpty {
701	let split_entry = self.insts.len();
702	let split = self.push_split_hole();
703	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
704	Some(p) => p,
705	None => return self.pop_split_hole(),
706	};
707
708	self.fill(hole_rep, split_entry);
709	let split_hole = if greedy {
710	self.fill_split(split, Some(entry_rep), None)
711	} else {
712	self.fill_split(split, None, Some(entry_rep))
713	};
714	Ok(Some(Patch { hole: split_hole, entry: split_entry }))
715	}
716
717	fn c_repeat_one_or_more(
718	&mut self,
719	expr: &Hir,
720	greedy: bool,
721	) -> ResultOrEmpty {
722	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
723	Some(p) => p,
724	None => return Ok(None),
725	};
726	self.fill_to_next(hole_rep);
727	let split = self.push_split_hole();
728
729	let split_hole = if greedy {
730	self.fill_split(split, Some(entry_rep), None)
731	} else {
732	self.fill_split(split, None, Some(entry_rep))
733	};
734	Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
735	}
736
737	fn c_repeat_range_min_or_more(
738	&mut self,
739	expr: &Hir,
740	greedy: bool,
741	min: u32,
742	) -> ResultOrEmpty {
743	let min = u32_to_usize(min);
744	// Using next_inst() is ok, because we can't return it (concat would
745	// have to return Some(_) while c_repeat_range_min_or_more returns
746	// None).
747	let patch_concat = self
748	.c_concat(iter::repeat(expr).take(min))?
749	.unwrap_or_else(\|\| self.next_inst());
750	if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
751	self.fill(patch_concat.hole, patch_rep.entry);
752	Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
753	} else {
754	Ok(None)
755	}
756	}
757
758	fn c_repeat_range(
759	&mut self,
760	expr: &Hir,
761	greedy: bool,
762	min: u32,
763	max: u32,
764	) -> ResultOrEmpty {
765	let (min, max) = (u32_to_usize(min), u32_to_usize(max));
766	debug_assert!(min <= max);
767	let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
768	if min == max {
769	return Ok(patch_concat);
770	}
771	// Same reasoning as in c_repeat_range_min_or_more (we know that min <
772	// max at this point).
773	let patch_concat = patch_concat.unwrap_or_else(\|\| self.next_inst());
774	let initial_entry = patch_concat.entry;
775	// It is much simpler to compile, e.g., `a{2,5}` as:
776	//
777	// aaa?a?a?
778	//
779	// But you end up with a sequence of instructions like this:
780	//
781	// 0: 'a'
782	// 1: 'a',
783	// 2: split(3, 4)
784	// 3: 'a'
785	// 4: split(5, 6)
786	// 5: 'a'
787	// 6: split(7, 8)
788	// 7: 'a'
789	// 8: MATCH
790	//
791	// This is incredibly* inefficient because the splits end*
792	// up forming a chain, which has to be resolved everything a
793	// transition is followed.
794	let mut holes = vec![];
795	let mut prev_hole = patch_concat.hole;
796	for _ in min..max {
797	self.fill_to_next(prev_hole);
798	let split = self.push_split_hole();
799	let Patch { hole, entry } = match self.c(expr)? {
800	Some(p) => p,
801	None => return self.pop_split_hole(),
802	};
803	prev_hole = hole;
804	if greedy {
805	holes.push(self.fill_split(split, Some(entry), None));
806	} else {
807	holes.push(self.fill_split(split, None, Some(entry)));
808	}
809	}
810	holes.push(prev_hole);
811	Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
812	}
813
814	/// Can be used as a default value for the c_ functions when the call to*
815	/// c_function is followed by inserting at least one instruction that is
816	/// always executed after the ones written by the c function.*
817	fn next_inst(&self) -> Patch {
818	Patch { hole: Hole::None, entry: self.insts.len() }
819	}
820
821	fn fill(&mut self, hole: Hole, goto: InstPtr) {
822	match hole {
823	Hole::None => {}
824	Hole::One(pc) => {
825	self.insts[pc].fill(goto);
826	}
827	Hole::Many(holes) => {
828	for hole in holes {
829	self.fill(hole, goto);
830	}
831	}
832	}
833	}
834
835	fn fill_to_next(&mut self, hole: Hole) {
836	let next = self.insts.len();
837	self.fill(hole, next);
838	}
839
840	fn fill_split(
841	&mut self,
842	hole: Hole,
843	goto1: Option<InstPtr>,
844	goto2: Option<InstPtr>,
845	) -> Hole {
846	match hole {
847	Hole::None => Hole::None,
848	Hole::One(pc) => match (goto1, goto2) {
849	(Some(goto1), Some(goto2)) => {
850	self.insts[pc].fill_split(goto1, goto2);
851	Hole::None
852	}
853	(Some(goto1), None) => {
854	self.insts[pc].half_fill_split_goto1(goto1);
855	Hole::One(pc)
856	}
857	(None, Some(goto2)) => {
858	self.insts[pc].half_fill_split_goto2(goto2);
859	Hole::One(pc)
860	}
861	(None, None) => unreachable!(
862	"at least one of the split \
863	holes must be filled"
864	),
865	},
866	Hole::Many(holes) => {
867	let mut new_holes = vec![];
868	for hole in holes {
869	new_holes.push(self.fill_split(hole, goto1, goto2));
870	}
871	if new_holes.is_empty() {
872	Hole::None
873	} else if new_holes.len() == `1` {
874	new_holes.pop().unwrap()
875	} else {
876	Hole::Many(new_holes)
877	}
878	}
879	}
880	}
881
882	fn push_compiled(&mut self, inst: Inst) {
883	self.insts.push(MaybeInst::Compiled(inst));
884	}
885
886	fn push_hole(&mut self, inst: InstHole) -> Hole {
887	let hole = self.insts.len();
888	self.insts.push(MaybeInst::Uncompiled(inst));
889	Hole::One(hole)
890	}
891
892	fn push_split_hole(&mut self) -> Hole {
893	let hole = self.insts.len();
894	self.insts.push(MaybeInst::Split);
895	Hole::One(hole)
896	}
897
898	fn pop_split_hole(&mut self) -> ResultOrEmpty {
899	self.insts.pop();
900	Ok(None)
901	}
902
903	fn check_size(&self) -> result::Result<(), Error> {
904	use std::mem::size_of;
905
906	let size =
907	self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
908	if size > self.size_limit {
909	Err(Error::CompiledTooBig(self.size_limit))
910	} else {
911	Ok(())
912	}
913	}
914	}
915
916	#[derive(Debug)]
917	enum Hole {
918	None,
919	One(InstPtr),
920	Many(Vec<Hole>),
921	}
922
923	impl Hole {
924	fn dup_one(self) -> (Self, Self) {
925	match self {
926	Hole::One(pc: usize) => (Hole::One(pc), Hole::One(pc)),
927	Hole::None \| Hole::Many(_) => {
928	unreachable!("must be called on single hole")
929	}
930	}
931	}
932	}
933
934	#[derive(Clone, Debug)]
935	enum MaybeInst {
936	Compiled(Inst),
937	Uncompiled(InstHole),
938	Split,
939	Split1(InstPtr),
940	Split2(InstPtr),
941	}
942
943	impl MaybeInst {
944	fn fill(&mut self, goto: InstPtr) {
945	let maybeinst = match *self {
946	MaybeInst::Split => MaybeInst::Split1(goto),
947	MaybeInst::Uncompiled(ref inst) => {
948	MaybeInst::Compiled(inst.fill(goto))
949	}
950	MaybeInst::Split1(goto1) => {
951	MaybeInst::Compiled(Inst::Split(InstSplit {
952	goto1,
953	goto2: goto,
954	}))
955	}
956	MaybeInst::Split2(goto2) => {
957	MaybeInst::Compiled(Inst::Split(InstSplit {
958	goto1: goto,
959	goto2,
960	}))
961	}
962	_ => unreachable!(
963	"not all instructions were compiled! \
964	found uncompiled instruction: {:?}",
965	self
966	),
967	};
968	*self = maybeinst;
969	}
970
971	fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
972	let filled = match *self {
973	MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
974	_ => unreachable!(
975	"must be called on Split instruction, \
976	instead it was called on: {:?}",
977	self
978	),
979	};
980	*self = MaybeInst::Compiled(filled);
981	}
982
983	fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
984	let half_filled = match *self {
985	MaybeInst::Split => goto1,
986	_ => unreachable!(
987	"must be called on Split instruction, \
988	instead it was called on: {:?}",
989	self
990	),
991	};
992	*self = MaybeInst::Split1(half_filled);
993	}
994
995	fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
996	let half_filled = match *self {
997	MaybeInst::Split => goto2,
998	_ => unreachable!(
999	"must be called on Split instruction, \
1000	instead it was called on: {:?}",
1001	self
1002	),
1003	};
1004	*self = MaybeInst::Split2(half_filled);
1005	}
1006
1007	fn unwrap(self) -> Inst {
1008	match self {
1009	MaybeInst::Compiled(inst) => inst,
1010	_ => unreachable!(
1011	"must be called on a compiled instruction, \
1012	instead it was called on: {:?}",
1013	self
1014	),
1015	}
1016	}
1017	}
1018
1019	#[derive(Clone, Debug)]
1020	enum InstHole {
1021	Save { slot: usize },
1022	EmptyLook { look: EmptyLook },
1023	Char { c: char },
1024	Ranges { ranges: Vec<(char, char)> },
1025	Bytes { start: u8, end: u8 },
1026	}
1027
1028	impl InstHole {
1029	fn fill(&self, goto: InstPtr) -> Inst {
1030	match *self {
1031	InstHole::Save { slot: usize } => Inst::Save(InstSave { goto, slot }),
1032	InstHole::EmptyLook { look: EmptyLook } => {
1033	Inst::EmptyLook(InstEmptyLook { goto, look })
1034	}
1035	InstHole::Char { c: char } => Inst::Char(InstChar { goto, c }),
1036	InstHole::Ranges { ref ranges: &Vec<(char, char)> } => Inst::Ranges(InstRanges {
1037	goto,
1038	ranges: ranges.clone().into_boxed_slice(),
1039	}),
1040	InstHole::Bytes { start: u8, end: u8 } => {
1041	Inst::Bytes(InstBytes { goto, start, end })
1042	}
1043	}
1044	}
1045	}
1046
1047	struct CompileClass<'a, 'b> {
1048	c: &'a mut Compiler,
1049	ranges: &'b [hir::ClassUnicodeRange],
1050	}
1051
1052	impl<'a, 'b> CompileClass<'a, 'b> {
1053	fn compile(mut self) -> Result {
1054	let mut holes = vec![];
1055	let mut initial_entry = None;
1056	let mut last_split = Hole::None;
1057	let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
1058	self.c.suffix_cache.clear();
1059
1060	for (i, range) in self.ranges.iter().enumerate() {
1061	let is_last_range = i + `1` == self.ranges.len();
1062	utf8_seqs.reset(range.start(), range.end());
1063	let mut it = (&mut utf8_seqs).peekable();
1064	loop {
1065	let utf8_seq = match it.next() {
1066	None => break,
1067	Some(utf8_seq) => utf8_seq,
1068	};
1069	if is_last_range && it.peek().is_none() {
1070	let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1071	holes.push(hole);
1072	self.c.fill(last_split, entry);
1073	last_split = Hole::None;
1074	if initial_entry.is_none() {
1075	initial_entry = Some(entry);
1076	}
1077	} else {
1078	if initial_entry.is_none() {
1079	initial_entry = Some(self.c.insts.len());
1080	}
1081	self.c.fill_to_next(last_split);
1082	last_split = self.c.push_split_hole();
1083	let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1084	holes.push(hole);
1085	last_split =
1086	self.c.fill_split(last_split, Some(entry), None);
1087	}
1088	}
1089	}
1090	self.c.utf8_seqs = Some(utf8_seqs);
1091	Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1092	}
1093
1094	fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1095	if self.c.compiled.is_reverse {
1096	self.c_utf8_seq_(seq)
1097	} else {
1098	self.c_utf8_seq_(seq.into_iter().rev())
1099	}
1100	}
1101
1102	fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1103	where
1104	I: IntoIterator<Item = &'r Utf8Range>,
1105	{
1106	// The initial instruction for each UTF-8 sequence should be the same.
1107	let mut from_inst = ::std::usize::MAX;
1108	let mut last_hole = Hole::None;
1109	for byte_range in seq {
1110	let key = SuffixCacheKey {
1111	from_inst,
1112	start: byte_range.start,
1113	end: byte_range.end,
1114	};
1115	{
1116	let pc = self.c.insts.len();
1117	if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1118	from_inst = cached_pc;
1119	continue;
1120	}
1121	}
1122	self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1123	if from_inst == ::std::usize::MAX {
1124	last_hole = self.c.push_hole(InstHole::Bytes {
1125	start: byte_range.start,
1126	end: byte_range.end,
1127	});
1128	} else {
1129	self.c.push_compiled(Inst::Bytes(InstBytes {
1130	goto: from_inst,
1131	start: byte_range.start,
1132	end: byte_range.end,
1133	}));
1134	}
1135	from_inst = self.c.insts.len().checked_sub(`1`).unwrap();
1136	debug_assert!(from_inst < ::std::usize::MAX);
1137	}
1138	debug_assert!(from_inst < ::std::usize::MAX);
1139	Ok(Patch { hole: last_hole, entry: from_inst })
1140	}
1141	}
1142
1143	/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1144	/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1145	/// The set of byte ranges looks like this:
1146	///
1147	/// [0-7F]
1148	/// [C2-DF][80-BF]
1149	/// [E0][A0-BF][80-BF]
1150	/// [E1-EC][80-BF][80-BF]
1151	/// [ED][80-9F][80-BF]
1152	/// [EE-EF][80-BF][80-BF]
1153	///
1154	/// Each line above translates to one alternate in the compiled regex program.
1155	/// However, all but one of the alternates end in the same suffix, which is
1156	/// a waste of an instruction. The suffix cache facilitates reusing them across
1157	/// alternates.
1158	///
1159	/// Note that a HashMap could be trivially used for this, but we don't need its
1160	/// overhead. Some small bounded space (LRU style) is more than enough.
1161	///
1162	/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1163	/// except it uses hashes as original indices and then compares full keys for
1164	/// validation against `dense` array.
1165	#[derive(Debug)]
1166	struct SuffixCache {
1167	sparse: Box<[usize]>,
1168	dense: Vec<SuffixCacheEntry>,
1169	}
1170
1171	#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1172	struct SuffixCacheEntry {
1173	key: SuffixCacheKey,
1174	pc: InstPtr,
1175	}
1176
1177	#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1178	struct SuffixCacheKey {
1179	from_inst: InstPtr,
1180	start: u8,
1181	end: u8,
1182	}
1183
1184	impl SuffixCache {
1185	fn new(size: usize) -> Self {
1186	SuffixCache {
1187	sparse: vec![`0usize`; size].into(),
1188	dense: Vec::with_capacity(size),
1189	}
1190	}
1191
1192	fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1193	let hash = self.hash(&key);
1194	let pos = &mut self.sparse[hash];
1195	if let Some(entry) = self.dense.get(*pos) {
1196	if entry.key == key {
1197	return Some(entry.pc);
1198	}
1199	}
1200	*pos = self.dense.len();
1201	self.dense.push(SuffixCacheEntry { key, pc });
1202	None
1203	}
1204
1205	fn clear(&mut self) {
1206	self.dense.clear();
1207	}
1208
1209	fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1210	// Basic FNV-1a hash as described:
1211	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1212	const FNV_PRIME: u64 = `1_099_511_628_211`;
1213	let mut h = `14_695_981_039_346_656_037`;
1214	h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1215	h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1216	h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1217	(h as usize) % self.sparse.len()
1218	}
1219	}
1220
1221	struct ByteClassSet([bool; `256`]);
1222
1223	impl ByteClassSet {
1224	fn new() -> Self {
1225	ByteClassSet([`false`; `256`])
1226	}
1227
1228	fn set_range(&mut self, start: u8, end: u8) {
1229	debug_assert!(start <= end);
1230	if start > `0` {
1231	self.0[start as usize - `1`] = `true`;
1232	}
1233	self.0[end as usize] = `true`;
1234	}
1235
1236	fn set_word_boundary(&mut self) {
1237	// We need to mark all ranges of bytes whose pairs result in
1238	// evaluating \b differently.
1239	let iswb = is_word_byte;
1240	let mut b1: u16 = `0`;
1241	let mut b2: u16;
1242	while b1 <= `255` {
1243	b2 = b1 + `1`;
1244	while b2 <= `255` && iswb(b1 as u8) == iswb(b2 as u8) {
1245	b2 += `1`;
1246	}
1247	self.set_range(b1 as u8, (b2 - `1`) as u8);
1248	b1 = b2;
1249	}
1250	}
1251
1252	fn byte_classes(&self) -> Vec<u8> {
1253	// N.B. If you're debugging the DFA, it's useful to simply return
1254	// `(0..256).collect()`, which effectively removes the byte classes
1255	// and makes the transitions easier to read.
1256	// (0usize..256).map(\|x\| x as u8).collect()
1257	let mut byte_classes = vec![`0`; `256`];
1258	let mut class = `0u8`;
1259	let mut i = `0`;
1260	loop {
1261	byte_classes[i] = class as u8;
1262	if i >= `255` {
1263	break;
1264	}
1265	if self.0[i] {
1266	class = class.checked_add(`1`).unwrap();
1267	}
1268	i += `1`;
1269	}
1270	byte_classes
1271	}
1272	}
1273
1274	impl fmt::Debug for ByteClassSet {
1275	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1276	f.debug_tuple(name:"ByteClassSet").field(&&self.0[..]).finish()
1277	}
1278	}
1279
1280	fn u32_to_usize(n: u32) -> usize {
1281	// In case usize is less than 32 bits, we need to guard against overflow.
1282	// On most platforms this compiles to nothing.
1283	// TODO Use `std::convert::TryFrom` once it's stable.
1284	if (n as u64) > (::std::usize::MAX as u64) {
1285	panic!("BUG: {} is too big to be pointer sized", n)
1286	}
1287	n as usize
1288	}
1289
1290	#[cfg(test)]
1291	mod tests {
1292	use super::ByteClassSet;
1293
1294	#[test]
1295	fn byte_classes() {
1296	let mut set = ByteClassSet::new();
1297	set.set_range(b'a', b'z');
1298	let classes = set.byte_classes();
1299	assert_eq!(classes[`0`], `0`);
1300	assert_eq!(classes[`1`], `0`);
1301	assert_eq!(classes[`2`], `0`);
1302	assert_eq!(classes[b'a' as usize - `1`], `0`);
1303	assert_eq!(classes[b'a' as usize], `1`);
1304	assert_eq!(classes[b'm' as usize], `1`);
1305	assert_eq!(classes[b'z' as usize], `1`);
1306	assert_eq!(classes[b'z' as usize + `1`], `2`);
1307	assert_eq!(classes[`254`], `2`);
1308	assert_eq!(classes[`255`], `2`);
1309
1310	let mut set = ByteClassSet::new();
1311	set.set_range(`0`, `2`);
1312	set.set_range(`4`, `6`);
1313	let classes = set.byte_classes();
1314	assert_eq!(classes[`0`], `0`);
1315	assert_eq!(classes[`1`], `0`);
1316	assert_eq!(classes[`2`], `0`);
1317	assert_eq!(classes[`3`], `1`);
1318	assert_eq!(classes[`4`], `2`);
1319	assert_eq!(classes[`5`], `2`);
1320	assert_eq!(classes[`6`], `2`);
1321	assert_eq!(classes[`7`], `3`);
1322	assert_eq!(classes[`255`], `3`);
1323	}
1324
1325	#[test]
1326	fn full_byte_classes() {
1327	let mut set = ByteClassSet::new();
1328	for i in `0`..`256u16` {
1329	set.set_range(i as u8, i as u8);
1330	}
1331	assert_eq!(set.byte_classes().len(), `256`);
1332	}
1333	}
1334