mod.rs source code [crates/regex_lite/src/hir/mod.rs]

1	use alloc::{boxed::Box, string::String, vec, vec::Vec};
2
3	use crate::{error::Error, utf8};
4
5	mod parse;
6
7	/// Escapes all regular expression meta characters in `pattern`.
8	///
9	/// The string returned may be safely used as a literal in a regular
10	/// expression.
11	pub fn escape(pattern: &str) -> String {
12	let mut buf: String = String::new();
13	buf.reserve(additional:pattern.len());
14	for ch: char in pattern.chars() {
15	if is_meta_character(ch) {
16	buf.push(ch:'`\\`');
17	}
18	buf.push(ch);
19	}
20	buf
21	}
22
23	/// Returns true if the given character has significance in a regex.
24	///
25	/// Generally speaking, these are the only characters which _must_ be escaped
26	/// in order to match their literal meaning. For example, to match a literal
27	/// `\|`, one could write `\\|`. Sometimes escaping isn't always necessary. For
28	/// example, `-` is treated as a meta character because of its significance
29	/// for writing ranges inside of character classes, but the regex `-` will
30	/// match a literal `-` because `-` has no special meaning outside of character
31	/// classes.
32	///
33	/// In order to determine whether a character may be escaped at all, the
34	/// [`is_escapeable_character`] routine should be used. The difference between
35	/// `is_meta_character` and `is_escapeable_character` is that the latter will
36	/// return true for some characters that are _not_ meta characters. For
37	/// example, `%` and `\%` both match a literal `%` in all contexts. In other
38	/// words, `is_escapeable_character` includes "superfluous" escapes.
39	///
40	/// Note that the set of characters for which this function returns `true` or
41	/// `false` is fixed and won't change in a semver compatible release. (In this
42	/// case, "semver compatible release" actually refers to the `regex` crate
43	/// itself, since reducing or expanding the set of meta characters would be a
44	/// breaking change for not just `regex-syntax` but also `regex` itself.)
45	fn is_meta_character(c: char) -> bool {
46	match c {
47	'`\\`' \| '.' \| '+' \| '*' \| '?' \| '(' \| ')' \| '\|' \| '[' \| ']' \| '{'
48	\| '}' \| '^' \| '$' \| '#' \| '&' \| '-' \| '~' => `true`,
49	_ => `false`,
50	}
51	}
52
53	/// Returns true if the given character can be escaped in a regex.
54	///
55	/// This returns true in all cases that `is_meta_character` returns true, but
56	/// also returns true in some cases where `is_meta_character` returns false.
57	/// For example, `%` is not a meta character, but it is escapeable. That is,
58	/// `%` and `\%` both match a literal `%` in all contexts.
59	///
60	/// The purpose of this routine is to provide knowledge about what characters
61	/// may be escaped. Namely, most regex engines permit "superfluous" escapes
62	/// where characters without any special significance may be escaped even
63	/// though there is no actual _need_ to do so.
64	///
65	/// This will return false for some characters. For example, `e` is not
66	/// escapeable. Therefore, `\e` will either result in a parse error (which is
67	/// true today), or it could backwards compatibly evolve into a new construct
68	/// with its own meaning. Indeed, that is the purpose of banning _some_
69	/// superfluous escapes: it provides a way to evolve the syntax in a compatible
70	/// manner.
71	fn is_escapeable_character(c: char) -> bool {
72	// Certainly escapeable if it's a meta character.
73	if is_meta_character(c) {
74	return `true`;
75	}
76	// Any character that isn't ASCII is definitely not escapeable. There's
77	// no real need to allow things like \☃ right?
78	if !c.is_ascii() {
79	return `false`;
80	}
81	// Otherwise, we basically say that everything is escapeable unless it's a
82	// letter or digit. Things like \3 are either octal (when enabled) or an
83	// error, and we should keep it that way. Otherwise, letters are reserved
84	// for adding new syntax in a backwards compatible way.
85	match c {
86	'0'..='9' \| 'A'..='Z' \| 'a'..='z' => `false`,
87	// While not currently supported, we keep these as not escapeable to
88	// give us some flexibility with respect to supporting the \< and
89	// \> word boundary assertions in the future. By rejecting them as
90	// escapeable, \< and \> will result in a parse error. Thus, we can
91	// turn them into something else in the future without it being a
92	// backwards incompatible change.
93	'<' \| '>' => `false`,
94	_ => `true`,
95	}
96	}
97
98	/// The configuration for a regex parser.
99	#[derive(Clone, Copy, Debug)]
100	pub(crate) struct Config {
101	/// The maximum number of times we're allowed to recurse.
102	///
103	/// Note that unlike the regex-syntax parser, we actually use recursion in
104	/// this parser for simplicity. My hope is that by setting a conservative
105	/// default call limit and providing a way to configure it, that we can
106	/// keep this simplification. But if we must, we can re-work the parser to
107	/// put the call stack on the heap like regex-syntax does.
108	pub(crate) nest_limit: u32,
109	/// Various flags that control how a pattern is interpreted.
110	pub(crate) flags: Flags,
111	}
112
113	impl Default for Config {
114	fn default() -> Config {
115	Config { nest_limit: `50`, flags: Flags::default() }
116	}
117	}
118
119	/// Various flags that control the interpretation of the pattern.
120	///
121	/// These can be set via explicit configuration in code, or change dynamically
122	/// during parsing via inline flags. For example, `foo(?i:bar)baz` will match
123	/// `foo` and `baz` case sensitiviely and `bar` case insensitively (assuming a
124	/// default configuration).
125	#[derive(Clone, Copy, Debug, Default)]
126	pub(crate) struct Flags {
127	/// Whether to match case insensitively.
128	///
129	/// This is the `i` flag.
130	pub(crate) case_insensitive: bool,
131	/// Whether `^` and `$` should be treated as line anchors or not.
132	///
133	/// This is the `m` flag.
134	pub(crate) multi_line: bool,
135	/// Whether `.` should match line terminators or not.
136	///
137	/// This is the `s` flag.
138	pub(crate) dot_matches_new_line: bool,
139	/// Whether to swap the meaning of greedy and non-greedy operators.
140	///
141	/// This is the `U` flag.
142	pub(crate) swap_greed: bool,
143	/// Whether to enable CRLF mode.
144	///
145	/// This is the `R` flag.
146	pub(crate) crlf: bool,
147	/// Whether to ignore whitespace. i.e., verbose mode.
148	///
149	/// This is the `x` flag.
150	pub(crate) ignore_whitespace: bool,
151	}
152
153	#[derive(Clone, Debug, Eq, PartialEq)]
154	pub(crate) struct Hir {
155	kind: HirKind,
156	is_start_anchored: bool,
157	is_match_empty: bool,
158	static_explicit_captures_len: Option<usize>,
159	}
160
161	#[derive(Clone, Debug, Eq, PartialEq)]
162	pub(crate) enum HirKind {
163	Empty,
164	Char(char),
165	Class(Class),
166	Look(Look),
167	Repetition(Repetition),
168	Capture(Capture),
169	Concat(Vec<Hir>),
170	Alternation(Vec<Hir>),
171	}
172
173	impl Hir {
174	/// Parses the given pattern string with the given configuration into a
175	/// structured representation. If the pattern is invalid, then an error
176	/// is returned.
177	pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> {
178	self::parse::Parser::new(config, pattern).parse()
179	}
180
181	/// Returns the underlying kind of this high-level intermediate
182	/// representation.
183	///
184	/// Note that there is explicitly no way to build an `Hir` directly from
185	/// an `HirKind`. If you need to do that, then you must do case analysis
186	/// on the `HirKind` and call the appropriate smart constructor on `Hir`.
187	pub(crate) fn kind(&self) -> &HirKind {
188	&self.kind
189	}
190
191	/// Returns true if and only if this Hir expression can only match at the
192	/// beginning of a haystack.
193	pub(crate) fn is_start_anchored(&self) -> bool {
194	self.is_start_anchored
195	}
196
197	/// Returns true if and only if this Hir expression can match the empty
198	/// string.
199	pub(crate) fn is_match_empty(&self) -> bool {
200	self.is_match_empty
201	}
202
203	/// If the pattern always reports the same number of matching capture groups
204	/// for every match, then this returns the number of those groups. This
205	/// doesn't include the implicit group found in every pattern.
206	pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
207	self.static_explicit_captures_len
208	}
209
210	fn fail() -> Hir {
211	let kind = HirKind::Class(Class { ranges: vec![] });
212	Hir {
213	kind,
214	is_start_anchored: `false`,
215	is_match_empty: `false`,
216	static_explicit_captures_len: Some(`0`),
217	}
218	}
219
220	fn empty() -> Hir {
221	let kind = HirKind::Empty;
222	Hir {
223	kind,
224	is_start_anchored: `false`,
225	is_match_empty: `true`,
226	static_explicit_captures_len: Some(`0`),
227	}
228	}
229
230	fn char(ch: char) -> Hir {
231	let kind = HirKind::Char(ch);
232	Hir {
233	kind,
234	is_start_anchored: `false`,
235	is_match_empty: `false`,
236	static_explicit_captures_len: Some(`0`),
237	}
238	}
239
240	fn class(class: Class) -> Hir {
241	let kind = HirKind::Class(class);
242	Hir {
243	kind,
244	is_start_anchored: `false`,
245	is_match_empty: `false`,
246	static_explicit_captures_len: Some(`0`),
247	}
248	}
249
250	fn look(look: Look) -> Hir {
251	let kind = HirKind::Look(look);
252	Hir {
253	kind,
254	is_start_anchored: matches!(look, Look::Start),
255	is_match_empty: `true`,
256	static_explicit_captures_len: Some(`0`),
257	}
258	}
259
260	fn repetition(rep: Repetition) -> Hir {
261	if rep.min == `0` && rep.max == Some(`0`) {
262	return Hir::empty();
263	} else if rep.min == `1` && rep.max == Some(`1`) {
264	return *rep.sub;
265	}
266	let is_start_anchored = rep.min > `0` && rep.sub.is_start_anchored;
267	let is_match_empty = rep.min == `0` \|\| rep.sub.is_match_empty;
268	let mut static_explicit_captures_len =
269	rep.sub.static_explicit_captures_len;
270	// If the static captures len of the sub-expression is not known or
271	// is greater than zero, then it automatically propagates to the
272	// repetition, regardless of the repetition. Otherwise, it might
273	// change, but only when the repetition can match 0 times.
274	if rep.min == `0`
275	&& static_explicit_captures_len.map_or(`false`, \|len\| len > `0`)
276	{
277	// If we require a match 0 times, then our captures len is
278	// guaranteed to be zero. Otherwise, if we can* match the empty*
279	// string, then it's impossible to know how many captures will be
280	// in the resulting match.
281	if rep.max == Some(`0`) {
282	static_explicit_captures_len = Some(`0`);
283	} else {
284	static_explicit_captures_len = None;
285	}
286	}
287	Hir {
288	kind: HirKind::Repetition(rep),
289	is_start_anchored,
290	is_match_empty,
291	static_explicit_captures_len,
292	}
293	}
294
295	fn capture(cap: Capture) -> Hir {
296	let is_start_anchored = cap.sub.is_start_anchored;
297	let is_match_empty = cap.sub.is_match_empty;
298	let static_explicit_captures_len = cap
299	.sub
300	.static_explicit_captures_len
301	.map(\|len\| len.saturating_add(`1`));
302	let kind = HirKind::Capture(cap);
303	Hir {
304	kind,
305	is_start_anchored,
306	is_match_empty,
307	static_explicit_captures_len,
308	}
309	}
310
311	fn concat(mut subs: Vec<Hir>) -> Hir {
312	if subs.is_empty() {
313	Hir::empty()
314	} else if subs.len() == `1` {
315	subs.pop().unwrap()
316	} else {
317	let is_start_anchored = subs[`0`].is_start_anchored;
318	let mut is_match_empty = `true`;
319	let mut static_explicit_captures_len = Some(`0usize`);
320	for sub in subs.iter() {
321	is_match_empty = is_match_empty && sub.is_match_empty;
322	static_explicit_captures_len = static_explicit_captures_len
323	.and_then(\|len1\| {
324	Some((len1, sub.static_explicit_captures_len?))
325	})
326	.and_then(\|(len1, len2)\| Some(len1.saturating_add(len2)));
327	}
328	Hir {
329	kind: HirKind::Concat(subs),
330	is_start_anchored,
331	is_match_empty,
332	static_explicit_captures_len,
333	}
334	}
335	}
336
337	fn alternation(mut subs: Vec<Hir>) -> Hir {
338	if subs.is_empty() {
339	Hir::fail()
340	} else if subs.len() == `1` {
341	subs.pop().unwrap()
342	} else {
343	let mut it = subs.iter().peekable();
344	let mut is_start_anchored =
345	it.peek().map_or(`false`, \|sub\| sub.is_start_anchored);
346	let mut is_match_empty =
347	it.peek().map_or(`false`, \|sub\| sub.is_match_empty);
348	let mut static_explicit_captures_len =
349	it.peek().and_then(\|sub\| sub.static_explicit_captures_len);
350	for sub in it {
351	is_start_anchored = is_start_anchored && sub.is_start_anchored;
352	is_match_empty = is_match_empty \|\| sub.is_match_empty;
353	if static_explicit_captures_len
354	!= sub.static_explicit_captures_len
355	{
356	static_explicit_captures_len = None;
357	}
358	}
359	Hir {
360	kind: HirKind::Alternation(subs),
361	is_start_anchored,
362	is_match_empty,
363	static_explicit_captures_len,
364	}
365	}
366	}
367	}
368
369	impl HirKind {
370	/// Returns a slice of this kind's sub-expressions, if any.
371	fn subs(&self) -> &[Hir] {
372	use core::slice::from_ref;
373
374	match *self {
375	HirKind::Empty
376	\| HirKind::Char(_)
377	\| HirKind::Class(_)
378	\| HirKind::Look(_) => &[],
379	HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub),
380	HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub),
381	HirKind::Concat(ref subs: &Vec) => subs,
382	HirKind::Alternation(ref subs: &Vec) => subs,
383	}
384	}
385	}
386
387	#[derive(Clone, Debug, Eq, PartialEq)]
388	pub(crate) struct Class {
389	pub(crate) ranges: Vec<ClassRange>,
390	}
391
392	impl Class {
393	/// Create a new class from the given ranges. The ranges may be provided
394	/// in any order or may even overlap. They will be automatically
395	/// canonicalized.
396	fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397	let mut class = Class { ranges: ranges.into_iter().collect() };
398	class.canonicalize();
399	class
400	}
401
402	/// Expand this class such that it matches the ASCII codepoints in this set
403	/// case insensitively.
404	fn ascii_case_fold(&mut self) {
405	let len = self.ranges.len();
406	for i in `0`..len {
407	if let Some(folded) = self.ranges[i].ascii_case_fold() {
408	self.ranges.push(folded);
409	}
410	}
411	self.canonicalize();
412	}
413
414	/// Negate this set.
415	///
416	/// For all `x` where `x` is any element, if `x` was in this set, then it
417	/// will not be in this set after negation.
418	fn negate(&mut self) {
419	const MIN: char = '`\x00`';
420	const MAX: char = char::MAX;
421
422	if self.ranges.is_empty() {
423	self.ranges.push(ClassRange { start: MIN, end: MAX });
424	return;
425	}
426
427	// There should be a way to do this in-place with constant memory,
428	// but I couldn't figure out a simple way to do it. So just append
429	// the negation to the end of this range, and then drain it before
430	// we're done.
431	let drain_end = self.ranges.len();
432
433	// If our class doesn't start the minimum possible char, then negation
434	// needs to include all codepoints up to the minimum in this set.
435	if self.ranges[`0`].start > MIN {
436	self.ranges.push(ClassRange {
437	start: MIN,
438	// OK because we know it's bigger than MIN.
439	end: prev_char(self.ranges[`0`].start).unwrap(),
440	});
441	}
442	for i in `1`..drain_end {
443	// let lower = self.ranges[i - 1].upper().increment();
444	// let upper = self.ranges[i].lower().decrement();
445	// self.ranges.push(I::create(lower, upper));
446	self.ranges.push(ClassRange {
447	// OK because we know i-1 is never the last range and therefore
448	// there must be a range greater than it. It therefore follows
449	// that 'end' can never be char::MAX, and thus there must be
450	// a next char.
451	start: next_char(self.ranges[i - `1`].end).unwrap(),
452	// Since 'i' is guaranteed to never be the first range, it
453	// follows that there is always a range before this and thus
454	// 'start' can never be '\x00'. Thus, there must be a previous
455	// char.
456	end: prev_char(self.ranges[i].start).unwrap(),
457	});
458	}
459	if self.ranges[drain_end - `1`].end < MAX {
460	// let lower = self.ranges[drain_end - 1].upper().increment();
461	// self.ranges.push(I::create(lower, I::Bound::max_value()));
462	self.ranges.push(ClassRange {
463	// OK because we know 'end' is less than char::MAX, and thus
464	// there is a next char.
465	start: next_char(self.ranges[drain_end - `1`].end).unwrap(),
466	end: MAX,
467	});
468	}
469	self.ranges.drain(..drain_end);
470	// We don't need to canonicalize because we processed the ranges above
471	// in canonical order and the new ranges we added based on those are
472	// also necessarily in canonical order.
473	}
474
475	/// Converts this set into a canonical ordering.
476	fn canonicalize(&mut self) {
477	if self.is_canonical() {
478	return;
479	}
480	self.ranges.sort();
481	assert!(!self.ranges.is_empty());
482
483	// Is there a way to do this in-place with constant memory? I couldn't
484	// figure out a way to do it. So just append the canonicalization to
485	// the end of this range, and then drain it before we're done.
486	let drain_end = self.ranges.len();
487	for oldi in `0`..drain_end {
488	// If we've added at least one new range, then check if we can
489	// merge this range in the previously added range.
490	if self.ranges.len() > drain_end {
491	let (last, rest) = self.ranges.split_last_mut().unwrap();
492	if let Some(union) = last.union(&rest[oldi]) {
493	*last = union;
494	continue;
495	}
496	}
497	self.ranges.push(self.ranges[oldi]);
498	}
499	self.ranges.drain(..drain_end);
500	}
501
502	/// Returns true if and only if this class is in a canonical ordering.
503	fn is_canonical(&self) -> bool {
504	for pair in self.ranges.windows(`2`) {
505	if pair[`0`] >= pair[`1`] {
506	return `false`;
507	}
508	if pair[`0`].is_contiguous(&pair[`1`]) {
509	return `false`;
510	}
511	}
512	`true`
513	}
514	}
515
516	#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
517	pub(crate) struct ClassRange {
518	pub(crate) start: char,
519	pub(crate) end: char,
520	}
521
522	impl ClassRange {
523	/// Apply simple case folding to this byte range. Only ASCII case mappings
524	/// (for A-Za-z) are applied.
525	///
526	/// Additional ranges are appended to the given vector. Canonical ordering
527	/// is not* maintained in the given vector.*
528	fn ascii_case_fold(&self) -> Option<ClassRange> {
529	if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self) {
530	let start = core::cmp::max(self.start, 'a');
531	let end = core::cmp::min(self.end, 'z');
532	return Some(ClassRange {
533	start: char::try_from(u32::from(start) - `32`).unwrap(),
534	end: char::try_from(u32::from(end) - `32`).unwrap(),
535	});
536	}
537	if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self) {
538	let start = core::cmp::max(self.start, 'A');
539	let end = core::cmp::min(self.end, 'Z');
540	return Some(ClassRange {
541	start: char::try_from(u32::from(start) + `32`).unwrap(),
542	end: char::try_from(u32::from(end) + `32`).unwrap(),
543	});
544	}
545	None
546	}
547
548	/// Union the given overlapping range into this range.
549	///
550	/// If the two ranges aren't contiguous, then this returns `None`.
551	fn union(&self, other: &ClassRange) -> Option<ClassRange> {
552	if !self.is_contiguous(other) {
553	return None;
554	}
555	let start = core::cmp::min(self.start, other.start);
556	let end = core::cmp::max(self.end, other.end);
557	Some(ClassRange { start, end })
558	}
559
560	/// Returns true if and only if the two ranges are contiguous. Two ranges
561	/// are contiguous if and only if the ranges are either overlapping or
562	/// adjacent.
563	fn is_contiguous(&self, other: &ClassRange) -> bool {
564	let (s1, e1) = (u32::from(self.start), u32::from(self.end));
565	let (s2, e2) = (u32::from(other.start), u32::from(other.end));
566	core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(`1`)
567	}
568
569	/// Returns true if and only if the intersection of this range and the
570	/// other range is empty.
571	fn is_intersection_empty(&self, other: &ClassRange) -> bool {
572	let (s1, e1) = (self.start, self.end);
573	let (s2, e2) = (other.start, other.end);
574	core::cmp::max(s1, s2) > core::cmp::min(e1, e2)
575	}
576	}
577
578	/// The high-level intermediate representation for a look-around assertion.
579	///
580	/// An assertion match is always zero-length. Also called an "empty match."
581	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
582	pub(crate) enum Look {
583	/// Match the beginning of text. Specifically, this matches at the starting
584	/// position of the input.
585	Start = `1` << `0`,
586	/// Match the end of text. Specifically, this matches at the ending
587	/// position of the input.
588	End = `1` << `1`,
589	/// Match the beginning of a line or the beginning of text. Specifically,
590	/// this matches at the starting position of the input, or at the position
591	/// immediately following a `\n` character.
592	StartLF = `1` << `2`,
593	/// Match the end of a line or the end of text. Specifically, this matches
594	/// at the end position of the input, or at the position immediately
595	/// preceding a `\n` character.
596	EndLF = `1` << `3`,
597	/// Match the beginning of a line or the beginning of text. Specifically,
598	/// this matches at the starting position of the input, or at the position
599	/// immediately following either a `\r` or `\n` character, but never after
600	/// a `\r` when a `\n` follows.
601	StartCRLF = `1` << `4`,
602	/// Match the end of a line or the end of text. Specifically, this matches
603	/// at the end position of the input, or at the position immediately
604	/// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
605	/// precedes it.
606	EndCRLF = `1` << `5`,
607	/// Match an ASCII-only word boundary. That is, this matches a position
608	/// where the left adjacent character and right adjacent character
609	/// correspond to a word and non-word or a non-word and word character.
610	Word = `1` << `6`,
611	/// Match an ASCII-only negation of a word boundary.
612	WordNegate = `1` << `7`,
613	/// Match the start of an ASCII-only word boundary. That is, this matches a
614	/// position at either the beginning of the haystack or where the previous
615	/// character is not a word character and the following character is a word
616	/// character.
617	WordStart = `1` << `8`,
618	/// Match the end of an ASCII-only word boundary. That is, this matches
619	/// a position at either the end of the haystack or where the previous
620	/// character is a word character and the following character is not a word
621	/// character.
622	WordEnd = `1` << `9`,
623	/// Match the start half of an ASCII-only word boundary. That is, this
624	/// matches a position at either the beginning of the haystack or where the
625	/// previous character is not a word character.
626	WordStartHalf = `1` << `10`,
627	/// Match the end half of an ASCII-only word boundary. That is, this
628	/// matches a position at either the end of the haystack or where the
629	/// following character is not a word character.
630	WordEndHalf = `1` << `11`,
631	}
632
633	impl Look {
634	/// Returns true if the given position in the given haystack matches this
635	/// look-around assertion.
636	pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool {
637	use self::Look::*;
638
639	match *self {
640	Start => at == `0`,
641	End => at == haystack.len(),
642	StartLF => at == `0` \|\| haystack[at - `1`] == b'`\n`',
643	EndLF => at == haystack.len() \|\| haystack[at] == b'`\n`',
644	StartCRLF => {
645	at == `0`
646	\|\| haystack[at - `1`] == b'`\n`'
647	\|\| (haystack[at - `1`] == b'`\r`'
648	&& (at >= haystack.len() \|\| haystack[at] != b'`\n`'))
649	}
650	EndCRLF => {
651	at == haystack.len()
652	\|\| haystack[at] == b'`\r`'
653	\|\| (haystack[at] == b'`\n`'
654	&& (at == `0` \|\| haystack[at - `1`] != b'`\r`'))
655	}
656	Word => {
657	let word_before =
658	at > `0` && utf8::is_word_byte(haystack[at - `1`]);
659	let word_after =
660	at < haystack.len() && utf8::is_word_byte(haystack[at]);
661	word_before != word_after
662	}
663	WordNegate => {
664	let word_before =
665	at > `0` && utf8::is_word_byte(haystack[at - `1`]);
666	let word_after =
667	at < haystack.len() && utf8::is_word_byte(haystack[at]);
668	word_before == word_after
669	}
670	WordStart => {
671	let word_before =
672	at > `0` && utf8::is_word_byte(haystack[at - `1`]);
673	let word_after =
674	at < haystack.len() && utf8::is_word_byte(haystack[at]);
675	!word_before && word_after
676	}
677	WordEnd => {
678	let word_before =
679	at > `0` && utf8::is_word_byte(haystack[at - `1`]);
680	let word_after =
681	at < haystack.len() && utf8::is_word_byte(haystack[at]);
682	word_before && !word_after
683	}
684	WordStartHalf => {
685	let word_before =
686	at > `0` && utf8::is_word_byte(haystack[at - `1`]);
687	!word_before
688	}
689	WordEndHalf => {
690	let word_after =
691	at < haystack.len() && utf8::is_word_byte(haystack[at]);
692	!word_after
693	}
694	}
695	}
696	}
697
698	/// The high-level intermediate representation of a repetition operator.
699	///
700	/// A repetition operator permits the repetition of an arbitrary
701	/// sub-expression.
702	#[derive(Clone, Debug, Eq, PartialEq)]
703	pub(crate) struct Repetition {
704	/// The minimum range of the repetition.
705	///
706	/// Note that special cases like `?`, `+` and `` all get translated into*
707	/// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
708	///
709	/// When `min` is zero, this expression can match the empty string
710	/// regardless of what its sub-expression is.
711	pub(crate) min: u32,
712	/// The maximum range of the repetition.
713	///
714	/// Note that when `max` is `None`, `min` acts as a lower bound but where
715	/// there is no upper bound. For something like `x{5}` where the min and
716	/// max are equivalent, `min` will be set to `5` and `max` will be set to
717	/// `Some(5)`.
718	pub(crate) max: Option<u32>,
719	/// Whether this repetition operator is greedy or not. A greedy operator
720	/// will match as much as it can. A non-greedy operator will match as
721	/// little as it can.
722	///
723	/// Typically, operators are greedy by default and are only non-greedy when
724	/// a `?` suffix is used, e.g., `(expr)` is greedy while `(expr)?` is
725	/// not. However, this can be inverted via the `U` "ungreedy" flag.
726	pub(crate) greedy: bool,
727	/// The expression being repeated.
728	pub(crate) sub: Box<Hir>,
729	}
730
731	/// The high-level intermediate representation for a capturing group.
732	///
733	/// A capturing group always has an index and a child expression. It may
734	/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
735	/// necessary.
736	///
737	/// Note that there is no explicit representation of a non-capturing group
738	/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
739	/// the recursive structure of the `Hir` itself.
740	#[derive(Clone, Debug, Eq, PartialEq)]
741	pub(crate) struct Capture {
742	/// The capture index of the capture.
743	pub(crate) index: u32,
744	/// The name of the capture, if it exists.
745	pub(crate) name: Option<Box<str>>,
746	/// The expression inside the capturing group, which may be empty.
747	pub(crate) sub: Box<Hir>,
748	}
749
750	fn next_char(ch: char) -> Option<char> {
751	// Skip over the surrogate range.
752	if ch == '`\u{D7FF}`' {
753	return Some('`\u{E000}`');
754	}
755	// OK because char::MAX < u32::MAX and we handle U+D7FF above.
756	char::from_u32(u32::from(ch).checked_add(`1`).unwrap())
757	}
758
759	fn prev_char(ch: char) -> Option<char> {
760	// Skip over the surrogate range.
761	if ch == '`\u{E000}`' {
762	return Some('`\u{D7FF}`');
763	}
764	// OK because subtracting 1 from any valid scalar value other than 0
765	// and U+E000 yields a valid scalar value.
766	Some(char::from_u32(u32::from(ch).checked_sub(`1`)?).unwrap())
767	}
768
769	impl Drop for Hir {
770	fn drop(&mut self) {
771	use core::mem;
772
773	match *self.kind() {
774	HirKind::Empty
775	\| HirKind::Char(_)
776	\| HirKind::Class(_)
777	\| HirKind::Look(_) => return,
778	HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
779	HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
780	return
781	}
782	HirKind::Concat(ref x) if x.is_empty() => return,
783	HirKind::Alternation(ref x) if x.is_empty() => return,
784	_ => {}
785	}
786
787	let mut stack = vec![mem::replace(self, Hir::empty())];
788	while let Some(mut expr) = stack.pop() {
789	match expr.kind {
790	HirKind::Empty
791	\| HirKind::Char(_)
792	\| HirKind::Class(_)
793	\| HirKind::Look(_) => {}
794	HirKind::Capture(ref mut x) => {
795	stack.push(mem::replace(&mut x.sub, Hir::empty()));
796	}
797	HirKind::Repetition(ref mut x) => {
798	stack.push(mem::replace(&mut x.sub, Hir::empty()));
799	}
800	HirKind::Concat(ref mut x) => {
801	stack.extend(x.drain(..));
802	}
803	HirKind::Alternation(ref mut x) => {
804	stack.extend(x.drain(..));
805	}
806	}
807	}
808	}
809	}
810