interval.rs source code [crates/regex_syntax/src/hir/interval.rs]

1	use core::{char, cmp, fmt::Debug, slice};
2
3	use alloc::vec::Vec;
4
5	use crate::unicode;
6
// This module contains an *internal* implementation of interval sets.
8	//
// The primary invariant that interval sets guard is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it.
14	//
15	// Since case folding (as implemented below) breaks that invariant, we roll
16	// that into this API even though it is a little out of place in an otherwise
17	// generic interval set. (Hence the reason why the `unicode` module is imported
18	// here.)
19	//
20	// Some of the implementation complexity here is a result of me wanting to
21	// preserve the sequential representation without using additional memory.
22	// In many cases, we do use linear extra memory, but it is at most 2x and it
23	// is amortized. If we relaxed the memory requirements, this implementation
24	// could become much simpler. The extra memory is honestly probably OK, but
25	// character classes (especially of the Unicode variety) can become quite
26	// large, and it would be nice to keep regex compilation snappy even in debug
27	// builds. (In the past, I have been careless with this area of code and it has
28	// caused slow regex compilations in debug mode, so this isn't entirely
29	// unwarranted.)
30	//
31	// Tests on this are relegated to the public API of HIR in src/hir.rs.
32
/// A set of intervals of type `I`.
///
/// The intervals are stored in a canonical form: sorted, with no two
/// intervals overlapping or adjacent.
#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
    /// A sorted set of non-overlapping ranges.
    ranges: Vec<I>,
    /// While not required at all for correctness, we keep track of whether an
    /// interval set has been case folded or not. This helps us avoid doing
    /// redundant work if, for example, a set has already been case folded.
    /// And note that whether a set is folded or not is preserved through
    /// all of the pairwise set operations. That is, if both interval sets
    /// have been case folded, then any of difference, union, intersection or
    /// symmetric difference all produce a case folded set.
    ///
    /// Note that when this is true, it *must* be the case that the set is
    /// case folded. But when it's false, the set *may* be case folded. In
    /// other words, we only set this to true when we know it to be the case,
    /// but we're okay with it being false if it would otherwise be costly to
    /// determine whether it should be true. This means code cannot assume
    /// that a false value necessarily indicates that the set is not case
    /// folded.
    ///
    /// Bottom line: this is a performance optimization.
    folded: bool,
}
55
// Marker impl: equality between interval sets is defined by the manual
// `PartialEq` impl for this type.
impl<I: Interval> Eq for IntervalSet<I> {}
57
58	// We implement PartialEq manually so that we don't consider the set's internal
59	// 'folded' property to be part of its identity. The 'folded' property is
60	// strictly an optimization.
61	impl<I: Interval> PartialEq for IntervalSet<I> {
62	fn eq(&self, other: &IntervalSet<I>) -> bool {
63	self.ranges.eq(&other.ranges)
64	}
65	}
66
67	impl<I: Interval> IntervalSet<I> {
68	/// Create a new set from a sequence of intervals. Each interval is
69	/// specified as a pair of bounds, where both bounds are inclusive.
70	///
71	/// The given ranges do not need to be in any specific order, and ranges
72	/// may overlap.
73	pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
74	let ranges: Vec<I> = intervals.into_iter().collect();
75	// An empty set is case folded.
76	let folded = ranges.is_empty();
77	let mut set = IntervalSet { ranges, folded };
78	set.canonicalize();
79	set
80	}
81
82	/// Add a new interval to this set.
83	pub fn push(&mut self, interval: I) {
84	// TODO: This could be faster. e.g., Push the interval such that
85	// it preserves canonicalization.
86	self.ranges.push(interval);
87	self.canonicalize();
88	// We don't know whether the new interval added here is considered
89	// case folded, so we conservatively assume that the entire set is
90	// no longer case folded if it was previously.
91	self.folded = `false`;
92	}
93
94	/// Return an iterator over all intervals in this set.
95	///
96	/// The iterator yields intervals in ascending order.
97	pub fn iter(&self) -> IntervalSetIter<'_, I> {
98	IntervalSetIter(self.ranges.iter())
99	}
100
101	/// Return an immutable slice of intervals in this set.
102	///
103	/// The sequence returned is in canonical ordering.
104	pub fn intervals(&self) -> &[I] {
105	&self.ranges
106	}
107
108	/// Expand this interval set such that it contains all case folded
109	/// characters. For example, if this class consists of the range `a-z`,
110	/// then applying case folding will result in the class containing both the
111	/// ranges `a-z` and `A-Z`.
112	///
113	/// This returns an error if the necessary case mapping data is not
114	/// available.
115	pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
116	if self.folded {
117	return Ok(());
118	}
119	let len = self.ranges.len();
120	for i in `0`..len {
121	let range = self.ranges[i];
122	if let Err(err) = range.case_fold_simple(&mut self.ranges) {
123	self.canonicalize();
124	return Err(err);
125	}
126	}
127	self.canonicalize();
128	self.folded = `true`;
129	Ok(())
130	}
131
132	/// Union this set with the given set, in place.
133	pub fn union(&mut self, other: &IntervalSet<I>) {
134	if other.ranges.is_empty() \|\| self.ranges == other.ranges {
135	return;
136	}
137	// This could almost certainly be done more efficiently.
138	self.ranges.extend(&other.ranges);
139	self.canonicalize();
140	self.folded = self.folded && other.folded;
141	}
142
143	/// Intersect this set with the given set, in place.
144	pub fn intersect(&mut self, other: &IntervalSet<I>) {
145	if self.ranges.is_empty() {
146	return;
147	}
148	if other.ranges.is_empty() {
149	self.ranges.clear();
150	// An empty set is case folded.
151	self.folded = `true`;
152	return;
153	}
154
155	// There should be a way to do this in-place with constant memory,
156	// but I couldn't figure out a simple way to do it. So just append
157	// the intersection to the end of this range, and then drain it before
158	// we're done.
159	let drain_end = self.ranges.len();
160
161	let mut ita = `0`..drain_end;
162	let mut itb = `0`..other.ranges.len();
163	let mut a = ita.next().unwrap();
164	let mut b = itb.next().unwrap();
165	loop {
166	if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
167	self.ranges.push(ab);
168	}
169	let (it, aorb) =
170	if self.ranges[a].upper() < other.ranges[b].upper() {
171	(&mut ita, &mut a)
172	} else {
173	(&mut itb, &mut b)
174	};
175	match it.next() {
176	Some(v) => *aorb = v,
177	None => break,
178	}
179	}
180	self.ranges.drain(..drain_end);
181	self.folded = self.folded && other.folded;
182	}
183
184	/// Subtract the given set from this set, in place.
185	pub fn difference(&mut self, other: &IntervalSet<I>) {
186	if self.ranges.is_empty() \|\| other.ranges.is_empty() {
187	return;
188	}
189
190	// This algorithm is (to me) surprisingly complex. A search of the
191	// interwebs indicate that this is a potentially interesting problem.
192	// Folks seem to suggest interval or segment trees, but I'd like to
193	// avoid the overhead (both runtime and conceptual) of that.
194	//
195	// The following is basically my Shitty First Draft. Therefore, in
196	// order to grok it, you probably need to read each line carefully.
197	// Simplifications are most welcome!
198	//
199	// Remember, we can assume the canonical format invariant here, which
200	// says that all ranges are sorted, not overlapping and not adjacent in
201	// each class.
202	let drain_end = self.ranges.len();
203	let (mut a, mut b) = (`0`, `0`);
204	'LOOP: while a < drain_end && b < other.ranges.len() {
205	// Basically, the easy cases are when neither range overlaps with
206	// each other. If the `b` range is less than our current `a`
207	// range, then we can skip it and move on.
208	if other.ranges[b].upper() < self.ranges[a].lower() {
209	b += `1`;
210	continue;
211	}
212	// ... similarly for the `a` range. If it's less than the smallest
213	// `b` range, then we can add it as-is.
214	if self.ranges[a].upper() < other.ranges[b].lower() {
215	let range = self.ranges[a];
216	self.ranges.push(range);
217	a += `1`;
218	continue;
219	}
220	// Otherwise, we have overlapping ranges.
221	assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
222
223	// This part is tricky and was non-obvious to me without looking
224	// at explicit examples (see the tests). The trickiness stems from
225	// two things: 1) subtracting a range from another range could
226	// yield two ranges and 2) after subtracting a range, it's possible
227	// that future ranges can have an impact. The loop below advances
228	// the `b` ranges until they can't possible impact the current
229	// range.
230	//
231	// For example, if our `a` range is `a-t` and our next three `b`
232	// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
233	// subtraction three times before moving on to the next `a` range.
234	let mut range = self.ranges[a];
235	while b < other.ranges.len()
236	&& !range.is_intersection_empty(&other.ranges[b])
237	{
238	let old_range = range;
239	range = match range.difference(&other.ranges[b]) {
240	(None, None) => {
241	// We lost the entire range, so move on to the next
242	// without adding this one.
243	a += `1`;
244	continue 'LOOP;
245	}
246	(Some(range1), None) \| (None, Some(range1)) => range1,
247	(Some(range1), Some(range2)) => {
248	self.ranges.push(range1);
249	range2
250	}
251	};
252	// It's possible that the `b` range has more to contribute
253	// here. In particular, if it is greater than the original
254	// range, then it might impact the next `a` range and* it*
255	// has impacted the current `a` range as much as possible,
256	// so we can quit. We don't bump `b` so that the next `a`
257	// range can apply it.
258	if other.ranges[b].upper() > old_range.upper() {
259	break;
260	}
261	// Otherwise, the next `b` range might apply to the current
262	// `a` range.
263	b += `1`;
264	}
265	self.ranges.push(range);
266	a += `1`;
267	}
268	while a < drain_end {
269	let range = self.ranges[a];
270	self.ranges.push(range);
271	a += `1`;
272	}
273	self.ranges.drain(..drain_end);
274	self.folded = self.folded && other.folded;
275	}
276
277	/// Compute the symmetric difference of the two sets, in place.
278	///
279	/// This computes the symmetric difference of two interval sets. This
280	/// removes all elements in this set that are also in the given set,
281	/// but also adds all elements from the given set that aren't in this
282	/// set. That is, the set will contain all elements in either set,
283	/// but will not contain any elements that are in both sets.
284	pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
285	// TODO(burntsushi): Fix this so that it amortizes allocation.
286	let mut intersection = self.clone();
287	intersection.intersect(other);
288	self.union(other);
289	self.difference(&intersection);
290	}
291
292	/// Negate this interval set.
293	///
294	/// For all `x` where `x` is any element, if `x` was in this set, then it
295	/// will not be in this set after negation.
296	pub fn negate(&mut self) {
297	if self.ranges.is_empty() {
298	let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
299	self.ranges.push(I::create(min, max));
300	// The set containing everything must case folded.
301	self.folded = `true`;
302	return;
303	}
304
305	// There should be a way to do this in-place with constant memory,
306	// but I couldn't figure out a simple way to do it. So just append
307	// the negation to the end of this range, and then drain it before
308	// we're done.
309	let drain_end = self.ranges.len();
310
311	// We do checked arithmetic below because of the canonical ordering
312	// invariant.
313	if self.ranges[`0`].lower() > I::Bound::min_value() {
314	let upper = self.ranges[`0`].lower().decrement();
315	self.ranges.push(I::create(I::Bound::min_value(), upper));
316	}
317	for i in `1`..drain_end {
318	let lower = self.ranges[i - `1`].upper().increment();
319	let upper = self.ranges[i].lower().decrement();
320	self.ranges.push(I::create(lower, upper));
321	}
322	if self.ranges[drain_end - `1`].upper() < I::Bound::max_value() {
323	let lower = self.ranges[drain_end - `1`].upper().increment();
324	self.ranges.push(I::create(lower, I::Bound::max_value()));
325	}
326	self.ranges.drain(..drain_end);
327	// We don't need to update whether this set is folded or not, because
328	// it is conservatively preserved through negation. Namely, if a set
329	// is not folded, then it is possible that its negation is folded, for
330	// example, [^☃]. But we're fine with assuming that the set is not
331	// folded in that case. (`folded` permits false negatives but not false
332	// positives.)
333	//
334	// But what about when a set is folded, is its negation also
335	// necessarily folded? Yes. Because if a set is folded, then for every
336	// character in the set, it necessarily included its equivalence class
337	// of case folded characters. Negating it in turn means that all
338	// equivalence classes in the set are negated, and any equivalence
339	// class that was previously not in the set is now entirely in the set.
340	}
341
342	/// Converts this set into a canonical ordering.
343	fn canonicalize(&mut self) {
344	if self.is_canonical() {
345	return;
346	}
347	self.ranges.sort();
348	assert!(!self.ranges.is_empty());
349
350	// Is there a way to do this in-place with constant memory? I couldn't
351	// figure out a way to do it. So just append the canonicalization to
352	// the end of this range, and then drain it before we're done.
353	let drain_end = self.ranges.len();
354	for oldi in `0`..drain_end {
355	// If we've added at least one new range, then check if we can
356	// merge this range in the previously added range.
357	if self.ranges.len() > drain_end {
358	let (last, rest) = self.ranges.split_last_mut().unwrap();
359	if let Some(union) = last.union(&rest[oldi]) {
360	*last = union;
361	continue;
362	}
363	}
364	let range = self.ranges[oldi];
365	self.ranges.push(range);
366	}
367	self.ranges.drain(..drain_end);
368	}
369
370	/// Returns true if and only if this class is in a canonical ordering.
371	fn is_canonical(&self) -> bool {
372	for pair in self.ranges.windows(`2`) {
373	if pair[`0`] >= pair[`1`] {
374	return `false`;
375	}
376	if pair[`0`].is_contiguous(&pair[`1`]) {
377	return `false`;
378	}
379	}
380	`true`
381	}
382	}
383
/// An iterator over intervals.
///
/// This is a thin wrapper around a slice iterator, so intervals are yielded
/// by reference in the order in which they are stored.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    fn next(&mut self) -> Option<&'a I> {
        self.0.next()
    }

    // Forward the underlying slice iterator's exact bounds. The default
    // implementation reports `(0, None)`, which prevents consumers such as
    // `collect` from reserving the right capacity up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
395
396	pub trait Interval:
397	Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
398	{
399	type Bound: Bound;
400
401	fn lower(&self) -> Self::Bound;
402	fn upper(&self) -> Self::Bound;
403	fn set_lower(&mut self, bound: Self::Bound);
404	fn set_upper(&mut self, bound: Self::Bound);
405	fn case_fold_simple(
406	&self,
407	intervals: &mut Vec<Self>,
408	) -> Result<(), unicode::CaseFoldError>;
409
410	/// Create a new interval.
411	fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
412	let mut int = Self::default();
413	if lower <= upper {
414	int.set_lower(lower);
415	int.set_upper(upper);
416	} else {
417	int.set_lower(upper);
418	int.set_upper(lower);
419	}
420	int
421	}
422
423	/// Union the given overlapping range into this range.
424	///
425	/// If the two ranges aren't contiguous, then this returns `None`.
426	fn union(&self, other: &Self) -> Option<Self> {
427	if !self.is_contiguous(other) {
428	return None;
429	}
430	let lower = cmp::min(self.lower(), other.lower());
431	let upper = cmp::max(self.upper(), other.upper());
432	Some(Self::create(lower, upper))
433	}
434
435	/// Intersect this range with the given range and return the result.
436	///
437	/// If the intersection is empty, then this returns `None`.
438	fn intersect(&self, other: &Self) -> Option<Self> {
439	let lower = cmp::max(self.lower(), other.lower());
440	let upper = cmp::min(self.upper(), other.upper());
441	if lower <= upper {
442	Some(Self::create(lower, upper))
443	} else {
444	None
445	}
446	}
447
448	/// Subtract the given range from this range and return the resulting
449	/// ranges.
450	///
451	/// If subtraction would result in an empty range, then no ranges are
452	/// returned.
453	fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
454	if self.is_subset(other) {
455	return (None, None);
456	}
457	if self.is_intersection_empty(other) {
458	return (Some(self.clone()), None);
459	}
460	let add_lower = other.lower() > self.lower();
461	let add_upper = other.upper() < self.upper();
462	// We know this because !self.is_subset(other) and the ranges have
463	// a non-empty intersection.
464	assert!(add_lower \|\| add_upper);
465	let mut ret = (None, None);
466	if add_lower {
467	let upper = other.lower().decrement();
468	ret.0 = Some(Self::create(self.lower(), upper));
469	}
470	if add_upper {
471	let lower = other.upper().increment();
472	let range = Self::create(lower, self.upper());
473	if ret.0.is_none() {
474	ret.0 = Some(range);
475	} else {
476	ret.1 = Some(range);
477	}
478	}
479	ret
480	}
481
482	/// Returns true if and only if the two ranges are contiguous. Two ranges
483	/// are contiguous if and only if the ranges are either overlapping or
484	/// adjacent.
485	fn is_contiguous(&self, other: &Self) -> bool {
486	let lower1 = self.lower().as_u32();
487	let upper1 = self.upper().as_u32();
488	let lower2 = other.lower().as_u32();
489	let upper2 = other.upper().as_u32();
490	cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(`1`)
491	}
492
493	/// Returns true if and only if the intersection of this range and the
494	/// other range is empty.
495	fn is_intersection_empty(&self, other: &Self) -> bool {
496	let (lower1, upper1) = (self.lower(), self.upper());
497	let (lower2, upper2) = (other.lower(), other.upper());
498	cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
499	}
500
501	/// Returns true if and only if this range is a subset of the other range.
502	fn is_subset(&self, other: &Self) -> bool {
503	let (lower1, upper1) = (self.lower(), self.upper());
504	let (lower2, upper2) = (other.lower(), other.upper());
505	(lower2 <= lower1 && lower1 <= upper2)
506	&& (lower2 <= upper1 && upper1 <= upper2)
507	}
508	}
509
/// An endpoint of an interval.
///
/// This file implements it for `u8` and `char`.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// Returns the smallest value of this type.
    fn min_value() -> Self;
    /// Returns the largest value of this type.
    fn max_value() -> Self;
    /// Returns this value converted to a `u32`.
    fn as_u32(self) -> u32;
    /// Returns the value that follows `self`.
    ///
    /// The implementations in this file panic if there is no such value.
    fn increment(self) -> Self;
    /// Returns the value that precedes `self`.
    ///
    /// The implementations in this file panic if there is no such value.
    fn decrement(self) -> Self;
}
519
520	impl Bound for u8 {
521	fn min_value() -> Self {
522	u8::MIN
523	}
524	fn max_value() -> Self {
525	u8::MAX
526	}
527	fn as_u32(self) -> u32 {
528	u32::from(self)
529	}
530	fn increment(self) -> Self {
531	self.checked_add(`1`).unwrap()
532	}
533	fn decrement(self) -> Self {
534	self.checked_sub(`1`).unwrap()
535	}
536	}
537
538	impl Bound for char {
539	fn min_value() -> Self {
540	'`\x00`'
541	}
542	fn max_value() -> Self {
543	'`\u{10FFFF}`'
544	}
545	fn as_u32(self) -> u32 {
546	u32::from(self)
547	}
548
549	fn increment(self) -> Self {
550	match self {
551	'`\u{D7FF}`' => '`\u{E000}`',
552	c => char::from_u32(u32::from(c).checked_add(`1`).unwrap()).unwrap(),
553	}
554	}
555
556	fn decrement(self) -> Self {
557	match self {
558	'`\u{E000}`' => '`\u{D7FF}`',
559	c => char::from_u32(u32::from(c).checked_sub(`1`).unwrap()).unwrap(),
560	}
561	}
562	}
563
564	// Tests for interval sets are written in src/hir.rs against the public API.
565